[clblas] 01/125: Initial check-in of open source clBLAS code

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Fri May 29 06:57:15 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clblas.

commit db35dbd368bba86cd7a51cb5202d12ae784802d6
Author: Kent Knox <kent.knox at amd>
Date:   Tue Aug 13 10:28:12 2013 -0500

    Initial check-in of open source clBLAS code
---
 .gitattributes                                    |    22 +
 .gitignore                                        |    19 +
 CHANGELOG                                         |   276 +
 CONTRIBUTING.md                                   |    36 +
 LICENSE                                           |   202 +
 NOTICE                                            |     5 +
 README.md                                         |   159 +
 doc/clBLAS.doxy                                   |  1902 +++
 src/CMakeLists.txt                                |   298 +
 src/FindNetlib.cmake                              |   109 +
 src/FindOpenCL.cmake                              |    96 +
 src/clAmdBlas.h                                   | 12242 ++++++++++++++++++++
 src/clAmdBlas.version.h                           |    22 +
 src/clBLAS-complex.h                              |    53 +
 src/clBLAS.def                                    |   215 +
 src/clBLAS.h                                      |  9650 +++++++++++++++
 src/client/CMakeLists.txt                         |    68 +
 src/client/clGemm.h                               |   627 +
 src/client/clfunc_common.hpp                      |   335 +
 src/client/clfunc_xgemm.hpp                       |   995 ++
 src/client/clfunc_xgemv.hpp                       |   367 +
 src/client/clfunc_xger.hpp                        |   419 +
 src/client/clfunc_xgerc.hpp                       |   301 +
 src/client/clfunc_xgeru.hpp                       |   236 +
 src/client/clfunc_xhemm.hpp                       |   394 +
 src/client/clfunc_xhemv.hpp                       |   275 +
 src/client/clfunc_xher.hpp                        |   305 +
 src/client/clfunc_xher2.hpp                       |   329 +
 src/client/clfunc_xsymm.hpp                       |   660 ++
 src/client/clfunc_xsymv.hpp                       |   256 +
 src/client/clfunc_xsyr.hpp                        |   224 +
 src/client/clfunc_xsyr2.hpp                       |   239 +
 src/client/clfunc_xsyr2k.hpp                      |   460 +
 src/client/clfunc_xsyrk.hpp                       |   372 +
 src/client/clfunc_xtrmm.hpp                       |   785 ++
 src/client/clfunc_xtrmv.hpp                       |   427 +
 src/client/clfunc_xtrsm.hpp                       |   785 ++
 src/client/clfunc_xtrsv.hpp                       |   420 +
 src/client/client.cpp                             |   531 +
 src/client/ctimer.h                               |    42 +
 src/client/makefile                               |    14 +
 src/client/statisticalTimer.cpp                   |   341 +
 src/client/statisticalTimer.h                     |   170 +
 src/client/stdafx.cpp                             |    25 +
 src/client/stdafx.h                               |    40 +
 src/client/targetver.h                            |    27 +
 src/client/testPerfWrapper.cpp                    |   206 +
 src/client/timer.cpp                              |   103 +
 src/client/timer.hpp                              |    50 +
 src/include/clblas_stddef.h                       |   131 +
 src/include/clkern.h                              |   199 +
 src/include/cltypes.h                             |    79 +
 src/include/dblock_kgen.h                         |   220 +
 src/include/defbool.h                             |    57 +
 src/include/devinfo.h                             |    99 +
 src/include/dis_warning.h                         |    65 +
 src/include/granulation.h                         |    79 +
 src/include/kern_cache.h                          |   187 +
 src/include/kernel_extra.h                        |   167 +
 src/include/kerngen.h                             |   685 ++
 src/include/list.h                                |   116 +
 src/include/mempat.h                              |    86 +
 src/include/msvc.h                                |    34 +
 src/include/mutex.h                               |    36 +
 src/include/solver.h                              |   196 +
 src/include/trace_malloc.h                        |    75 +
 src/library/CMakeLists.txt                        |   329 +
 src/library/blas/generic/blas_funcs.c             |    96 +
 src/library/blas/generic/common.c                 |   877 ++
 src/library/blas/generic/events.c                 |    75 +
 src/library/blas/generic/kdump.c                  |   188 +
 src/library/blas/generic/kernel_extra.c           |    27 +
 src/library/blas/generic/matrix_dims.c            |   186 +
 src/library/blas/generic/matrix_props.c           |   184 +
 src/library/blas/generic/problem_iter.c           |   121 +
 src/library/blas/generic/problem_iter.h           |    76 +
 src/library/blas/generic/solution_assert.c        |   195 +
 src/library/blas/generic/solution_assert.h        |    63 +
 src/library/blas/generic/solution_seq.c           |   465 +
 src/library/blas/generic/solution_seq_make.c      |  2364 ++++
 src/library/blas/gens/asum.cpp                    |   300 +
 src/library/blas/gens/axpy_reg.cpp                |   279 +
 src/library/blas/gens/blas_kgen.c                 |  1580 +++
 src/library/blas/gens/blas_kgen.h                 |   895 ++
 src/library/blas/gens/blas_subgroup.c             |   528 +
 src/library/blas/gens/blas_subgroup.h             |    69 +
 src/library/blas/gens/clTemplates/asum.cl         |    78 +
 src/library/blas/gens/clTemplates/axpy.cl         |    78 +
 src/library/blas/gens/clTemplates/copy.cl         |    68 +
 src/library/blas/gens/clTemplates/dot.cl          |    86 +
 src/library/blas/gens/clTemplates/gbmv.cl         |   292 +
 src/library/blas/gens/clTemplates/gemm.cl         |  1650 +++
 src/library/blas/gens/clTemplates/gemm_helper.cl  |    87 +
 src/library/blas/gens/clTemplates/ger.cl          |   293 +
 src/library/blas/gens/clTemplates/her.cl          |   533 +
 src/library/blas/gens/clTemplates/her2.cl         |   662 ++
 src/library/blas/gens/clTemplates/iamax.cl        |   108 +
 src/library/blas/gens/clTemplates/nrm2.cl         |   217 +
 src/library/blas/gens/clTemplates/reduction.cl    |   352 +
 src/library/blas/gens/clTemplates/rotg.cl         |   112 +
 src/library/blas/gens/clTemplates/rotm.cl         |   120 +
 src/library/blas/gens/clTemplates/rotmg.cl        |   208 +
 src/library/blas/gens/clTemplates/scal.cl         |    69 +
 src/library/blas/gens/clTemplates/swap.cl         |    83 +
 src/library/blas/gens/clTemplates/symm.cl         |  1020 ++
 src/library/blas/gens/clTemplates/symm_helper.cl  |   102 +
 src/library/blas/gens/clTemplates/syr.cl          |   474 +
 src/library/blas/gens/clTemplates/syr2.cl         |  1209 ++
 src/library/blas/gens/clTemplates/syr2_her2.cl    |   743 ++
 src/library/blas/gens/clTemplates/syr_her.cl      |   577 +
 src/library/blas/gens/clTemplates/trmv.cl         |   931 ++
 src/library/blas/gens/clTemplates/trsv.cl         |   437 +
 src/library/blas/gens/clTemplates/trsv_gemv.cl    |  1487 +++
 src/library/blas/gens/copy_reg.cpp                |   274 +
 src/library/blas/gens/decomposition.c             |   163 +
 src/library/blas/gens/dot.cpp                     |   303 +
 src/library/blas/gens/fetch.c                     |  2190 ++++
 src/library/blas/gens/fetch.h                     |   379 +
 src/library/blas/gens/gbmv.cpp                    |   482 +
 src/library/blas/gens/gemm.c                      |  1447 +++
 src/library/blas/gens/gemm_cached.cpp             |   503 +
 src/library/blas/gens/gemm_tail_cached.cpp        |   461 +
 src/library/blas/gens/gemv.c                      |   685 ++
 src/library/blas/gens/gen_helper.c                |   551 +
 src/library/blas/gens/gen_helper.h                |   138 +
 src/library/blas/gens/gen_init.c                  |   593 +
 src/library/blas/gens/ger_lds.cpp                 |   414 +
 src/library/blas/gens/her2_lds.cpp                |   365 +
 src/library/blas/gens/her_lds.cpp                 |   360 +
 src/library/blas/gens/iamax.cpp                   |   303 +
 src/library/blas/gens/init.h                      |   159 +
 src/library/blas/gens/kprintf.cpp                 |  2435 ++++
 src/library/blas/gens/legacy/blas_kgen_legacy.c   |   625 +
 src/library/blas/gens/legacy/blas_kgen_legacy.h   |   195 +
 src/library/blas/gens/legacy/blkmul.c             |   823 ++
 src/library/blas/gens/legacy/gemm_img.c           |   758 ++
 src/library/blas/gens/legacy/gemm_lds.c           |   562 +
 src/library/blas/gens/legacy/gen_helper_legacy.c  |   448 +
 src/library/blas/gens/legacy/gen_helper_legacy.h  |    77 +
 src/library/blas/gens/legacy/tests/CMakeLists.txt |    63 +
 src/library/blas/gens/legacy/tests/t_blkmul.c     |   733 ++
 src/library/blas/gens/legacy/trmm_img.c           |   850 ++
 src/library/blas/gens/legacy/trmm_lds.c           |   514 +
 src/library/blas/gens/legacy/trsm_cached_lds.c    |  1005 ++
 src/library/blas/gens/legacy/trsm_img.c           |  1165 ++
 src/library/blas/gens/legacy/trsm_kgen_legacy.c   |   190 +
 src/library/blas/gens/legacy/trsm_kgen_legacy.h   |    43 +
 src/library/blas/gens/legacy/trsm_lds.c           |   649 ++
 src/library/blas/gens/legacy/trxm_common_legacy.c |   250 +
 src/library/blas/gens/legacy/trxm_common_legacy.h |    88 +
 src/library/blas/gens/nrm2.cpp                    |   295 +
 src/library/blas/gens/reduction.cpp               |   311 +
 src/library/blas/gens/rotg_reg.cpp                |   216 +
 src/library/blas/gens/rotm_reg.cpp                |   291 +
 src/library/blas/gens/rotmg_reg.cpp               |   215 +
 src/library/blas/gens/scal_reg.cpp                |   268 +
 src/library/blas/gens/swap_reg.cpp                |   275 +
 src/library/blas/gens/symm_cached.cpp             |   279 +
 src/library/blas/gens/symv.c                      |  1141 ++
 src/library/blas/gens/syr2_lds.cpp                |   372 +
 src/library/blas/gens/syr_lds.cpp                 |   367 +
 src/library/blas/gens/syrxk.c                     |  2594 +++++
 src/library/blas/gens/tests/CMakeLists.txt        |    60 +
 src/library/blas/gens/tests/t_tilemul.c           |  1099 ++
 src/library/blas/gens/tile.c                      |   517 +
 src/library/blas/gens/tile.h                      |   424 +
 src/library/blas/gens/tile_iter.c                 |   296 +
 src/library/blas/gens/tile_iter.h                 |    79 +
 src/library/blas/gens/tilemul.c                   |   952 ++
 src/library/blas/gens/trmm.c                      |  1423 +++
 src/library/blas/gens/trmv_reg.cpp                |   490 +
 src/library/blas/gens/trsm.c                      |  1649 +++
 src/library/blas/gens/trsm_kgen.c                 |    50 +
 src/library/blas/gens/trsm_kgen.h                 |    28 +
 src/library/blas/gens/trsv_gemv.cpp               |   553 +
 src/library/blas/gens/trsv_trtri.cpp              |   548 +
 src/library/blas/gens/trxm_common.c               |   289 +
 src/library/blas/gens/trxm_common.h               |   139 +
 src/library/blas/gens/tuned_numbers.c             |   418 +
 src/library/blas/gens/tuned_numbers.h             |    45 +
 src/library/blas/gens/xxmv_common.c               |   346 +
 src/library/blas/gens/xxmv_common.h               |    74 +
 src/library/blas/impl.c                           |   129 +
 src/library/blas/include/blas_funcs.h             |    94 +
 src/library/blas/include/blas_mempat.h            |   378 +
 src/library/blas/include/clblas-internal.h        |   399 +
 src/library/blas/include/events.h                 |    29 +
 src/library/blas/include/kprintf.hpp              |   131 +
 src/library/blas/include/matrix_dims.h            |    81 +
 src/library/blas/include/matrix_props.h           |    70 +
 src/library/blas/include/solution_seq.h           |   178 +
 src/library/blas/init.c                           |   242 +
 src/library/blas/ixamax.c                         |   263 +
 src/library/blas/scimage.c                        |   312 +
 src/library/blas/xasum.c                          |   259 +
 src/library/blas/xaxpy.c                          |   243 +
 src/library/blas/xcopy.c                          |   228 +
 src/library/blas/xdot.c                           |   350 +
 src/library/blas/xgbmv.c                          |   276 +
 src/library/blas/xgemm.c                          |   259 +
 src/library/blas/xgemm2.c                         |   542 +
 src/library/blas/xgemv.c                          |   243 +
 src/library/blas/xger.c                           |   368 +
 src/library/blas/xhemm.c                          |   114 +
 src/library/blas/xhemv.c                          |   190 +
 src/library/blas/xher.c                           |   244 +
 src/library/blas/xher2.c                          |   275 +
 src/library/blas/xher2k.c                         |   246 +
 src/library/blas/xherk.c                          |   211 +
 src/library/blas/xhpmv.c                          |   185 +
 src/library/blas/xnrm2.c                          |   361 +
 src/library/blas/xrot.c                           |   224 +
 src/library/blas/xrotg.c                          |   234 +
 src/library/blas/xrotm.c                          |   173 +
 src/library/blas/xrotmg.c                         |   189 +
 src/library/blas/xscal.c                          |   277 +
 src/library/blas/xshbmv.c                         |   264 +
 src/library/blas/xspmv.c                          |   187 +
 src/library/blas/xswap.c                          |   228 +
 src/library/blas/xsymm.c                          |   436 +
 src/library/blas/xsymv.c                          |   201 +
 src/library/blas/xsyr.c                           |   248 +
 src/library/blas/xsyr2.c                          |   270 +
 src/library/blas/xsyr2k.c                         |   250 +
 src/library/blas/xsyrk.c                          |   233 +
 src/library/blas/xtbmv.c                          |   297 +
 src/library/blas/xtbsv.c                          |   824 ++
 src/library/blas/xtrmm.c                          |   245 +
 src/library/blas/xtrmv.c                          |   417 +
 src/library/blas/xtrsm.c                          |   249 +
 src/library/blas/xtrsv.c                          |   719 ++
 src/library/common/clkern.c                       |   258 +
 src/library/common/devinfo-cache.c                |   907 ++
 src/library/common/devinfo.c                      |   312 +
 src/library/common/gens/dblock_kgen.c             |  1497 +++
 src/library/common/kern_cache.c                   |   443 +
 src/library/common/kerngen_core.c                 |   623 +
 src/library/common/kgen_basic.c                   |   427 +
 src/library/common/kgen_guard.c                   |   159 +
 src/library/common/kgen_loop_helper.c             |   105 +
 src/library/common/list.c                         |   136 +
 src/library/common/misc.c                         |    61 +
 src/library/common/mutex.c                        |   128 +
 src/library/common/tests/CMakeLists.txt           |    65 +
 src/library/common/tests/t_dblock_kgen.c          |  1389 +++
 src/library/common/tests/t_gens_cache.c           |   381 +
 src/library/common/trace_malloc.c                 |   278 +
 src/library/tools/ktest/CMakeLists.txt            |   158 +
 src/library/tools/ktest/config-cmdline.cpp        |   690 ++
 src/library/tools/ktest/config.cpp                |   548 +
 src/library/tools/ktest/config.h                  |   128 +
 src/library/tools/ktest/ktest-common.h            |    32 +
 src/library/tools/ktest/ktest-patterns.h          |   435 +
 src/library/tools/ktest/ktest.cpp                 |   708 ++
 src/library/tools/ktest/ktest.h                   |   100 +
 src/library/tools/ktest/main.cpp                  |   336 +
 src/library/tools/ktest/naive/naive_blas.cpp      |   845 ++
 src/library/tools/ktest/scripts/verify_ktest.bash |   212 +
 src/library/tools/ktest/step-dump.cpp             |   332 +
 src/library/tools/ktest/step.cpp                  |   691 ++
 src/library/tools/ktest/step.h                    |   481 +
 src/library/tools/ktest/steps/gemm.cpp            |   170 +
 src/library/tools/ktest/steps/gemm.h              |    36 +
 src/library/tools/ktest/steps/gemv.cpp            |   143 +
 src/library/tools/ktest/steps/gemv.h              |    36 +
 src/library/tools/ktest/steps/symv.cpp            |   120 +
 src/library/tools/ktest/steps/symv.h              |    36 +
 src/library/tools/ktest/steps/syr2k.cpp           |   153 +
 src/library/tools/ktest/steps/syr2k.h             |    36 +
 src/library/tools/ktest/steps/syrk.cpp            |   136 +
 src/library/tools/ktest/steps/syrk.h              |    36 +
 src/library/tools/ktest/steps/trmm.cpp            |   134 +
 src/library/tools/ktest/steps/trmm.h              |    36 +
 src/library/tools/ktest/steps/trsm.cpp            |   142 +
 src/library/tools/ktest/steps/trsm.h              |    36 +
 src/library/tools/ktest/var.cpp                   |   199 +
 src/library/tools/ktest/var.h                     |   162 +
 src/library/tools/tplgen/CMakeLists.txt           |    20 +
 src/library/tools/tplgen/configure.bat            |    14 +
 src/library/tools/tplgen/tplgen.cpp               |   165 +
 src/library/tools/tune/CMakeLists.txt             |   156 +
 src/library/tools/tune/dimension.c                |   136 +
 src/library/tools/tune/fileio.c                   |   388 +
 src/library/tools/tune/fileio.h                   |    96 +
 src/library/tools/tune/storage_data.c             |   374 +
 src/library/tools/tune/storage_data.h             |   201 +
 src/library/tools/tune/storage_init.c             |   202 +
 src/library/tools/tune/storage_io.c               |   751 ++
 src/library/tools/tune/subdim.c                   |   768 ++
 src/library/tools/tune/subdim.h                   |   143 +
 src/library/tools/tune/toolslib.c                 |   540 +
 src/library/tools/tune/toolslib.h                 |    87 +
 src/library/tools/tune/tune.c                     |  2646 +++++
 src/library/tools/tune/tune.h                     |    43 +
 src/samples/CMakeLists.pack                       |   261 +
 src/samples/CMakeLists.txt                        |   357 +
 src/samples/clBlasVersion.c                       |    41 +
 src/samples/example_chbmv.c                       |   171 +
 src/samples/example_chemm.cpp                     |   178 +
 src/samples/example_cher.c                        |   159 +
 src/samples/example_cher2k.c                      |   185 +
 src/samples/example_cherk.cpp                     |   184 +
 src/samples/example_chpmv.c                       |   169 +
 src/samples/example_chpr.c                        |   166 +
 src/samples/example_csscal.c                      |   141 +
 src/samples/example_dtrmv.c                       |   174 +
 src/samples/example_isamax.c                      |   135 +
 src/samples/example_sasum.c                       |   131 +
 src/samples/example_saxpy.c                       |   155 +
 src/samples/example_scopy.c                       |   161 +
 src/samples/example_sdot.c                        |   147 +
 src/samples/example_sgbmv.c                       |   175 +
 src/samples/example_sgemm.c                       |   192 +
 src/samples/example_sgemv.c                       |   181 +
 src/samples/example_sger.c                        |   174 +
 src/samples/example_snrm2.c                       |   132 +
 src/samples/example_srot.c                        |   165 +
 src/samples/example_srotg.c                       |   137 +
 src/samples/example_srotm.c                       |   171 +
 src/samples/example_srotmg.c                      |   151 +
 src/samples/example_ssbmv.c                       |   171 +
 src/samples/example_sscal.c                       |   141 +
 src/samples/example_sspmv.c                       |   170 +
 src/samples/example_sspr.c                        |   168 +
 src/samples/example_sspr2.c                       |   180 +
 src/samples/example_sswap.c                       |   162 +
 src/samples/example_ssymm.c                       |   178 +
 src/samples/example_ssymv.c                       |   182 +
 src/samples/example_ssyr.c                        |   161 +
 src/samples/example_ssyr2.c                       |   175 +
 src/samples/example_ssyr2k.c                      |   193 +
 src/samples/example_ssyrk.c                       |   175 +
 src/samples/example_stbmv.c                       |   157 +
 src/samples/example_stbsv.c                       |   158 +
 src/samples/example_stpmv.c                       |   158 +
 src/samples/example_stpsv.c                       |   159 +
 src/samples/example_strmm.c                       |   173 +
 src/samples/example_strmv.c                       |   157 +
 src/samples/example_strsm.c                       |   175 +
 src/samples/example_strsv.c                       |   155 +
 src/samples/example_zhemv.cpp                     |   179 +
 src/samples/example_zher2.c                       |   172 +
 src/samples/example_zhpr2.c                       |   179 +
 src/scripts/perf/CMakeLists.txt                   |    30 +
 src/scripts/perf/blasPerformanceTesting.py        |   333 +
 src/scripts/perf/errorHandler.py                  |    68 +
 src/scripts/perf/measurePerformance.py            |   543 +
 src/scripts/perf/performanceUtility.py            |    97 +
 src/scripts/perf/plotPerformance.py               |   309 +
 src/targetver.h                                   |    29 +
 src/tests/BasicRoutines.cpp                       |   102 +
 src/tests/BlasBase.cpp                            |   525 +
 src/tests/CMakeLists.txt                          |   450 +
 src/tests/blas-cblas.c                            |    57 +
 src/tests/blas-wrapper.cpp                        |  2462 ++++
 src/tests/blas.c                                  |  4966 ++++++++
 src/tests/clBLAS-wrapper.cpp                      |  3463 ++++++
 src/tests/cmdline.c                               |   248 +
 src/tests/common.cpp                              |  1011 ++
 src/tests/correctness/BlasBase-corr.cpp           |    41 +
 src/tests/correctness/blas-lapack.c               |   870 ++
 src/tests/correctness/blas-lapack.h               |  1225 ++
 src/tests/correctness/corr-asum.cpp               |   212 +
 src/tests/correctness/corr-axpy.cpp               |   217 +
 src/tests/correctness/corr-copy.cpp               |   211 +
 src/tests/correctness/corr-dot.cpp                |   217 +
 src/tests/correctness/corr-dotc.cpp               |   204 +
 src/tests/correctness/corr-gbmv.cpp               |   248 +
 src/tests/correctness/corr-gemm.cpp               |   233 +
 src/tests/correctness/corr-gemm2.cpp              |   256 +
 src/tests/correctness/corr-gemv.cpp               |   246 +
 src/tests/correctness/corr-ger.cpp                |   265 +
 src/tests/correctness/corr-gerc.cpp               |   252 +
 src/tests/correctness/corr-hbmv.cpp               |   223 +
 src/tests/correctness/corr-hemm.cpp               |   256 +
 src/tests/correctness/corr-hemv.cpp               |   256 +
 src/tests/correctness/corr-her.cpp                |   210 +
 src/tests/correctness/corr-her2.cpp               |   224 +
 src/tests/correctness/corr-her2k.cpp              |   212 +
 src/tests/correctness/corr-herk.cpp               |   240 +
 src/tests/correctness/corr-hpmv.cpp               |   221 +
 src/tests/correctness/corr-hpr.cpp                |   209 +
 src/tests/correctness/corr-hpr2.cpp               |   222 +
 src/tests/correctness/corr-iamax.cpp              |   206 +
 src/tests/correctness/corr-nrm2.cpp               |   218 +
 src/tests/correctness/corr-rot.cpp                |   234 +
 src/tests/correctness/corr-rotg.cpp               |   292 +
 src/tests/correctness/corr-rotm.cpp               |   232 +
 src/tests/correctness/corr-rotmg.cpp              |   283 +
 src/tests/correctness/corr-sbmv.cpp               |   224 +
 src/tests/correctness/corr-scal.cpp               |   215 +
 src/tests/correctness/corr-spmv.cpp               |   220 +
 src/tests/correctness/corr-spr.cpp                |   228 +
 src/tests/correctness/corr-spr2.cpp               |   216 +
 src/tests/correctness/corr-swap.cpp               |   221 +
 src/tests/correctness/corr-symm.cpp               |   281 +
 src/tests/correctness/corr-symv.cpp               |   223 +
 src/tests/correctness/corr-syr.cpp                |   266 +
 src/tests/correctness/corr-syr2.cpp               |   218 +
 src/tests/correctness/corr-syr2k.cpp              |   260 +
 src/tests/correctness/corr-syrk.cpp               |   244 +
 src/tests/correctness/corr-tbmv.cpp               |   233 +
 src/tests/correctness/corr-tbsv.cpp               |   242 +
 src/tests/correctness/corr-tpmv.cpp               |   252 +
 src/tests/correctness/corr-tpsv.cpp               |   252 +
 src/tests/correctness/corr-trmm.cpp               |   215 +
 src/tests/correctness/corr-trmv.cpp               |   258 +
 src/tests/correctness/corr-trsm.cpp               |   454 +
 src/tests/correctness/corr-trsv.cpp               |   252 +
 src/tests/correctness/delta.h                     |    36 +
 src/tests/correctness/tcase-filter.cpp            |   217 +
 src/tests/correctness/tcase-filter.h              |    30 +
 src/tests/correctness/test-correctness.cpp        |  3406 ++++++
 src/tests/correctness/trsm-delta.h                |   240 +
 src/tests/correctness/trsv-delta.h                |   296 +
 src/tests/functional/BlasBase-func.cpp            |   117 +
 src/tests/functional/func-error.cpp               |  1354 +++
 src/tests/functional/func-event.cpp               |  1609 +++
 src/tests/functional/func-images.cpp              |   268 +
 src/tests/functional/func-queue.cpp               |   881 ++
 src/tests/functional/func-thread.cpp              |   938 ++
 src/tests/functional/func.h                       |  2804 +++++
 src/tests/functional/test-functional.cpp          |   111 +
 src/tests/include/BlasBase.h                      |   225 +
 src/tests/include/ExtraTestSizes.h                |   270 +
 src/tests/include/asum.h                          |    81 +
 src/tests/include/axpy.h                          |    94 +
 src/tests/include/blas-cblas.h                    |   243 +
 src/tests/include/blas-internal.h                 |  2003 ++++
 src/tests/include/blas-math.h                     |   369 +
 src/tests/include/blas-random.h                   |  1236 ++
 src/tests/include/blas-wrapper.h                  |  1987 ++++
 src/tests/include/clBLAS-wrapper.h                |  2235 ++++
 src/tests/include/cmdline.h                       |   105 +
 src/tests/include/common.h                        |   697 ++
 src/tests/include/copy.h                          |    83 +
 src/tests/include/dot.h                           |    88 +
 src/tests/include/dotc.h                          |    88 +
 src/tests/include/gbmv.h                          |   183 +
 src/tests/include/gemm-2.h                        |   181 +
 src/tests/include/gemm.h                          |   183 +
 src/tests/include/gemv.h                          |   257 +
 src/tests/include/ger.h                           |   129 +
 src/tests/include/gerc.h                          |   123 +
 src/tests/include/hbmv.h                          |   120 +
 src/tests/include/hemm.h                          |   141 +
 src/tests/include/hemv.h                          |   149 +
 src/tests/include/her.h                           |   175 +
 src/tests/include/her2.h                          |   195 +
 src/tests/include/her2k.h                         |   167 +
 src/tests/include/herk.h                          |   160 +
 src/tests/include/hpmv.h                          |    27 +
 src/tests/include/hpr.h                           |    28 +
 src/tests/include/hpr2.h                          |    25 +
 src/tests/include/iamax.h                         |    78 +
 src/tests/include/matrix.h                        |   798 ++
 src/tests/include/nrm2.h                          |    81 +
 src/tests/include/rot.h                           |    87 +
 src/tests/include/rotg.h                          |    76 +
 src/tests/include/rotm.h                          |    86 +
 src/tests/include/rotmg.h                         |   108 +
 src/tests/include/sbmv.h                          |   177 +
 src/tests/include/scal.h                          |    82 +
 src/tests/include/spmv.h                          |   212 +
 src/tests/include/spr.h                           |    23 +
 src/tests/include/spr2.h                          |    26 +
 src/tests/include/swap.h                          |    89 +
 src/tests/include/symm.h                          |   143 +
 src/tests/include/symv.h                          |   184 +
 src/tests/include/syr.h                           |   129 +
 src/tests/include/syr2.h                          |   136 +
 src/tests/include/syr2k.h                         |   173 +
 src/tests/include/syrk.h                          |   155 +
 src/tests/include/tbmv.h                          |   145 +
 src/tests/include/tbsv.h                          |   224 +
 src/tests/include/test-limits.h                   |    63 +
 src/tests/include/testDG.h                        |    56 +
 src/tests/include/timer.h                         |    58 +
 src/tests/include/tpmv.h                          |    25 +
 src/tests/include/tpsv.h                          |    25 +
 src/tests/include/trmm.h                          |   160 +
 src/tests/include/trmv.h                          |   124 +
 src/tests/include/trsm.h                          |   163 +
 src/tests/include/trsv.h                          |   123 +
 src/tests/performance/BlasBase-perf.cpp           |   118 +
 src/tests/performance/PerformanceRecorder.cpp     |   151 +
 src/tests/performance/PerformanceRecorder.h       |    93 +
 src/tests/performance/PerformanceTest.cpp         |   133 +
 src/tests/performance/PerformanceTest.h           |    59 +
 src/tests/performance/TrxmPerformanceTest.cpp     |   362 +
 src/tests/performance/perf-asum.cpp               |   300 +
 src/tests/performance/perf-axpy.cpp               |   344 +
 src/tests/performance/perf-copy.cpp               |   322 +
 src/tests/performance/perf-dot.cpp                |   316 +
 src/tests/performance/perf-dotc.cpp               |   300 +
 src/tests/performance/perf-gbmv.cpp               |   353 +
 src/tests/performance/perf-gemm.cpp               |   368 +
 src/tests/performance/perf-gemm2.cpp              |   397 +
 src/tests/performance/perf-gemv.cpp               |   344 +
 src/tests/performance/perf-ger.cpp                |   393 +
 src/tests/performance/perf-gerc.cpp               |   384 +
 src/tests/performance/perf-hbmv.cpp               |   321 +
 src/tests/performance/perf-hemm.cpp               |   371 +
 src/tests/performance/perf-hemv.cpp               |   347 +
 src/tests/performance/perf-her.cpp                |   324 +
 src/tests/performance/perf-her2.cpp               |   348 +
 src/tests/performance/perf-her2k.cpp              |   353 +
 src/tests/performance/perf-herk.cpp               |   345 +
 src/tests/performance/perf-hpmv.cpp               |   346 +
 src/tests/performance/perf-hpr.cpp                |   319 +
 src/tests/performance/perf-hpr2.cpp               |   350 +
 src/tests/performance/perf-iamax.cpp              |   303 +
 src/tests/performance/perf-nrm2.cpp               |   302 +
 src/tests/performance/perf-rot.cpp                |   364 +
 src/tests/performance/perf-rotg.cpp               |   418 +
 src/tests/performance/perf-rotm.cpp               |   377 +
 src/tests/performance/perf-rotmg.cpp              |   420 +
 src/tests/performance/perf-sbmv.cpp               |   328 +
 src/tests/performance/perf-scal.cpp               |   336 +
 src/tests/performance/perf-spmv.cpp               |   344 +
 src/tests/performance/perf-spr.cpp                |   337 +
 src/tests/performance/perf-spr2.cpp               |   336 +
 src/tests/performance/perf-swap.cpp               |   352 +
 src/tests/performance/perf-symm.cpp               |   404 +
 src/tests/performance/perf-symv.cpp               |   351 +
 src/tests/performance/perf-syr.cpp                |   340 +
 src/tests/performance/perf-syr2.cpp               |   342 +
 src/tests/performance/perf-syr2k.cpp              |   343 +
 src/tests/performance/perf-syrk.cpp               |   327 +
 src/tests/performance/perf-tbmv.cpp               |   329 +
 src/tests/performance/perf-tbsv.cpp               |   327 +
 src/tests/performance/perf-tpmv.cpp               |   380 +
 src/tests/performance/perf-tpsv.cpp               |   365 +
 src/tests/performance/perf-trmm.cpp               |    74 +
 src/tests/performance/perf-trmv.cpp               |   384 +
 src/tests/performance/perf-trsm.cpp               |    67 +
 src/tests/performance/perf-trsv.cpp               |   353 +
 src/tests/performance/test-performance.cpp        |  1405 +++
 src/tests/timer.c                                 |   125 +
 src/version.h.in                                  |    22 +
 540 files changed, 214287 insertions(+)

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..412eeda
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,22 @@
+# Auto detect text files and perform LF normalization
+* text=auto
+
+# Custom for Visual Studio
+*.cs     diff=csharp
+*.sln    merge=union
+*.csproj merge=union
+*.vbproj merge=union
+*.fsproj merge=union
+*.dbproj merge=union
+
+# Standard to msysgit
+*.doc	 diff=astextplain
+*.DOC	 diff=astextplain
+*.docx diff=astextplain
+*.DOCX diff=astextplain
+*.dot  diff=astextplain
+*.DOT  diff=astextplain
+*.pdf  diff=astextplain
+*.PDF	 diff=astextplain
+*.rtf	 diff=astextplain
+*.RTF	 diff=astextplain
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7ae9f4d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Generated kernel template files
+*.clT
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..9cd3d90
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,276 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+clBLAS Readme
+
+Version:       1.10
+Release Date:  April 2013
+
+ChangeLog:
+____________
+Current Version:
+New:
+  * New Level 1 routines added (an 'x' implies all 4 precisions)
+        xSWAP, xCOPY, xSCAL, CSSCAL, ZDSCAL, xAXPY, SDOT, DDOT, 
+        CDOTU, ZDOTU, CDOTC, ZDOTC, xROTG, SROTMG, DROTMG,
+		SROT, DROT, CSROT, ZDROT, SROTM, DROTM, SNRM2, DNRM2,
+		SCNRM2, DZNRM2, ixAMAX, SASUM, DASUM, SCASUM, DZASUM
+  * Samples have been added for the new functions 
+  * This release tested using the 9.012 runtime driver and the 2.8 APPSDK
+Fixed:
+  * Failures in *trsm functions with clMAGMA tests
+Known Issues:
+  * Failures & hangs in ztrmm, *trsv, *tpsv functions on Southern Island GPU devices
+  * Failures in zgemm functions on Northern Island GPU devices
+  * Failures & hangs are expected to be fixed in the upcoming AMD graphics driver versions.
+		It is strongly recommended that users keep their graphics driver versions up to date. 
+		
+____________
+Version 1.8.291:
+Fixed:
+  * Failures in the following functions: ssyr2, ssyr2k, strsm, strsv, ssyrk, cher, 
+        ctrsv, csymm, cher2, ztrmm on Southern Island GPU devices.
+  * Failures in the following functions: dsyr, dsyr2, dgemv, dsyrk,
+        dsyr2k, zsyr2k on Trinity platforms. 
+Known Issues:
+  * Failures in *trsm functions with clMAGMA tests
+  
+____________
+Version 1.8.269 (Beta, clMAGMA support):
+New:
+  * No new routines
+  * This release tested using the 8.961 runtime driver and the 2.6 APPSDK
+
+Known Issues:
+  * The clBLASTune executable has been observed to hang on Windows.  If 
+        this happens, abort execution of the tune program; it is not required 
+        for correct operation of the BLAS routines (as of 8.872).
+  * clBLAS can return invalid results on CPU devices (as 
+        of 8.961).  The CPU device is primarily a test/debug device, and GPU 
+		devices are unaffected.
+  * clBLAS can return invalid results for double precision functions (dsyr, 
+        dsyr2, dgemv, dsyrk, dsyr2k, zsyr2k) on Trinity platforms (as of 
+        8.961).
+  * clBLAS can return invalid results (ssyr2, ssyr2k, strsm, strsv, ssyrk, cher, 
+        ctrsv, csymm, cher2, ztrmm) on Southern Island GPU devices (as of 8.961).
+
+____________
+Version 1.7 (Beta, clMAGMA support):
+New:
+  * New Level 3 routines added (an 'x' implies all 4 precisions)
+		CHER2K, ZHER2K
+  * New Level 2 routines added (an 'x' implies all 4 precisions)
+        xTPMV, xTPSV, SSPMV, DSPMV, CHPMV, ZHPMV, SSPR, DSPR, CHPR, ZHPR, 
+        SSPR2, DSPR2, CHPR2, ZHPR2, xGBMV, CHBMV, ZHBMV, SSBMV, DSBMV, 
+        xTBMV, xTBSV
+  * Samples have been added for the new functions, but are not fully tested 
+  * This release tested using the 8.951 runtime driver and the 2.6 APPSDK
+  * Note that documentation is incomplete for the new functions
+
+Known Issues:
+  * The clBLASTune executable has been observed to hang on Windows.  If 
+        this happens, abort execution of the tune program; it is not required 
+        for correct operation of the BLAS routines (as of 8.872).
+  * clBLAS can return invalid results on CPU devices that support AVX (as 
+        of 8.951).  CPU devices that support up to SSE3 are unaffected.  The 
+        CPU device is primarily a test/debug device, and GPU devices are 
+        unaffected.
+  * clBLAS can return invalid results for double precision functions (dsyr, 
+        dsyr2, dgemv, dsyrk, dsyr2k, zsyr2k) on Trinity platforms (as of 
+        8.951).
+  * clBLAS can return invalid results (ssyr, ssyr2, strsv, ctrsv, ssyrk, 
+        ssyr2k, ztrmm) on Southern Island GPU devices (as of 8.951).
+
+____________
+Version 1.6:
+New:
+  * New Level 3 routines added (an 'x' implies all 4 precisions)
+        CSYRK, ZSYRK, CSYR2K, ZSYR2K, CHEMM, ZHEMM, CHERK, ZHERK, xSYMM
+  * New Level 2 routines added (an 'x' implies all 4 precisions)
+        CGEMV, ZGEMV, xTRMV, xTRSV, CHEMV, ZHEMV, SGER, DGER, CGERU, ZGERU, 
+		CGERC, ZGERC, CHER, ZHER, CHER2, ZHER2, SSYR, DSYR, SSYR2, DSYR2
+  * For all the original functions prior to 1.6, a new API has been introduced
+        with an *Ex suffix.  These extended API's add new parameters that allow
+		users to specify an offset to a matrix argument.  This allows efficient
+		sub-matrix indexing within a clBLAS routine without requiring expensive
+		sub-matrix copy operations.
+  * Samples have been added for the new functions
+  * Preview: Support for AMD Radeon™ HD7000 series GPUs
+  * This release tested using the 8.92 runtime driver and the 2.6 APP SDK
+
+Known Issues:
+  * The clBLASTune executable has been observed to hang on Windows.  If this
+        happens, abort execution of the tune program; it is not required for 
+		correct operation of the BLAS routines (as of 8.872).
+  * The CPU device for clBLAS is not functioning for this release (as of 
+        8.872).  The CPU device is primarily a test/debug device, and GPU 
+		devices are unaffected.
+
+____________
+Version 1.4:
+New:
+  * New Level 3 routines added
+        SSYRK, DSYRK, SSYR2K, DSYR2K
+  * New Level 2 routines added
+        SGEMV, DGEMV, SSYMV, DSYMV
+  * The image support functions (clblasAddScratchImage, 
+        clblasRemoveScratchImage) have been deprecated.  Images are no 
+		longer required for the highest performance.
+  * InstallShield is now used for APPML libraries.  The default install 
+        location has changed from c:\amd\clBLAS to 
+		C:\Program Files (x86)\AMD\clBLAS.  It is recommended that previous 
+		versions of clBLAS are uninstalled first.
+  * Samples have been added for the new functions
+  * This release tested using the 8.872 runtime driver and the 2.5 APP SDK
+
+Known Issues:
+  * The clBLASTune executable has been observed to hang on Windows.  If this
+        happens, abort execution of the tune program; it is not required for 
+		correct operation of the BLAS routines (as of 8.872).
+  * The CPU device for clBLAS is not functioning for this release (as of 
+        8.872).  The CPU device is primarily a test/debug device, and GPU 
+		devices are unaffected.
+
+
+____________
+Version 1.2:
+  * The library now supports both 32- and 64-bit Windows and Linux operating 
+        systems.
+  * xTRSM routines are available in 1.2.
+  * clBLAS routines return clBLASStatus error codes, instead of native 
+        OpenCL error codes
+
+Fixed:
+  * xTRMM routines were not properly handling implicit unit diagonal 
+        elements and implicit off-diagonal zero values specified by the BLAS 
+        parameters SIDE, UPLO and DIAG.
+  * Possible crash with CPU device on 32-bit systems.
+  * clblasDgemm routine returned an invalid event as its last argument.
+  * clBLAS routines return clblasStatus error codes, instead of 
+        native OpenCL error codes.
+		
+Known Issues:
+  * The clBLASTune executable has been observed to hang on Windows.  If this
+        happens, abort execution of the tune program; it is not required for 
+		correct operation of the BLAS routines (as of 8.872).
+  * The CPU device for clBLAS is not functioning for this release (as of 
+        8.872).  The CPU device is primarily a test/debug device, and GPU 
+		devices are unaffected.
+		
+____________________
+Version 1.0:
+  * Initial release
+
+Known Issues:
+  * Available only on Linux64.
+  * xTRMM routines were not properly handling implicit unit diagonal elements 
+        and implicit off-diagonal zero values specified by the BLAS parameters
+		SIDE, UPLO and DIAG
+  * clblasDgemm returned an invalid event as its last argument
+	  
+_____________
+Building the Samples:
+
+To install the Linux versions of clBLAS, uncompress the initial download, then 
+execute the install script.
+
+For example:
+
+	tar -xf clBLAS-${version}-Linux.tar.gz
+		- This installs three files into the local directory, one being an 
+            executable bash script.
+
+	sudo mkdir /opt/clBLAS-${version}
+		- This pre-creates the install directory with proper permissions 
+            in /opt if it is to be installed there. (This is the default.)
+
+	./install-clBLAS-${version}.sh
+        - This prints an EULA and uncompresses files into the chosen install 
+		directory.
+
+	cd ${installDir}/bin64
+	export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${OpenCLLibDir}:${clBLASLibDir}
+		- Be sure to export library dependencies to resolve all external 
+            linkages to the client program; you can create a bash script to 
+			help automate this procedure.
+
+	./example_sgemm
+		- Run a simple client; one example is provided for each supported 
+                  main BLAS function family.
+
+The sample program does not ship with native build files; instead, a CMake 
+file is shipped, and the user generates a native build file for their system.
+
+For example:
+
+	cd ${installDir}
+
+	mkdir samplesBin/
+		- This creates a sister directory to the samples directory that 
+                  houses the native makefiles and the generated files from the 
+                  build.
+
+	cd samplesBin/
+	ccmake ../samples/
+		- ccmake is a curses-based cmake program; it takes a parameter 
+                  that specifies the location of the source code to compile.
+		- Hit 'c' to configure for the platform; ensure that the 
+                  dependencies to external libraries are satisfied, including 
+                  paths to 'ATI Stream SDK'.
+		- After dependencies are satisfied, hit 'c' again to finalize 
+                  configuration. Then, hit 'g' to generate a makefile and 
+                  exit ccmake.
+
+	make help
+		- Look at the options available for make.
+
+	make
+		- Build the sample client program.
+
+	./example_sgemm
+		- Run a simple client; one example is provided for each supported main 
+		BLAS function family.
+_______________________________________________________________________________
+(C) 2010-2013 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD 
+Arrow logo, ATI, the ATI logo, Radeon, FireStream, FireGL, Catalyst, and 
+combinations thereof are trademarks of Advanced Micro Devices, Inc. Microsoft 
+(R), Windows, and Windows Vista (R) are registered trademarks of Microsoft 
+Corporation in the U.S. and/or other jurisdictions. OpenCL and the OpenCL logo 
+are trademarks of Apple Inc. used by permission by Khronos. Other names are for 
+informational purposes only and may be trademarks of their respective owners.
+
+The contents of this document are provided in connection with Advanced Micro 
+Devices, Inc. ("AMD") products. AMD makes no representations or warranties with 
+respect to the accuracy or completeness of the contents of this publication and 
+reserves the right to make changes to specifications and product descriptions 
+at any time without notice. The information contained herein may be of a 
+preliminary or advance nature and is subject to change without notice. No 
+license, whether express, implied, arising by estoppel or otherwise, to any 
+intellectual property rights is granted by this publication. Except as set forth
+in AMD's Standard Terms and Conditions of Sale, AMD assumes no liability 
+whatsoever, and disclaims any express or implied warranty, relating to its 
+products including, but not limited to, the implied warranty of 
+merchantability, fitness for a particular purpose, or infringement of any 
+intellectual property right.
+
+AMD's products are not designed, intended, authorized or warranted for use as 
+components in systems intended for surgical implant into the body, or in other 
+applications intended to support or sustain life, or in any other application 
+in which the failure of AMD's product could create a situation where personal 
+injury, death, or severe property or environmental damage may occur. AMD 
+reserves the right to discontinue or make changes to its products at any time 
+without notice.
+_______________________________________________________________________________
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..61932f1
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,36 @@
+## Contributor guidelines
+
+Contributing code to this project is intended to be lightweight and intuitive to users familiar with GitHub to actively encourage contributions, but a process is documented and should be followed to prevent chaos, confusion and despair.  
+
+## The mechanics of contributing code
+Firstly, in order to contribute code to this project, a contributor must have a valid and current [GitHub account](https://help.github.com/articles/set-up-git) available to use.  Given an account,
+* The potential contributor forks this project into his/her account following the traditional [forking](https://help.github.com/articles/fork-a-repo) model native to GitHub
+* After forking, the contributor [clones their repository](https://help.github.com/articles/create-a-repo) locally on their machine
+* Code is developed and checked into the contributor's repository.  These commits are eventually pushed upstream to their GitHub repository
+* The contributor then issues a [pull-request](https://help.github.com/articles/using-pull-requests) against the **develop** branch of this repository, following the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which is well suited for working with GitHub
+    * A [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but it requires manual installation by the user.  Refer to the project's wiki
+
+At this point, the repository maintainers will be notified by GitHub that a 'pull request' exists pending against their repository.  A code review should be completed within a few days, depending on the scope of submitted code, and the code will either be accepted, rejected or commented on for extra feedback.
+
+## Code submission guidelines
+We want to ensure that the project code base maintains a level of quality over time, such that future contributors find it as easy to jump into the code as hopefully it is today.  As such, pull requests should 
+* remember that clMath is a project licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 ).  If you are not already familiar, please review the license before issuing a pull request.  We intend this project to be open to external contributors, and encourage developers to contribute code back that they believe will provide value to the overall community.  We will interpret an explicit 'pull request' back to this repository as an implicit acknowledge [...]
+* follow the [code style guidelines]( ) of the project as posted to the project wiki.  Unfortunately, there were no unifying code guidelines defined between the BLAS & FFT projects, but code submissions should not mix styles within an individual file.  We have since defined and posted a code style guideline for the projects and we expect the code to slowly transition to the new
+guidelines over time
+    *  separate check-ins that modify a file's style from the ones that add/change/delete code.
+* target the **develop** branch in the repository
+* ensure that the [code properly builds]( https://github.com/kknox/clBLAS/wiki/Build )
+* cannot break existing test cases
+    * we encourage contributors to [run the test-short]( https://github.com/kknox/clBLAS/wiki/Testing ) suite of tests on their end before the pull-request
+        * if possible, upload the test results associated with the pull request to a personal [gist repository]( https://gist.github.com/ ) and insert a link to the test results in the pull request so that collaborators can browse the results
+        * if no test results are provided with the pull request, official collaborators will run the test suite on their test machines against the patch before we will accept the pull-request
+            * if we detect failing test cases, we will request that the code associated with the pull request be fixed before the pull request will be merged
+    * if new functionality is introduced with the pull request, sufficient test cases should be added to verify the new functionality is correct
+        * new tests should integrate with the existing [googletest framework]( https://code.google.com/p/googletest/wiki/Primer ) located in the src/tests directory of the repo
+        * if the collaborators feel the new tests do not provide sufficient coverage, feedback on the pull request will be left with suggestions on how to improve the tests before the pull request will be merged
+
+Pull requests will be reviewed by the set of collaborators that are assigned for the repository.  Pull requests may be accepted, declined or a conversation may start on the pull request thread with feedback.  If the pull request is trivial and all the submission guidelines defined above are honored, the pull request may be accepted without delay.  If the pull request is good, but the guidelines defined above are not followed, the collaborators may leave feedback on the pull request and e [...]
+
+## Is it possible to become an official collaborator of the repository?
+Yes, we hope to promote trusted members of the community, who have proven themselves to be competent and request to take on the extra responsibility to be official collaborators of the project.  When an individual requests to be an official collaborator, current project collaborators will browse through the history of the requester's prior pull requests and take a vote amongst themselves if the requester should be promoted to collaborator.  These individuals will then have the right to a [...]
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..16619d3
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,5 @@
+AMD clBLAS
+    Copyright 2013 Advanced Micro Devices, Inc.
+
+    This product includes software developed at
+    Advanced Micro Devices, Inc. (http://www.amd.com).
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6b763cf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,159 @@
+clBLAS
+=====
+
+clMATH is a software library containing FFT and BLAS functions written in OpenCL. In addition to GPU devices, the libraries also support running on CPU devices to facilitate debugging and multicore programming.
+
+<a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/">APPML 1.10</a> is the most current generally available version of the library, and pre-built binaries are available for download on both Linux and Windows platforms.
+
+This repository houses the code for the OpenCL™ BLAS portion of APPML.  The complete set of BLAS level 1, 2 & 3 routines has been  implemented. Please see <a href="http://www.netlib.org/blas/index.html"> Netlib BLAS </a> for the list of routines.  For more information on supported graphics cards, see the <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/system-requirements-driver-compatibility/">AMD APP System Requirements</a>.
+
+The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing.  clBLAS interfaces do not hide or wrap OpenCL interfaces, but rather leave OpenCL state management to the control of the user to allow for maximum performance and flexibility.  The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code t [...]
+
+## clBLAS Wiki
+The [project wiki](https://github.com/kknox/clBLAS/wiki) contains helpful documentation, including a [build primer](https://github.com/kknox/clBLAS/wiki/Build)
+
+## Contributing code
+Please refer to and read the [Contributing](CONTRIBUTING.md) document for guidelines on how to contribute code to this open source project
+
+## License
+The source for clBLAS is licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
+
+## Example
+The simple example below shows how to use clBLAS to compute an OpenCL accelerated SGEMM
+
+```c
+#include <sys/types.h>
+#include <stdio.h>
+
+/* Include the clBLAS header. It includes the appropriate OpenCL headers
+ */
+#include <clBLAS.h>
+
+/* This example uses small, predefined matrices and dimensions for
+ * simplicity: A holds M x K values, B holds K x N and C holds M x N.
+ */
+
+#define M  4
+#define N  3
+#define K  5
+
+static const cl_float alpha = 10;
+
+static const cl_float A[M*K] = {
+    11, 12, 13, 14, 15,
+    21, 22, 23, 24, 25,
+    31, 32, 33, 34, 35,
+    41, 42, 43, 44, 45,
+};
+static const size_t lda = K;        /* leading dimension of A: K elements per row */
+
+static const cl_float B[K*N] = {
+    11, 12, 13,
+    21, 22, 23,
+    31, 32, 33,
+    41, 42, 43,
+    51, 52, 53,
+};
+static const size_t ldb = N;        /* leading dimension of B: N elements per row */
+
+static const cl_float beta = 20;
+
+static cl_float C[M*N] = {
+    11, 12, 13,
+    21, 22, 23,
+    31, 32, 33,
+    41, 42, 43, 
+};
+static const size_t ldc = N;        /* leading dimension of C: N elements per row */
+
+static cl_float result[M*N];
+
+int main( void )
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufB, bufC;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Set up the OpenCL environment (first platform, one GPU device). NOTE(review): err is reassigned by each call in this function but never checked. */
+    err = clGetPlatformIDs( 1, &platform, NULL );
+    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
+    queue = clCreateCommandQueue( ctx, device, 0, &err );
+
+    /* Set up clBLAS; paired with the clblasTeardown() call below. */
+    err = clblasSetup( );
+
+    /* Create device buffers for A, B and C, then copy the host matrices into them with blocking writes. */
+    bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
+                          NULL, &err );
+    bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
+                          NULL, &err );
+    bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
+                          NULL, &err );
+
+    err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
+        M * K * sizeof( *A ), A, 0, NULL, NULL );
+    err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
+        K * N * sizeof( *B ), B, 0, NULL, NULL );
+    err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
+        M * N * sizeof( *C ), C, 0, NULL, NULL );
+
+    /* Call the clBLAS SGEMM function on the full row-major matrices (all buffer offsets are 0), using one command queue. */
+    err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, 
+							M, N, K,
+							alpha, bufA, 0, lda,
+							bufB, 0, ldb, beta,
+							bufC, 0, ldc,
+							1, &queue, 0, NULL, &event );
+
+    /* Wait for the event returned by the SGEMM call to complete. */
+    err = clWaitForEvents( 1, &event );
+
+    /* Copy the contents of bufC back into the host array result (blocking read). */
+    err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
+                                M * N * sizeof(*result),
+                                result, 0, NULL, NULL );
+
+    /* Release OpenCL memory objects. NOTE(review): event is not released anywhere in this function. */
+    clReleaseMemObject( bufC );
+    clReleaseMemObject( bufB );
+    clReleaseMemObject( bufA );
+
+    /* Finalize work with clBLAS. */
+    clblasTeardown( );
+
+    /* Release the remaining OpenCL objects: command queue, then context. */
+    clReleaseCommandQueue( queue );
+    clReleaseContext( ctx );
+
+    return ret;
+}
+```
+
+## Build dependencies
+### Library for Windows
+*  Windows® 7/8
+*  Visual Studio 2010 SP1
+*  An OpenCL SDK, such as APP SDK 2.8
+*  Latest CMake
+
+### Library for Linux
+*  GCC 4.6 and onwards
+*  An OpenCL SDK, such as APP SDK 2.8
+*  Latest CMake
+
+### Test infrastructure
+* Latest Googletest
+* Latest ACML 
+* Latest Boost
+
+### Performance infrastructure
+* Python
\ No newline at end of file
diff --git a/doc/clBLAS.doxy b/doc/clBLAS.doxy
new file mode 100644
index 0000000..86fbbfc
--- /dev/null
+++ b/doc/clBLAS.doxy
@@ -0,0 +1,1902 @@
+# Doxyfile 1.8.4
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file 
+# that follow. The default is UTF-8 which is also the encoding used for all 
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the 
+# iconv built into libc) for the transcoding. See 
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should 
+# identify the project. Note that if you do not use Doxywizard you need 
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = clBLAS
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
+# This could be handy for archiving the generated documentation or 
+# if some version control system is used.
+
+PROJECT_NUMBER         = 2.0
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description 
+# for a project that appears at the top of each page and should give viewer 
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = 
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is 
+# included in the documentation. The maximum height of the logo should not 
+# exceed 55 pixels and the maximum width should not exceed 200 pixels. 
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           = 
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
+# base path where the generated documentation will be put. 
+# If a relative path is entered, it will be relative to the location 
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = F:\code\git-svn\clBLAS.head\bin\master\vs10x64.superbuild\docs
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
+# 4096 sub-directories (in 2 levels) under the output directory of each output 
+# format and will distribute the generated files over these directories. 
+# Enabling this option can be useful when feeding doxygen a huge amount of 
+# source files, where putting all generated files in the same directory would 
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
+# documentation generated by doxygen is written. Doxygen will use this 
+# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: 
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, 
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, 
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English 
+# messages), Korean, Korean-en, Latvian, Lithuanian, Norwegian, Macedonian, 
+# Persian, Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, 
+# Slovak, Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
+# include brief member descriptions after the members that are listed in 
+# the file and class documentation (similar to JavaDoc). 
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
+# the brief description of a member or function before the detailed description. 
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator 
+# that is used to form the text in various listings. Each string 
+# in this list, if found as the leading text of the brief description, will be 
+# stripped from the text and the result after processing the whole list, is 
+# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically 
+# replaced with the name of the entity): "The $name class" "The $name widget" 
+# "The $name file" "is" "provides" "specifies" "contains" 
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = 
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
+# Doxygen will generate a detailed section even if there is only a brief 
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 
+# inherited members of a class in the documentation of that class as if those 
+# members were ordinary class members. Constructors, destructors and assignment 
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
+# path before files name in the file list and in the header files. If set 
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
+# can be used to strip a user-defined part of the path. Stripping is 
+# only done if one of the specified strings matches the left-hand part of 
+# the path. The tag can be used to show relative paths in the file list. 
+# If left blank the directory from which doxygen is run is used as the 
+# path to strip. Note that you specify absolute paths here, but also 
+# relative paths, which will be relative from the directory where doxygen is 
+# started.
+
+STRIP_FROM_PATH        = 
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 
+# the path mentioned in the documentation of a class, which tells 
+# the reader which header file to include in order to use a class. 
+# If left blank only the name of the header file containing the class 
+# definition is used. Otherwise one should specify the include paths that 
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    = 
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
+# (but less readable) file names. This can be useful if your file system 
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
+# will interpret the first line (until the first dot) of a JavaDoc-style 
+# comment as the brief description. If set to NO, the JavaDoc 
+# comments will behave just like regular Qt-style comments 
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will 
+# interpret the first line (until the first dot) of a Qt-style 
+# comment as the brief description. If set to NO, the comments 
+# will behave just like regular Qt-style comments (thus requiring 
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
+# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
+# comments) as a brief description. This used to be the default behaviour. 
+# The new default is to treat a multi-line C++ comment block as a detailed 
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
+# member inherits the documentation from any documented member that it 
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 
+# a new page for each member. If set to NO, the documentation of a member will 
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act 
+# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
+# put the command \sideeffect (or @sideeffect) in the documentation, which 
+# will result in a user-defined paragraph with heading "Side Effects:". 
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                = 
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only). 
+# A mapping has the form "name=value". For example adding 
+# "class=itcl::class" will allow you to use the command class in the 
+# itcl::class meaning.
+
+TCL_SUBST              = 
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 
+# sources only. Doxygen will then generate output that is more tailored for C. 
+# For instance, some of the names that are used will be different. The list 
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Java. For instance, namespaces will be presented as packages, qualified 
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL 
+# sources. Doxygen will then generate output that is tailored for 
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it 
+# parses. With this tag you can assign which parser to use for a given 
+# extension. Doxygen has a built-in mapping, but you can override or extend it 
+# using this tag. The format is ext=language, where ext is a file extension, 
+# and language is one of the parsers supported by doxygen: IDL, Java, 
+# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, 
+# C++. For instance to make doxygen treat .inc files as Fortran files (default 
+# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note 
+# that for custom extensions you also need to set FILE_PATTERNS otherwise the 
+# files are not read by doxygen.
+
+EXTENSION_MAPPING      = 
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all 
+# comments according to the Markdown format, which allows for more readable 
+# documentation. See http://daringfireball.net/projects/markdown/ for details. 
+# The output of markdown processing is further processed by doxygen, so you 
+# can mix doxygen, HTML, and XML commands with Markdown formatting. 
+# Disable only in case of backward compatibilities issues.
+
+MARKDOWN_SUPPORT       = YES
+
+# When enabled doxygen tries to link words that correspond to documented 
+# classes, or namespaces to their corresponding documentation. Such a link can 
+# be prevented in individual cases by putting a % sign in front of the word 
+# or globally by setting AUTOLINK_SUPPORT to NO.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want 
+# to include (a tag file for) the STL sources as input, then you should 
+# set this tag to YES in order to let doxygen match functions declarations and 
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. 
+# func(std::string) {}). This also makes the inheritance and collaboration 
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to 
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. 
+# Doxygen will parse them like normal C++ but will assume all classes use public 
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate 
+# getter and setter methods for a property. Setting this option to YES (the 
+# default) will make doxygen replace the get and set methods by a property in 
+# the documentation. This will only work if the methods are indeed getting or 
+# setting a simple type. If this is not the case, or you want to show the 
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
+# tag is set to YES, then doxygen will reuse the documentation of the first 
+# member in the group (if any) for the other members of the group. By default 
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
+# the same type (for instance a group of public functions) to be put as a 
+# subgroup of that type (e.g. under the Public Functions section). Set it to 
+# NO to prevent subgrouping. Alternatively, this can be done per class using 
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and 
+# unions are shown inside the group in which they are included (e.g. using 
+# @ingroup) instead of on a separate page (for HTML and Man pages) or 
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and 
+# unions with only public data fields or simple typedef fields will be shown 
+# inline in the documentation of the scope in which they are defined (i.e. file, 
+# namespace, or group documentation), provided this scope is documented. If set 
+# to NO (the default), structs, classes, and unions are shown on a separate 
+# page (for HTML and Man pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum 
+# is documented as struct, union, or enum with the name of the typedef. So 
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct 
+# with name TypeT. When disabled the typedef will appear as a member of a file, 
+# namespace, or class. And the struct will be named TypeS. This can typically 
+# be useful for C code in case the coding convention dictates that all compound 
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This 
+# cache is used to resolve symbols given their name and scope. Since this can 
+# be an expensive process and often the same symbol appears multiple times in 
+# the code, doxygen keeps a cache of pre-resolved symbols. If the cache is too 
+# small doxygen will become slower. If the cache is too large, memory is wasted. 
+# The cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid 
+# range is 0..9, the default is 0, corresponding to a cache size of 2^16 = 65536 
+# symbols.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
+# documentation are documented, even if no documentation was available. 
+# Private class members and static file members will be hidden unless 
+# the EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal 
+# scope will be included in the documentation.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file 
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
+# defined locally in source files will be included in the documentation. 
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local 
+# methods, which are defined in the implementation section but not in 
+# the interface are included in the documentation. 
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be 
+# extracted and appear in the documentation as a namespace called 
+# 'anonymous_namespace{file}', where file will be replaced with the base 
+# name of the file that contains the anonymous namespace. By default 
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
+# undocumented members of documented classes, files or namespaces. 
+# If set to NO (the default) these members will be included in the 
+# various overviews, but no documentation section is generated. 
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
+# undocumented classes that are normally visible in the class hierarchy. 
+# If set to NO (the default) these classes will be included in the various 
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
+# friend (class|struct|union) declarations. 
+# If set to NO (the default) these declarations will be included in the 
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
+# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the 
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation 
+# that is typed after a \internal command is included. If the tag is set 
+# to NO (the default) then the documentation will be excluded. 
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
+# file names in lower-case letters. If set to YES upper-case letters are also 
+# allowed. This is useful if you have classes or files whose names only differ 
+# in case and if your file system supports case sensitive file names. Windows 
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
+# will show members with their full class and namespace scopes in the 
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
+# will put a list of the files that are included by a file in the documentation 
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen 
+# will list include files with double quotes in the documentation 
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
+# will sort the (detailed) documentation of file and class members 
+# alphabetically by member name. If set to NO the members will appear in 
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 
+# brief documentation of file, namespace and class members alphabetically 
+# by member name. If set to NO (the default) the members will appear in 
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen 
+# will sort the (brief and detailed) documentation of class members so that 
+# constructors and destructors are listed first. If set to NO (the default) 
+# the constructors will appear in the respective orders defined by 
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. 
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO 
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the 
+# hierarchy of group names into alphabetical order. If set to NO (the default) 
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 
+# sorted by fully-qualified names, including namespaces. If set to 
+# NO (the default), the class list will be sorted only by class name, 
+# not including the namespace part. 
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
+# Note: This option applies only to the class list, not to the 
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to 
+# do proper type resolution of all parameters of a function it will reject a 
+# match between the prototype and the implementation of a member function even 
+# if there is only one candidate or it is obvious which candidate to choose 
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen 
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or 
+# disable (NO) the todo list. This list is created by putting \todo 
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or 
+# disable (NO) the test list. This list is created by putting \test 
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or 
+# disable (NO) the bug list. This list is created by putting \bug 
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
+# disable (NO) the deprecated list. This list is created by putting 
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional 
+# documentation sections, marked by \if section-label ... \endif 
+# and \cond section-label ... \endcond blocks.
+
+ENABLED_SECTIONS       = 
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
+# the initial value of a variable or macro consists of for it to appear in 
+# the documentation. If the initializer consists of more lines than specified 
+# here it will be hidden. Use a value of 0 to hide initializers completely. 
+# The appearance of the initializer of individual variables and macros in the 
+# documentation can be controlled using \showinitializer or \hideinitializer 
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
+# at the bottom of the documentation of classes and structs. If set to YES the 
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
+# This will remove the Files entry from the Quick Index and from the 
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the 
+# Namespaces page.  This will remove the Namespaces entry from the Quick Index 
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that 
+# doxygen should invoke to get the current version for each file (typically from 
+# the version control system). Doxygen will invoke the program by executing (via 
+# popen()) the command <command> <input-file>, where <command> is the value of 
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 
+# provided by doxygen. Whatever the program writes to standard output 
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    = 
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed 
+# by doxygen. The layout file controls the global structure of the generated 
+# output files in an output format independent way. To create the layout file 
+# that represents doxygen's defaults, run doxygen with the -l option. 
+# You can optionally specify a file name after the option, if omitted 
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            = 
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files 
+# containing the references data. This must be a list of .bib files. The 
+# .bib extension is automatically appended if omitted. Using this command 
+# requires the bibtex tool to be installed. See also 
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style 
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this 
+# feature you need bibtex and perl available in the search path. Do not use 
+# file names with spaces, bibtex cannot handle them.
+
+CITE_BIB_FILES         = 
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated 
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are 
+# generated by doxygen. Possible values are YES and NO. If left blank 
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
+# potential errors in the documentation, such as not documenting some 
+# parameters in a documented function, or documenting parameters that 
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for 
+# functions that are documented, but have no documentation for their parameters 
+# or return value. If set to NO (the default) doxygen will only warn about 
+# wrong or incomplete parameter documentation, but not about the absence of 
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that 
+# doxygen can produce. The string should contain the $file, $line, and $text 
+# tags, which will be replaced by the file and line number from which the 
+# warning originated and the warning text. Optionally the format may contain 
+# $version, which will be replaced by the version of the file (if it could 
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning 
+# and error messages should be written. If left blank the output is written 
+# to stderr.
+
+WARN_LOGFILE           = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain 
+# documented source files. You may enter file names like "myfile.cpp" or 
+# directories like "/usr/src/myproject". Separate the files or directories 
+# with spaces.
+
+INPUT                  = clBLAS.h \
+                         include/cltypes.h \
+                         include/kerngen.h \
+                         include/solver.h \
+                         include/mempat.h \
+                         src/blas/gens/blas_kgen.h \
+                         src/blas/include/clblas-internal.h \
+                         src/blas/include/kernel_extra.h \
+                         src/blas/include/solution_seq.h \
+                         include/granulation.h \
+                         src/tools/ktest/step.h
+
+# This tag can be used to specify the character encoding of the source files 
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
+# also the default input encoding. Doxygen uses libiconv (or the iconv built 
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for 
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the 
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank the following patterns are tested: 
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh 
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py 
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = 
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories 
+# should be searched for input files as well. Possible values are YES and NO. 
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be 
+# excluded from the INPUT source files. This way you can easily exclude a 
+# subdirectory from a directory tree whose root is specified with the INPUT tag. 
+# Note that relative paths are relative to the directory from which doxygen is 
+# run.
+
+EXCLUDE                = 
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or 
+# directories that are symbolic links (a Unix file system feature) are excluded 
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the 
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
+# certain files from those directories. Note that the wildcards are matched 
+# against the file with absolute path, so to exclude all test directories 
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = 
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names 
+# (namespaces, classes, functions, etc.) that should be excluded from the 
+# output. The symbol name can be a fully qualified name, a word, or if the 
+# wildcard * is used, a substring. Examples: ANamespace, AClass, 
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = 
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or 
+# directories that contain example code fragments that are included (see 
+# the \include command).
+
+EXAMPLE_PATH           = samples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank all files are included.
+
+EXAMPLE_PATTERNS       = 
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
+# searched for input files to be used with the \include or \dontinclude 
+# commands irrespective of the value of the RECURSIVE tag. 
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or 
+# directories that contain images that are included in the documentation (see 
+# the \image command).
+
+IMAGE_PATH             = 
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should 
+# invoke to filter for each input file. Doxygen will invoke the filter program 
+# by executing (via popen()) the command <filter> <input-file>, where <filter> 
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
+# input file. Doxygen will then use the output that the filter program writes 
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be ignored. 
+# Note that the filter must not add or remove lines; it is applied before the 
+# code is scanned, but not when the output code is generated. If lines are added 
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER           = 
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 
+# basis.  Doxygen will compare the file name with each pattern and apply the 
+# filter if there is a match.  The filters are a list of the form: 
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 
+# info on how filters are used. If FILTER_PATTERNS is empty or if 
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        = 
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
+# INPUT_FILTER) will be used to filter the input files when producing source 
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file 
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) 
+# and it is also possible to disable source filtering for a specific pattern 
+# using *.ext= (so without naming a filter). This option only has effect when 
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS = 
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that 
+# is part of the input, its contents will be placed on the main page 
+# (index.html). This can be useful if you have a project on for instance GitHub 
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE = 
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
+# be generated. Documented entities will be cross-referenced with these sources. 
+# Note: To get rid of all source code in the generated output, make sure also 
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body 
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
+# doxygen to hide any special comment blocks from generated source code 
+# fragments. Normal C, C++ and Fortran comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES 
+# then for each documented function all documented 
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES 
+# then for each documented function all documented entities 
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) 
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from 
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will 
+# link to the source code.  Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code 
+# will point to the HTML generated by the htags(1) tool instead of doxygen 
+# built-in source browser. The htags tool is part of GNU's global source 
+# tagging system (see http://www.gnu.org/software/global/global.html). You 
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
+# will generate a verbatim copy of the header file for each class for 
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+# If CLANG_ASSISTED_PARSING is set to YES, then doxygen will use the clang parser 
+# for more accurate parsing at the cost of reduced performance. This can be 
+# particularly helpful with template rich C++ code for which doxygen's built-in 
+# parser lacks the necessary type information.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If clang assisted parsing is enabled you can provide the compiler with command 
+# line options that you would normally use when invoking the compiler. Note that 
+# the include paths will already be set by doxygen for the files and directories 
+# specified at INPUT and INCLUDE_PATH.
+
+CLANG_OPTIONS          = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
+# of all compounds will be generated. Enable this if the project 
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all 
+# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard header. Note that when using a custom header you are responsible  
+# for the proper inclusion of any scripts and style sheets that doxygen 
+# needs, which is dependent on the configuration options used. 
+# It is advised to generate a default header using "doxygen -w html 
+# header.html footer.html stylesheet.css YourConfigFile" and then modify 
+# that header. Note that the header is subject to change so you typically 
+# have to redo this when upgrading to a newer version of doxygen or when 
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            = 
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard footer.
+
+HTML_FOOTER            = 
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
+# style sheet that is used by each HTML page. It can be used to 
+# fine-tune the look of the HTML output. If left blank doxygen will 
+# generate a default style sheet. Note that it is recommended to use 
+# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this 
+# tag will in the future become obsolete.
+
+HTML_STYLESHEET        = 
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional 
+# user-defined cascading style sheet that is included after the standard 
+# style sheets created by doxygen. Using this option one can overrule 
+# certain style aspects. This is preferred over using HTML_STYLESHEET 
+# since it does not replace the standard style sheet and is therefore more 
+# robust against future updates. Doxygen will copy the style sheet file to 
+# the output directory.
+
+HTML_EXTRA_STYLESHEET  = 
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or 
+# other source files which should be copied to the HTML output directory. Note 
+# that these files will be copied to the base HTML output directory. Use the 
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these 
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that 
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       = 
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. 
+# Doxygen will adjust the colors in the style sheet and background images 
+# according to this color. Hue is specified as an angle on a colorwheel, 
+# see http://en.wikipedia.org/wiki/Hue for more information. 
+# For instance the value 0 represents red, 60 is yellow, 120 is green, 
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. 
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of 
+# the colors in the HTML output. For a value of 0 the output will use 
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to 
+# the luminance component of the colors in the HTML output. Values below 
+# 100 gradually make the output lighter, whereas values above 100 make 
+# the output darker. The value divided by 100 is the actual gamma applied, 
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2, 
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML 
+# page will contain the date and time when the page was generated. Setting 
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML 
+# documentation will contain sections that can be hidden and shown after the 
+# page has loaded.
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of 
+# entries shown in the various tree structured indices initially; the user 
+# can expand and collapse entries dynamically later on. Doxygen will expand 
+# the tree to such a level that at most the specified number of entries are 
+# visible (unless a fully collapsed tree already exceeds this amount). 
+# So setting the number of entries 1 will produce a full collapsed tree by 
+# default. 0 is a special value representing an infinite number of entries 
+# and will result in a full expanded tree by default.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files 
+# will be generated that can be used as input for Apple's Xcode 3 
+# integrated development environment, introduced with OSX 10.5 (Leopard). 
+# To create a documentation set, doxygen will generate a Makefile in the 
+# HTML output directory. Running make will produce the docset in that 
+# directory and running "make install" will install the docset in 
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find 
+# it at startup. 
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html 
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the 
+# feed. A documentation feed provides an umbrella under which multiple 
+# documentation sets from a single provider (such as a company or product suite) 
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that 
+# should uniquely identify the documentation set bundle. This should be a 
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen 
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely 
+# identify the documentation publisher. This should be a reverse domain-name 
+# style string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
+# will be generated that can be used as input for tools like the 
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) 
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
+# be used to specify the file name of the resulting .chm file. You 
+# can add a path in front of the file if the result should not be 
+# written to the html output directory.
+
+CHM_FILE               = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
+# be used to specify the location (absolute path including file name) of 
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
+# controls if a separate .chi index file is generated (YES) or that 
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING 
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file 
+# content.
+
+CHM_INDEX_ENCODING     = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
+# controls whether a binary table of contents is generated (YES) or a 
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members 
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and 
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated 
+# that can be used as input for Qt's qhelpgenerator to generate a 
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can 
+# be used to specify the file name of the resulting .qch file. 
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               = 
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating 
+# Qt Help Project output. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating 
+# Qt Help Project output. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to 
+# add. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   = 
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the 
+# custom filter to add. For more information please see 
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters"> 
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  = 
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this 
+# project's 
+# filter section matches. 
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes"> 
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  = 
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can 
+# be used to specify the location of Qt's qhelpgenerator. 
+# If non-empty doxygen will try to run qhelpgenerator on the generated 
+# .qhp file.
+
+QHG_LOCATION           = 
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files  
+# will be generated, which together with the HTML files, form an Eclipse help 
+# plugin. To install this plugin and make it available under the help contents 
+# menu in Eclipse, the contents of the directory containing the HTML and XML 
+# files needs to be copied into the plugins directory of eclipse. The name of 
+# the directory within the plugins directory should be the same as 
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before 
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin 
+# the directory name containing the HTML and XML files should also have 
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) 
+# at top of each HTML page. The value NO (the default) enables the index and 
+# the value YES disables it. Since the tabs have the same information as the 
+# navigation tree you can set this option to YES if you already set 
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index 
+# structure should be generated to display hierarchical information. 
+# If the tag value is set to YES, a side panel will be generated 
+# containing a tree-like index structure (just like the one that 
+# is generated for HTML Help). For this to work a browser that supports 
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). 
+# Windows users are probably better off using the HTML help feature. 
+# Since the tree basically has the same information as the tab index you 
+# could consider setting DISABLE_INDEX to YES when enabling this option.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values 
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML 
+# documentation. Note that a value of 0 will completely suppress the enum 
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
+# used to set the initial width (in pixels) of the frame in which the tree 
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open 
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included 
+# as images in the HTML documentation. The default is 10. Note that 
+# when you change the font size after a successful doxygen run you need 
+# to manually remove any form_*.png images from the HTML output directory 
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images 
+# generated for formulas are transparent PNGs. Transparent PNGs are 
+# not supported properly for IE 6.0, but are supported on all modern browsers. 
+# Note that when changing this option you need to delete any form_*.png files 
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax 
+# (see http://www.mathjax.org) which uses client side Javascript for the 
+# rendering instead of using prerendered bitmaps. Use this if you do not 
+# have LaTeX installed or if you want formulas to look prettier in the HTML 
+# output. When enabled you may also need to install MathJax separately and 
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = YES
+
+# When MathJax is enabled you can set the default output format to be used for 
+# the MathJax output. Supported types are HTML-CSS, NativeMML (i.e. MathML) and 
+# SVG. The default value is HTML-CSS, which is slower, but has the best 
+# compatibility.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the 
+# HTML output directory using the MATHJAX_RELPATH option. The destination 
+# directory should contain the MathJax.js script. For instance, if the mathjax 
+# directory is located at the same level as the HTML output directory, then 
+# MATHJAX_RELPATH should be ../mathjax. The default value points to 
+# the MathJax Content Delivery Network so you can quickly see the result without 
+# installing MathJax.  However, it is strongly recommended to install a local 
+# copy of MathJax from http://www.mathjax.org before deployment.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension 
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS     = 
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript 
+# pieces of code that will be used on startup of the MathJax code.
+
+MATHJAX_CODEFILE       = 
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box 
+# for the HTML output. The underlying search engine uses javascript 
+# and DHTML and should work on any modern browser. Note that when using 
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets 
+# (GENERATE_DOCSET) there is already a search function so this one should 
+# typically be disabled. For large projects the javascript based search engine 
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be 
+# implemented using a web server instead of a web client using Javascript. 
+# There are two flavours of web server based search depending on the 
+# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for 
+# searching and an index file used by the script. When EXTERNAL_SEARCH is 
+# enabled the indexing and searching needs to be provided by external tools. 
+# See the manual for details.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH is enabled doxygen will no longer generate the PHP 
+# script for searching. Instead the search results are written to an XML file 
+# which needs to be processed by an external indexer. Doxygen will invoke an 
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain 
+# the search results. Doxygen ships with an example indexer (doxyindexer) and 
+# search engine (doxysearch.cgi) which are based on the open source search 
+# engine library Xapian. See the manual for configuration details.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server 
+# which will return the search results when EXTERNAL_SEARCH is enabled. 
+# Doxygen ships with an example search engine (doxysearch) which is based on 
+# the open source search engine library Xapian. See the manual for configuration 
+# details.
+
+SEARCHENGINE_URL       = 
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed 
+# search data is written to a file for indexing by an external tool. With the 
+# SEARCHDATA_FILE tag the name of this file can be specified.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH AND EXTERNAL_SEARCH are both enabled the 
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is 
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple 
+# projects and redirect the results back to the right project.
+
+EXTERNAL_SEARCH_ID     = 
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen 
+# projects other than the one defined by this configuration file, but that are 
+# all added to the same external search index. Each project needs to have a 
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id 
+# to a relative location where the documentation can be found. 
+# The format is: EXTRA_SEARCH_MAPPINGS = id1=loc1 id2=loc2 ...
+
+EXTRA_SEARCH_MAPPINGS  = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
+# invoked. If left blank `latex' will be used as the default command name. 
+# Note that when enabling USE_PDFLATEX this option is only used for 
+# generating bitmaps for formulas in the HTML output, but not in the 
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
+# generate index for LaTeX. If left blank `makeindex' will be used as the 
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
+# LaTeX documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used 
+# by the printer. Possible values are: a4, letter, legal and 
+# executive. If left blank a4 will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX 
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         = 
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
+# the generated latex document. The header should contain everything until 
+# the first chapter. If it is left blank doxygen will generate a 
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           = 
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for 
+# the generated latex document. The footer should contain everything after 
+# the last chapter. If it is left blank doxygen will generate a 
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           = 
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images 
+# or other source files which should be copied to the LaTeX output directory. 
+# Note that the files will be copied as-is; there are no commands or markers 
+# available.
+
+LATEX_EXTRA_FILES      = 
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
+# plain latex in the generated Makefile. Set this option to YES to get a 
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode 
+# command to the generated LaTeX files. This will instruct LaTeX to keep 
+# running if errors occur, instead of asking the user for help. 
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
+# include the index chapters (such as File Index, Compound Index, etc.) 
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include 
+# source code with syntax highlighting in the LaTeX output. 
+# Note that which sources are shown also depends on other settings 
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the 
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See 
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output. 
+# The RTF output is optimized for Word 97 and may not look very pretty with 
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
+# RTF documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
+# will contain hyperlink fields. The RTF file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using WORD or other 
+# programs which support those fields. 
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's 
+# config file, i.e. a series of assignments. You only have to provide 
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    = 
+
+# Set optional variables used in the generation of an rtf document. 
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to 
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
+# then it will generate one additional man file for each entity 
+# documented in the real man page(s). These additional files 
+# only source the real man page, but without them the man command 
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will 
+# generate an XML file that captures the structure of 
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_SCHEMA             = 
+
+# The XML_DTD tag can be used to specify an XML DTD, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_DTD                = 
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will 
+# dump the program listings (including syntax highlighting 
+# and cross-referencing information) to the XML output. Note that 
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES Doxygen will generate DOCBOOK files 
+# that can be used to generate PDF.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the DOCBOOK pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in 
+# front of it. If left blank docbook will be used as the default path.
+
+DOCBOOK_OUTPUT         = docbook
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
+# generate an AutoGen Definitions (see autogen.sf.net) file 
+# that captures the structure of the code including all 
+# documentation. Note that this feature is still experimental 
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
+# generate a Perl module file that captures the structure of 
+# the code including all documentation. Note that this 
+# feature is still experimental and incomplete at the 
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
+# nicely formatted so it can be parsed by a human reader.  This is useful 
+# if you want to understand what is going on.  On the other hand, if this 
+# tag is set to NO the size of the Perl module output will be much smaller 
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file 
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
+# This is useful so different doxyrules.make files included by the same 
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
+# evaluate all C-preprocessor directives found in the sources and include 
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
+# names in the source code. If set to NO (the default) only conditional 
+# compilation will be performed. Macro expansion can be done in a controlled 
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
+# then the macro expansion is limited to the macros specified with the 
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files 
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that 
+# contain include files that are not input files but should be processed by 
+# the preprocessor.
+
+INCLUDE_PATH           = 
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
+# patterns (like *.h and *.hpp) to filter out the header-files in the 
+# directories. If left blank, the patterns specified with FILE_PATTERNS will 
+# be used.
+
+INCLUDE_FILE_PATTERNS  = 
+
+# The PREDEFINED tag can be used to specify one or more macro names that 
+# are defined before the preprocessor is started (similar to the -D option of 
+# gcc). The argument of the tag is a list of macros of the form: name 
+# or name=definition (no spaces). If the definition and the = are 
+# omitted =1 is assumed. To prevent a macro definition from being 
+# undefined via #undef or recursively expanded use the := operator 
+# instead of the = operator.
+
+PREDEFINED             = 
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
+# this tag can be used to specify a list of macro names that should be expanded. 
+# The macro definition that is found in the sources will be used. 
+# Use the PREDEFINED tag if you want to use a different macro definition that 
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      = 
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
+# doxygen's preprocessor will remove all references to function-like macros 
+# that are alone on a line, have an all uppercase name, and do not end with a 
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. For each 
+# tag file the location of the external documentation should be added. The 
+# format of a tag file without this location is as follows: 
+#   TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: 
+#   TAGFILES = file1=loc1 "file2 = loc2" ... 
+# where "loc1" and "loc2" can be relative or absolute paths 
+# or URLs. Note that each tag file must have a unique name (where the name does 
+# NOT include the path). If a tag file is not located in the directory in which 
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES               = 
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       = 
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
+# in the class index. If set to NO only the inherited external classes 
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
+# in the modules index. If set to NO, only the current project's groups will 
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed 
+# in the related pages index. If set to NO, only the current project's 
+# pages will be listed.
+
+EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script 
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base 
+# or super classes. Setting the tag to NO turns the diagrams off. Note that 
+# this option also works with HAVE_DOT disabled, but it is recommended to 
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = NO
+
+# You can define message sequence charts within doxygen comments using the \msc 
+# command. Doxygen will then run the mscgen tool (see 
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the 
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where 
+# the mscgen tool resides. If left empty the tool is assumed to be found in the 
+# default search path.
+
+MSCGEN_PATH            = 
+
+# If set to YES, the inheritance and collaboration graphs will hide 
+# inheritance and usage relations if the target is undocumented 
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
+# available from the path. This tool is part of Graphviz, a graph visualization 
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is 
+# allowed to run in parallel. When set to 0 (the default) doxygen will 
+# base this on the number of processors available in the system. You can set it 
+# explicitly to a value larger than 0 to get control over the balance 
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will use the Helvetica font for all dot files that 
+# doxygen generates. When you want a differently looking font you can specify 
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find 
+# the font, which can be done by putting it in a standard location or by setting 
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the 
+# directory containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. 
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the Helvetica font. 
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to 
+# set the path where dot can find it.
+
+DOT_FONTPATH           = 
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect inheritance relations. Setting this tag to YES will force the 
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect implementation dependencies (inheritance, containment, and 
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
+# collaboration diagrams in a style similar to the OMG's Unified Modeling 
+# Language.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside 
+# the class node. If there are many fields or methods and many nodes the 
+# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS 
+# threshold limits the number of items for each type to make the size more 
+# manageable. Set this to 0 for no limit. Note that the threshold may be 
+# exceeded by 50% before the limit is enforced.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If set to YES, the inheritance and collaboration graphs will show the 
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
+# tags are set to YES then doxygen will generate a graph for each documented 
+# file showing the direct and indirect include dependencies of the file with 
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
+# documented header file showing the documented files that directly or 
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then 
+# doxygen will generate a call dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable call graphs 
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then 
+# doxygen will generate a caller dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable caller 
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES 
+# then doxygen will show the dependencies a directory has on other directories 
+# in a graphical way. The dependency relations are determined by the #include 
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
+# generated by dot. Possible values are svg, png, jpg, or gif. 
+# If left blank png will be used. If you choose svg you need to set 
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files 
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to 
+# enable generation of interactive SVG images that allow zooming and panning. 
+# Note that this requires a modern browser other than Internet Explorer. 
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you 
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files 
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG        = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be 
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               = 
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that 
+# contain dot files that are included in the documentation (see the 
+# \dotfile command).
+
+DOTFILE_DIRS           = 
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that 
+# contain msc files that are included in the documentation (see the 
+# \mscfile command).
+
+MSCFILE_DIRS           = 
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of 
+# nodes that will be shown in the graph. If the number of nodes in a graph 
+# becomes larger than this value, doxygen will truncate the graph, which is 
+# visualized by representing a node as a red box. Note that if the 
+# number of direct children of the root node in a graph is already larger than 
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note 
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
+# graphs generated by dot. A depth value of 3 means that only nodes reachable 
+# from the root by following a path via at most 3 edges will be shown. Nodes 
+# that lie further from the root node will be omitted. Note that setting this 
+# option to 1 or 2 may greatly reduce the computation time needed for large 
+# code bases. Also note that the size of a graph can be further restricted by 
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 
+# background. This is disabled by default, because dot on Windows does not 
+# seem to support this out of the box. Warning: Depending on the platform used, 
+# enabling this option may lead to badly anti-aliased labels on the edges of 
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output 
+# files in one run (i.e. multiple -o and -T options on the command line). This 
+# makes dot run faster, but since only newer versions of dot (>1.8.10) 
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
+# generate a legend page explaining the meaning of the various boxes and 
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
+# remove the intermediate dot files that are used to generate 
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..5c25781
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,298 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+cmake_minimum_required(VERSION 2.6)
+
+#User toggle-able options that can be changed on the command line with -D
+option( BUILD_RUNTIME "Build the BLAS runtime library" ON )
+option( BUILD_TEST "Build the library testing suite (dependency on google test, Boost, and ACML)" ON )
+option( BUILD_PERFORMANCE "Copy the performance scripts that can measure and graph performance" OFF )
+option( BUILD_SAMPLE "Build the sample programs" OFF )
+option( BUILD_CLIENT "Build a command line clBLAS client program with a variety of configurable parameters (dependency on Boost)" OFF )
+option( BUILD_KTEST "A command line tool for testing single clBLAS kernel" ON )
+
+# By default test-correctness is linked and tested against ACML library.
+# However, test-correctness can instead use NETLIB as a reference library
+set(CORR_TEST_WITH_ACML ON CACHE BOOL "Use ACML library in correctness tests")
+
+# uncomment these to print compiler invocation lines for nmake files
+# set( CMAKE_START_TEMP_FILE "" )
+# set( CMAKE_END_TEMP_FILE "" )
+# set( CMAKE_VERBOSE_MAKEFILE 1 )
+
+# If we are on linux, and we wish to link with the netlib BLAS implementation, we need to have a valid fortran compiler
+if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
+  project(clBLAS Fortran C CXX )
+else( )
+  project(clBLAS C CXX)
+endif( )
+
+# Define a version for the code
+set( clBLAS_VERSION_MAJOR 2 )
+set( clBLAS_VERSION_MINOR 1 )
+set( clBLAS_VERSION_PATCH 0 )
+set( clBLAS_VERSION "${clBLAS_VERSION_MAJOR}.${clBLAS_VERSION_MINOR}.${clBLAS_VERSION_PATCH}")
+
+# Increment this if we break backward compatibility.
+set(clBLAS_SOVERSION 1)
+
+# We have custom written Find* modules now in the root source directory
+set( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR} )
+
+# On windows, it's convenient to change the default install prefix such that it does NOT point to 'program files' (permissions problems)
+# Need to check out CMAKE_RUNTIME_OUTPUT_DIRECTORY variable, and see if that eliminates the need to modify install path
+if( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
+	set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE )
+endif( )
+
+set( ACMLROOT $ENV{ACMLROOT} CACHE PATH "AMD ACML root path")
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Debug CACHE STRING
+      "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel."
+      FORCE)
+endif()
+
+if(TARGET_PLATFORM EQUAL 32 OR TARGET_PLATFORM EQUAL 64)
+    set(TARGET_PLATFORM ${TARGET_PLATFORM} CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+else()
+    if(CMAKE_SIZEOF_VOID_P MATCHES 8)
+        set(TARGET_PLATFORM "64" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+    else()
+        set(TARGET_PLATFORM "32" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+    endif()
+endif()
+
+if( MSVC_IDE )
+    set_property( GLOBAL PROPERTY USE_FOLDERS TRUE )
+endif( )
+
+message(STATUS "Target platform: ${TARGET_PLATFORM}-bit")
+if(TARGET_PLATFORM EQUAL 32)
+    set(_arch "x86")  # plain variable: a bare INTERNAL keyword would be appended to the value as a list element
+    set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS FALSE)
+else()
+    set(_arch "x86_64")  # plain variable, see the 32-bit branch
+    set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS TRUE)
+endif()
+
+# add the math library for Linux
+if( UNIX ) 
+    set(MATH_LIBRARY "m")
+endif()
+
+# Find the BLAS library
+# TODO: maybe this could be written using the FindBLAS module in the future
+if( BUILD_TEST )
+	if(NOT CORR_TEST_WITH_ACML)
+		if( WIN32 )
+			find_package( Netlib COMPONENTS BLAS REQUIRED )
+		else( )
+			if( NOT "$ENV{REFBLAS_ROOT}" STREQUAL "" )  # compare as a string; if( <path> ) would be evaluated as a variable name
+				set( REFBLAS_ROOT $ENV{REFBLAS_ROOT} CACHE PATH "NetLib BLAS root path")
+			else( )
+				message(FATAL_ERROR "Cannot find reference BLAS, please set REFBLAS_ROOT environment variable")
+			endif( )
+			
+			# Find reference BLAS implementation
+			include( ${REFBLAS_ROOT}/package/cmake/exportBLAS.cmake )
+		endif( )
+	else( )
+		# Find ACML BLAS implementation
+		# platform dependent ACML subdirectory
+		if (WIN32)
+			set(ACML_SUBDIR ifort${TARGET_PLATFORM}_mp)
+		else()
+		   set(ACML_SUBDIR gfortran${TARGET_PLATFORM}_mp)
+		endif()
+
+		find_path(ACML_INCLUDE_DIRS acml.h
+			HINTS
+				$ENV{ACMLROOT}/include
+				${ACMLROOT}/include
+				${ACMLROOT}/${ACML_SUBDIR}/include
+		)
+
+		if( ACML_INCLUDE_DIRS )
+		else()
+			message(WARNING "Cannot find acml.h")
+		endif()
+		
+		if( UNIX )
+			find_library(ACML_LIBRARIES acml acml_mp
+				HINTS
+					$ENV{ACMLROOT}/lib
+					${ACMLROOT}/lib
+					${ACMLROOT}/${ACML_SUBDIR}/lib
+			)
+			find_library(_acml_mv_library acml_mv
+				HINTS
+					$ENV{ACMLROOT}/lib
+					${ACMLROOT}/lib
+					${ACMLROOT}/${ACML_SUBDIR}/lib
+			)
+			mark_as_advanced(_acml_mv_library)
+		endif( )
+		
+		if(WIN32)
+			find_library(ACML_LIBRARIES libacml_dll libacml_mp_dll
+				HINTS
+					$ENV{ACMLROOT}/lib
+					${ACMLROOT}/lib
+					${ACMLROOT}/${ACML_SUBDIR}/lib
+			)
+		endif( )
+		
+		if( NOT ACML_LIBRARIES )
+			message(WARNING "Cannot find libacml")
+		endif( )
+
+		if(ACML_INCLUDE_DIRS AND ACML_LIBRARIES)
+			if(_acml_mv_library)  # optional acml_mv library, only searched for on UNIX
+				list(APPEND ACML_LIBRARIES ${_acml_mv_library})
+			endif()
+			message(STATUS "Found ACML: ${ACML_LIBRARIES}")
+			set(ACML_FOUND TRUE)  # single boolean value; extra arguments to set() would become list elements
+		endif()
+		mark_as_advanced(ACML_FOUND ACML_INCLUDE_DIRS ACML_LIBRARIES)
+
+	endif( )
+endif( )
+
+# This will define OPENCL_FOUND
+find_package( OpenCL )
+
+# Find Google Test package
+find_package( GTest )
+
+# Find Boost on the system, and configure the type of boost build we want
+set( Boost_USE_MULTITHREADED ON )
+set( Boost_USE_STATIC_LIBS   ON )
+set( Boost_DETAILED_FAILURE_MSG   ON )
+set( Boost_DEBUG ON )
+set( Boost_ADDITIONAL_VERSIONS "1.44.0" "1.44" "1.47.0" "1.47" )
+
+find_package( Boost 1.33.0 COMPONENTS program_options )
+message(STATUS "Boost_PROGRAM_OPTIONS_LIBRARY: ${Boost_PROGRAM_OPTIONS_LIBRARY}")
+
+
+if( NOT Boost_FOUND )
+	message( STATUS "The clBLAS ktest requires boost to be installed" )
+	set( BUILD_KTEST OFF )
+	message( STATUS "The clBLAS client requires boost to be installed" )
+	set( BUILD_CLIENT OFF )
+endif()
+
+# Turn on maximum compiler verbosity
+if(CMAKE_COMPILER_IS_GNUCXX)
+    add_definitions(-pedantic -Wall -Wextra
+        -D_POSIX_C_SOURCE=199309L -D_XOPEN_SOURCE=500
+    )
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 -Wstrict-prototypes" CACHE STRING
+        "Default CFLAGS" FORCE)
+    # Don't use -rpath.
+    set(CMAKE_SKIP_RPATH ON CACHE BOOL "Skip RPATH" FORCE)
+
+    set(CMAKE_C_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_Fortran_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_Fortran_FLAGS}")
+
+    if(TARGET_PLATFORM EQUAL 32)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin")
+    endif()
+endif()
+
+if (WIN32)
+    add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+endif()
+
+#TODO:  We should remove this pre-processor define for our 1.8 build; this means removing our deprecated image functions such as calls clCreateImage2D( )
+add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS )
+
+configure_file( "${PROJECT_SOURCE_DIR}/version.h.in" "${PROJECT_BINARY_DIR}/include/version.h" )
+
+# configure a header file to pass the CMake version settings to the source, and package the header files in the output archive
+install( FILES 
+			"clBLAS.h" 
+			"clAmdBlas.h"
+			"clAmdBlas.version.h"
+			"clBLAS-complex.h"
+			"${PROJECT_BINARY_DIR}/include/version.h"
+		DESTINATION 
+			"./include" )
+
+
+if( BUILD_CLIENT AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/client")
+	add_subdirectory( client )
+endif( )
+
+if( BUILD_PERFORMANCE AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/scripts/perf" )
+	add_subdirectory( scripts/perf )
+endif( )
+
+if( BUILD_RUNTIME AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library" )
+	add_subdirectory( library )
+	add_subdirectory( library/tools/tune )
+	if( BUILD_KTEST )
+		add_subdirectory( library/tools/ktest )
+	endif( )
+endif()
+
+if( BUILD_SAMPLE AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/samples" )
+	add_subdirectory( samples )
+endif( )
+
+# The build server is not supposed to build or package any of the tests; build server script will define this on the command line with 
+# cmake -G "Visual Studio 10 Win64" -D BUILDSERVER:BOOL=ON ../..
+if( BUILD_TEST )
+	if( IS_DIRECTORY "${PROJECT_SOURCE_DIR}/tests" )
+		add_subdirectory(tests)
+	endif( )
+
+	# These tests #include <getopts.h>, which is not windows compliant
+	if (NOT WIN32 AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library" )
+		add_subdirectory( library/blas/gens/tests )
+		add_subdirectory( library/blas/gens/legacy/tests )
+		add_subdirectory( library/common/tests )
+	endif( )
+endif( )
+
+# The following code sets the variables that control how CPack generates our binary and source packages
+if( WIN32 )
+	set( CPACK_SOURCE_GENERATOR "ZIP" )
+	set( CPACK_GENERATOR "ZIP" )
+else( )
+	set( CPACK_SOURCE_GENERATOR "TGZ" )
+	set( CPACK_GENERATOR "TGZ" )
+endif( )
+
+if( TARGET_PLATFORM EQUAL 64 )
+	set( CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${clBLAS_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-x64")
+else( )
+	set( CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${clBLAS_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-x32")
+endif( )
+
+set( CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${clBLAS_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-Source")
+
+set( CPACK_PACKAGE_VERSION_MAJOR ${clBLAS_VERSION_MAJOR} )
+set( CPACK_PACKAGE_VERSION_MINOR ${clBLAS_VERSION_MINOR} )
+set( CPACK_PACKAGE_VERSION_PATCH ${clBLAS_VERSION_PATCH} )
+set( CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenCL implementation of a BLAS library")
+set( CPACK_PACKAGE_VENDOR "Neutral")
+set( CPACK_SOURCE_IGNORE_FILES "/\\\\.hg/;/\\\\.svn/;/\\\\.git/" )
+
+# Define all variables that influence CPack before including CPack, such as install targets
+include( CPack )
diff --git a/src/FindNetlib.cmake b/src/FindNetlib.cmake
new file mode 100644
index 0000000..9e26103
--- /dev/null
+++ b/src/FindNetlib.cmake
@@ -0,0 +1,109 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# Locate a Netlib implementation.
+# Pre-built binaries for windows can be found at http://icl.cs.utk.edu/lapack-for-windows/lapack/
+#
+# Defines the following variables:
+#
+#   Netlib_FOUND - Found the Netlib library
+#
+# Also defines the library variables below as normal
+# variables.  These contain debug/optimized keywords when
+# a debugging library is found.
+#
+#   Netlib_LIBRARIES - libNetlib
+#
+# Accepts the following variables as input:
+#
+#   Netlib_ROOT - (as a CMake or environment variable)
+#                The root directory of where Netlib libraries are found
+#
+#   FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether FindNetlib should search for
+#                              64bit or 32bit libs
+#
+#   Netlib_COMPILERS - Prioritized list of compiler flavors that this find package should search for when
+#                             looking for libraries.  The user could have multiple flavors of Netlib installed
+#                             and setting this before calling FindPackage will alter order searched
+#-----------------------
+# Example Usage:
+#
+#    find_package(Netlib REQUIRED)
+#    include_directories(${Netlib_INCLUDE_DIRS})
+#
+#    add_executable(foo foo.cc)
+#    target_link_libraries(foo ${Netlib_LIBRARIES})
+#
+#-----------------------
+
+#TODO:  Extend this to use Netlib_FIND_COMPONENTS, Netlib_FIND_REQUIRED, Netlib_FIND_QUIETLY
+include( FindPackageHandleStandardArgs )
+
+# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else
+get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
+
+# This is a prioritized list of Netlib compiler versions that this FindModule looks for
+if( NOT DEFINED Netlib_COMPILERS )
+	set( Netlib_COMPILERS minGW intel )
+endif( )
+
+# Debug print statements
+#message( "Netlib_LIBRARY_PATH_SUFFIXES: ${Netlib_LIBRARY_PATH_SUFFIXES}" )
+#message( "ENV{Netlib_ROOT}: $ENV{Netlib_ROOT}" )
+#message( "Netlib_FIND_COMPONENTS: ${Netlib_FIND_COMPONENTS}" )
+#message( "Netlib_FIND_REQUIRED: ${Netlib_FIND_REQUIRED}" )
+
+# If the user does not set which components to find, then default to all components
+if( NOT Netlib_FIND_COMPONENTS )
+	set( Netlib_FIND_COMPONENTS BLAS )
+endif( )
+
+# The library name available from Netlib has different names for 64bit and 32bit libs
+if( LIB64 )
+	set( Netlib_BLAS_LIBNAME libblas )
+#	set( Netlib_BLAS_LIBNAME BLAS )  Even though the download is named BLAS, the linker expects the .dll to be called libblas.dll
+else( )
+	set( Netlib_BLAS_LIBNAME libblas )
+endif( )
+
+list( FIND Netlib_FIND_COMPONENTS BLAS contains_BLAS )
+if( NOT contains_BLAS EQUAL -1 )
+	# Find and set the location of main Netlib lib file
+	find_library( Netlib_BLAS_LIBRARY
+		NAMES ${Netlib_BLAS_LIBNAME}
+		HINTS
+			${Netlib_ROOT}
+			ENV Netlib_ROOT
+		PATHS
+			/usr/lib
+			/usr/local/lib
+		DOC "Netlib dynamic library path"
+		PATH_SUFFIXES lib
+	)
+	mark_as_advanced( Netlib_BLAS_LIBRARY )
+
+	FIND_PACKAGE_HANDLE_STANDARD_ARGS( NETLIB DEFAULT_MSG Netlib_BLAS_LIBRARY )
+endif( )
+
+if( NETLIB_FOUND )
+	list( APPEND Netlib_LIBRARIES ${Netlib_BLAS_LIBRARY} )
+else( )
+	if( NOT Netlib_FIND_QUIETLY )
+		message( WARNING "FindNetlib could not find the Netlib library" )
+		message( STATUS "Did you remember to set the Netlib_ROOT environment variable?" )
+	endif( )
+endif()
diff --git a/src/FindOpenCL.cmake b/src/FindOpenCL.cmake
new file mode 100644
index 0000000..1cdc43d
--- /dev/null
+++ b/src/FindOpenCL.cmake
@@ -0,0 +1,96 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# Locate an OpenCL implementation.
+# Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/)
+#
+# Defines the following variables:
+#
+#   OPENCL_FOUND - Found the OPENCL framework
+#   OPENCL_INCLUDE_DIRS - Include directories
+#
+# Also defines the library variables below as normal
+# variables.  These contain debug/optimized keywords when
+# a debugging library is found.
+#
+#   OPENCL_LIBRARIES - libopencl
+#
+# Accepts the following variables as input:
+#
+#   OPENCL_ROOT - (as a CMake or environment variable)
+#                The root directory of the OpenCL implementation found
+#
+#   FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findOpenCL should search for
+#                              64bit or 32bit libs
+#-----------------------
+# Example Usage:
+#
+#    find_package(OPENCL REQUIRED)
+#    include_directories(${OPENCL_INCLUDE_DIRS})
+#
+#    add_executable(foo foo.cc)
+#    target_link_libraries(foo ${OPENCL_LIBRARIES})
+#
+#-----------------------
+if( DEFINED ENV{AMDAPPSDKROOT} )
+	set( OPENCL_ROOT $ENV{AMDAPPSDKROOT} CACHE PATH "Environment variable defining the root of OPENCL implementation" )
+else( )
+	set( OPENCL_ROOT "/usr/lib" CACHE PATH "Environment variable defining the root of OPENCL implementation" )
+endif( )
+
+find_path(OPENCL_INCLUDE_DIRS
+	NAMES OpenCL/cl.h CL/cl.h
+    HINTS
+		${OPENCL_ROOT}/include
+		$ENV{AMDAPPSDKROOT}/include  # expand the variable; "ENV <name>" takes a bare variable name, not a path
+	PATHS
+		/usr/include
+		/usr/local/include
+	DOC "OpenCL header file path"
+)
+mark_as_advanced( OPENCL_INCLUDE_DIRS )
+
+# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else
+get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
+
+if( LIB64 )
+	find_library( OPENCL_LIBRARIES
+		NAMES OpenCL
+		HINTS
+            ${OPENCL_ROOT}/lib
+            $ENV{AMDAPPSDKROOT}/lib  # expand the variable; "ENV <name>" takes a bare variable name, not a path
+		DOC "OpenCL dynamic library path"
+		PATH_SUFFIXES x86_64 x64
+	)
+else( )
+	find_library( OPENCL_LIBRARIES
+		NAMES OpenCL
+		HINTS
+            ${OPENCL_ROOT}/lib
+            $ENV{AMDAPPSDKROOT}/lib  # expand the variable; "ENV <name>" takes a bare variable name, not a path
+		DOC "OpenCL dynamic library path"
+		PATH_SUFFIXES x86
+	)
+endif( )
+mark_as_advanced( OPENCL_LIBRARIES )
+
+include( FindPackageHandleStandardArgs )
+FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )
+
+if( NOT OPENCL_FOUND )
+	message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
+endif()
diff --git a/src/clAmdBlas.h b/src/clAmdBlas.h
new file mode 100644
index 0000000..1921473
--- /dev/null
+++ b/src/clAmdBlas.h
@@ -0,0 +1,12242 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef CLAMDBLAS_H_
+#define CLAMDBLAS_H_
+
+/*! @file clAmdBlas.h
+ * \note clAmdBlas.h is a deprecated header file.  
+ * This header is provided to help projects that were written with the older clAmdBlas codebase, to help them 
+ * port to the new API at their own schedule.  It will not be maintained or updated, and will be removed after 
+ * a reasonable amount of time has passed.  All new code should be written against clBLAS.h.  
+ * Older projects should migrate to the new header at their earliest convenience.
+ */
+
+/**
+ * @mainpage OpenCL BLAS
+ *
+ */
+
+#include "clBLAS.h"
+
+/* The following header defines a fixed version number as this header is deprecated and won't be updated */
+#include "clAmdBlas.version.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup OVERVIEW Overview
+ *
+ * This library provides an implementation of the Basic Linear Algebra Subprograms levels 1, 2 and 3,
+ * using OpenCL and optimized for AMD GPU hardware. It provides BLAS-1 functions
+ * SWAP, SCAL, COPY, AXPY, DOT, DOTU, DOTC, ROTG, ROTMG, ROT, ROTM, iAMAX, ASUM and NRM2,
+ * BLAS-2 functions GEMV, SYMV, TRMV, TRSV, HEMV, SYR, SYR2, HER, HER2, GER, GERU, GERC,
+ * TPMV, SPMV, HPMV, TPSV, SPR, SPR2, HPR, HPR2, GBMV, TBMV, SBMV, HBMV and TBSV
+ * and BLAS-3 functions GEMM, SYMM, TRMM, TRSM, HEMM, HERK, HER2K, SYRK and SYR2K.
+ *
+ * This library’s primary goal is to assist the end user to enqueue OpenCL
+ * kernels to process BLAS functions in an OpenCL-efficient manner, while
+ * keeping interfaces familiar to users who know how to use BLAS. All
+ * functions accept matrices through buffer objects.
+ *
+ * @section deprecated
+ * This library provided support for the creation of scratch images to achieve better performance
+ * on older <a href="http://developer.amd.com/gpu/AMDAPPSDK/Pages/default.aspx">AMD APP SDK's</a>. 
+ * However, memory buffers now give the same performance as image objects in the current SDK's. 
+ * Scratch image buffers are being deprecated and users are advised not to use scratch images in
+ * new applications.
+ */
+
+/**
+ * @defgroup TYPES clAmdBlas types
+ */
+/*@{*/
+
+
+/*	Since there is no method to inherit or extend an enum, clAmdBlasOrder is now a 
+	set of macro's and typedefs that 'behave' like an enum.  The advantage is there
+	is no need to cast between clblasOrder and clAmdBlasOrder
+	*/
+#define clAmdBlasRowMajor clblasRowMajor
+#define clAmdBlasColumnMajor clblasColumnMajor
+
+typedef enum clblasOrder_ clAmdBlasOrder;
+
+/*	Since there is no method to inherit or extend an enum, clAmdBlasTranspose is now a 
+	set of macro's and typedefs that 'behave' like an enum.  The advantage is there
+	is no need to cast between clblasTranspose and clAmdBlasTranspose
+	*/
+#define clAmdBlasNoTrans clblasNoTrans
+#define clAmdBlasTrans clblasTrans
+#define clAmdBlasConjTrans clblasConjTrans
+
+typedef enum clblasTranspose_ clAmdBlasTranspose;
+
+/*	Since there is no method to inherit or extend an enum, clAmdBlasUplo is now a 
+	set of macro's and typedefs that 'behave' like an enum.  The advantage is there
+	is no need to cast between clblasUplo and clAmdBlasUplo
+	*/
+#define clAmdBlasUpper clblasUpper
+#define clAmdBlasLower clblasLower
+
+typedef enum clblasUplo_ clAmdBlasUplo;
+
+/*	Since there is no method to inherit or extend an enum, clAmdBlasDiag is now a 
+	set of macro's and typedefs that 'behave' like an enum.  The advantage is there
+	is no need to cast between clblasDiag and clAmdBlasDiag
+	*/
+#define clAmdBlasUnit clblasUnit
+#define clAmdBlasNonUnit clblasNonUnit
+
+typedef enum clblasDiag_ clAmdBlasDiag;
+
+/*	Since there is no method to inherit or extend an enum, clAmdBlasSide is now a 
+	set of macro's and typedefs that 'behave' like an enum.  The advantage is there
+	is no need to cast between clblasSide and clAmdBlasSide
+	*/
+#define clAmdBlasLeft clblasLeft
+#define clAmdBlasRight clblasRight
+
+typedef enum clblasSide_ clAmdBlasSide;
+
+/*	Since there is no method to inherit or extend an enum, clAmdBlasStatus is now a 
+	set of macro's and typedefs that 'behave' like an enum.  The advantage is there
+	is no need to cast between clblasStatus and clAmdBlasStatus
+	*/
+#define clAmdBlasSuccess clblasSuccess
+#define clAmdBlasInvalidValue clblasInvalidValue
+#define clAmdBlasInvalidCommandQueue clblasInvalidCommandQueue
+#define clAmdBlasInvalidContext clblasInvalidContext
+#define clAmdBlasInvalidMemObject clblasInvalidMemObject
+#define clAmdBlasInvalidDevice clblasInvalidDevice
+#define clAmdBlasInvalidEventWaitList clblasInvalidEventWaitList
+#define clAmdBlasOutOfResources clblasOutOfResources
+#define clAmdBlasOutOfHostMemory clblasOutOfHostMemory
+#define clAmdBlasInvalidOperation clblasInvalidOperation
+#define clAmdBlasCompilerNotAvailable clblasCompilerNotAvailable
+#define clAmdBlasBuildProgramFailure clblasBuildProgramFailure
+
+#define clAmdBlasNotImplemented clblasNotImplemented
+#define clAmdBlasNotInitialized clblasNotInitialized
+#define clAmdBlasInvalidMatA clblasInvalidMatA
+#define clAmdBlasInvalidMatB clblasInvalidMatB
+#define clAmdBlasInvalidMatC clblasInvalidMatC
+#define clAmdBlasInvalidVecX clblasInvalidVecX
+#define clAmdBlasInvalidVecY clblasInvalidVecY
+#define clAmdBlasInvalidDim clblasInvalidDim
+#define clAmdBlasInvalidLeadDimA clblasInvalidLeadDimA
+#define clAmdBlasInvalidLeadDimB clblasInvalidLeadDimB
+#define clAmdBlasInvalidLeadDimC clblasInvalidLeadDimC
+#define clAmdBlasInvalidIncX clblasInvalidIncX
+#define clAmdBlasInvalidIncY clblasInvalidIncY
+#define clAmdBlasInsufficientMemMatA clblasInsufficientMemMatA
+#define clAmdBlasInsufficientMemMatB clblasInsufficientMemMatB
+#define clAmdBlasInsufficientMemMatC clblasInsufficientMemMatC
+#define clAmdBlasInsufficientMemVecX clblasInsufficientMemVecX
+#define clAmdBlasInsufficientMemVecY clblasInsufficientMemVecY
+
+typedef enum clblasStatus_ clAmdBlasStatus;
+
+
+/*@}*/
+
+/**
+ * @defgroup VERSION Version information
+ */
+/*@{*/
+
+/**
+ * @brief Get the clAmdBlas library version info.
+ *
+ * @param[out] major        Location to store library's major version.
+ * @param[out] minor        Location to store library's minor version.
+ * @param[out] patch        Location to store library's patch version.
+ *
+ * @returns always \b clAmdBlasSuccess.
+ *
+ * @ingroup VERSION
+ */
+static __inline clAmdBlasStatus  /* static: a header-defined inline needs internal linkage under C99 */
+clAmdBlasGetVersion( cl_uint* major, cl_uint* minor, cl_uint* patch )
+{
+	return clblasGetVersion( major, minor, patch );
+}
+
+/*@}*/
+
+/**
+ * @defgroup INIT Initialize library
+ */
+/*@{*/
+
+/**
+ * @brief Initialize the clAmdBlas library.
+ *
+ * Must be called before any other clAmdBlas API function is invoked.
+ * @note This function is not thread-safe.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasOutOfHostMemory if there is not enough of memory to allocate
+ *     library's internal structures;
+ *   - \b clAmdBlasOutOfResources in case of requested resources scarcity.
+ *
+ * @ingroup INIT
+ */
+static __inline clAmdBlasStatus  /* static: a header-defined inline needs internal linkage under C99 */
+clAmdBlasSetup( void )  /* (void) makes this a real prototype in C */
+{
+	return clblasSetup( );
+}
+
+/**
+ * @brief Finalize the usage of the clAmdBlas library.
+ *
+ * Frees all memory allocated for different computational kernel and other
+ * internal data.
+ * @note This function is not thread-safe.
+ *
+ * @ingroup INIT
+ */
+static __inline void  /* static: a header-defined inline needs internal linkage under C99 */
+clAmdBlasTeardown( void )  /* (void) makes this a real prototype in C */
+{
+	clblasTeardown( );
+}
+
+/*@}*/
+
+/**
+ * @defgroup MISC Miscellaneous
+ */
+/*@{*/
+
+/**
+ * @deprecated
+ * @brief Create scratch image.
+ *
+ * Images created with this function can be used by the library to switch from
+ * a buffer-based to an image-based implementation. This can increase
+ * performance up to 2 or 3 times over buffer-objects-based ones on some systems.
+ * To leverage the GEMM and TRMM kernels, it is necessary to create two images.
+ *
+ * The following description provides bounds for the width and height arguments
+ * for functions that can use scratch images.
+ *
+ * Let \c type be the data type of the function in question.
+ *
+ * Let <tt>fl4RelSize(type) = sizeof(cl_float4) / sizeof(type)</tt>.
+ *
+ * Let \c width1 and \c width2 be the respective values of the width argument
+ * passed into the function for the two images needed to activate the image-based
+ * algorithm. Similarly, let \c height1 and \c height2 be the values for the
+ * height argument.
+ *
+ * Let <tt>div_up(x,y) = (x + y - 1) / y</tt>.
+ *
+ * Let <tt>round_up(x,y) = div_up(x,y) * y</tt>.
+ *
+ * Let <tt>round_down(x,y) = (x / y) * y</tt>.
+ *
+ * Then:
+ *
+ * For \b xGEMM there should be 2 images satisfying the following requirements:
+ *   - <tt>width1 >= round_up(K, 64) / fl4RelSize(type)</tt>,
+ *   - <tt>width2 >= round_up(K, 64) / fl4RelSize(type)</tt>,
+ *   - <tt>height >= 64M</tt>,
+ *
+ * for any transA, transB, and order.
+ *
+ * For \b xTRMM:
+ *   - <tt>width1 >= round_up(T, 64) / fl4RelSize(type)</tt>,
+ *   - <tt>width2 >= round_up(N, 64) / fl4RelSize(type)</tt>,
+ *   - <tt>height >= 64</tt>,
+ *
+ * for any transA, transB and order, where
+ *   - \c T = M, for \c side = clAmdBlasLeft, and
+ *   - \c T = N, for \c side = clAmdBlasRight.
+ *
+ * For \b xTRSM:
+ *   - <tt>round_down(width, 32) * round_down(height, 32) * fl4RelSize(type) >= 1/2 * (round_up(T, 32)^2 + div_up(T, 32) * 32^2)</tt>
+ *
+ * for any transA, transB and order, where
+ *   - \c T = M, for \c side = clAmdBlasLeft, and
+ *   - \c T = N, for \c side = clAmdBlasRight.
+ *
+ * A call to clAmdBlasAddScratchImage with arguments \c width and \c height reserves
+ * approximately <tt>width * height * 16</tt> bytes of device memory.
+ *
+ * @return A created image identifier.
+ *
+ * @ingroup MISC
+ */
+cl_ulong
+clAmdBlasAddScratchImage(
+    cl_context context,
+    size_t width,
+    size_t height,
+    clAmdBlasStatus *status);
+
+/**
+ * @deprecated
+ * @brief Release scratch image.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if an invalid image identifier is passed.
+ *
+ * @ingroup MISC
+ */
+clAmdBlasStatus
+clAmdBlasRemoveScratchImage(
+    cl_ulong imageID);
+
+/*@}*/
+
+/**
+ * @defgroup BLAS1 BLAS-1 functions
+ *
+ * The Level 1 Basic Linear Algebra Subprograms are functions that perform
+ * vector-vector operations.
+ */
+/*@{*/
+/*@}*/
+
+/**
+ * @defgroup SWAP SWAP  - Swap elements from 2 vectors
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief interchanges two vectors of float.
+ *
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SWAP
+ */
+static __inline clAmdBlasStatus  /* static: a header-defined inline needs internal linkage under C99 */
+clAmdBlasSswap( 
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasSswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_sswap.c
+ * Example of how to use the @ref clAmdBlasSswap function.
+ */
+ 
+ /**
+ * @brief Interchanges two vectors of double elements.
+ *
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSswap() function otherwise.
+ *
+ * @ingroup SWAP
+ */
+__inline clAmdBlasStatus
+clAmdBlasDswap( 
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasDswap() unchanged and return its status. */
+	return clblasDswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+/**
+ * @brief Interchanges two vectors of complex-float elements.
+ *
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSswap() function otherwise.
+ *
+ * @ingroup SWAP
+ */
+__inline clAmdBlasStatus
+clAmdBlasCswap( 
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasCswap() unchanged and return its status. */
+	return clblasCswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief Interchanges two vectors of double-complex elements.
+ *
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasDswap() function otherwise.
+ *
+ * @ingroup SWAP
+ */
+__inline clAmdBlasStatus
+clAmdBlasZswap( 
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasZswap() unchanged and return its status. */
+	return clblasZswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+/*@}*/
+
+
+/**
+ * @defgroup SCAL SCAL  - Scales a vector by a constant
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Scales a float vector by a float constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if the \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SCAL
+ */
+__inline clAmdBlasStatus
+clAmdBlasSscal(
+    size_t N,
+    cl_float alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasSscal() unchanged and return its status. */
+	return clblasSscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_sscal.c
+ * Example of how to use the @ref clAmdBlasSscal function.
+ */
+ 
+ /**
+ * @brief Scales a double vector by a double constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSscal() function otherwise.
+ *
+ * @ingroup SCAL
+ */
+__inline clAmdBlasStatus
+clAmdBlasDscal(
+    size_t N,
+    cl_double alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasDscal() unchanged and return its status. */
+	return clblasDscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief Scales a complex-float vector by a complex-float constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSscal() function otherwise.
+ *
+ * @ingroup SCAL
+ */
+__inline clAmdBlasStatus
+clAmdBlasCscal(
+    size_t N,
+    cl_float2 alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasCscal() unchanged and return its status. */
+	return clblasCscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief Scales a complex-double vector by a complex-double constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasDscal() function otherwise.
+ *
+ * @ingroup SCAL
+ */
+__inline clAmdBlasStatus
+clAmdBlasZscal(
+    size_t N,
+    cl_double2 alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasZscal() unchanged and return its status. */
+	return clblasZscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+ 
+/*@}*/
+
+/**
+ * @defgroup SSCAL SSCAL  - Scales a complex vector by a real constant
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Scales a complex-float vector by a float constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if the \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SSCAL
+ */
+__inline clAmdBlasStatus
+clAmdBlasCsscal(
+    size_t N,
+    cl_float alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasCsscal() unchanged and return its status. */
+	return clblasCsscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+ 
+
+
+/**
+ * @example example_csscal.c
+ * Example of how to use the @ref clAmdBlasCsscal function.
+ */
+ 
+/**
+ * @brief Scales a complex-double vector by a double constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasCsscal() function otherwise.
+ *
+ * @ingroup SSCAL
+ */
+__inline clAmdBlasStatus
+clAmdBlasZdscal(
+    size_t N,
+    cl_double alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasZdscal() unchanged and return its status. */
+	return clblasZdscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+ /*@}*/
+ 
+ 
+/**
+ * @defgroup COPY COPY  - Copies elements from vector X to vector Y
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Copies float elements from vector X to vector Y
+ *
+ *   - \f$ Y \leftarrow X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup COPY
+ */
+__inline clAmdBlasStatus
+clAmdBlasScopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasScopy() unchanged and return its status. */
+	return clblasScopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_scopy.c
+ * Example of how to use the @ref clAmdBlasScopy function.
+ */
+ 
+ /**
+ * @brief Copies double elements from vector X to vector Y
+ *
+ *   - \f$ Y \leftarrow X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasScopy() function otherwise.
+ *
+ * @ingroup COPY
+ */
+__inline clAmdBlasStatus
+clAmdBlasDcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasDcopy() unchanged and return its status. */
+	return clblasDcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief Copies complex-float elements from vector X to vector Y
+ *
+ *   - \f$ Y \leftarrow X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasScopy() function otherwise.
+ *
+ * @ingroup COPY
+ */
+__inline clAmdBlasStatus
+clAmdBlasCcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasCcopy() unchanged and return its status. */
+	return clblasCcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief Copies complex-double elements from vector X to vector Y
+ *
+ *   - \f$ Y \leftarrow X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasDcopy() function otherwise.
+ *
+ * @ingroup COPY
+ */
+__inline clAmdBlasStatus
+clAmdBlasZcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasZcopy() unchanged and return its status. */
+	return clblasZcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+ 
+ /*@}*/
+ 
+/**
+ * @defgroup AXPY AXPY  - Scale X and add to Y
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Scale vector X of float elements and add to Y
+ *
+ *   - \f$ Y \leftarrow \alpha X + Y \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup AXPY
+ */
+__inline clAmdBlasStatus
+clAmdBlasSaxpy(
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasSaxpy() unchanged and return its status. */
+	return clblasSaxpy( N, alpha, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_saxpy.c
+ * Example of how to use the @ref clAmdBlasSaxpy function.
+ */
+ 
+/**
+ * @brief Scale vector X of double elements and add to Y
+ *
+ *   - \f$ Y \leftarrow \alpha X + Y \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSaxpy() function otherwise.
+ *
+ * @ingroup AXPY
+ */
+__inline clAmdBlasStatus
+clAmdBlasDaxpy(
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pass all arguments through to clblasDaxpy() unchanged and return its status. */
+	return clblasDaxpy( N, alpha, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief Scale vector X of complex-float elements and add to Y
+ *
+ *   - \f$ Y \leftarrow \alpha X + Y \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSaxpy() function otherwise.
+ *
+ * @ingroup AXPY
+ */
+__inline clAmdBlasStatus clAmdBlasCaxpy(
+    size_t N, cl_float2 alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasCaxpy unchanged and in the same order; its status is returned. */
+    return clblasCaxpy(
+        N, alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+    
+/**
+ * @brief Scale vector X of double-complex elements and add to Y
+ *
+ *   - \f$ Y \leftarrow \alpha X + Y \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasDaxpy() function otherwise.
+ *
+ * @ingroup AXPY
+ */
+__inline clAmdBlasStatus clAmdBlasZaxpy(
+    size_t N, cl_double2 alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasZaxpy unchanged and in the same order; its status is returned. */
+    return clblasZaxpy(
+        N, alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+ 
+/*@}*/
+
+
+/**
+ * @defgroup DOT DOT  - Dot product of two vectors
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief dot product of two vectors containing float elements
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if any of the \b X, \b Y or \b dotProduct
+ *     objects is invalid, or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup DOT
+ */
+__inline clAmdBlasStatus clAmdBlasSdot(
+    size_t N,
+    cl_mem dotProduct, size_t offDP,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasSdot unchanged and in the same order; its status is returned. */
+    return clblasSdot(
+        N,
+        dotProduct, offDP,
+        X, offx, incx,
+        Y, offy, incy,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @example example_sdot.c
+ * Example of how to use the @ref clAmdBlasSdot function.
+ */
+ 
+/**
+ * @brief dot product of two vectors containing double elements
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+__inline clAmdBlasStatus clAmdBlasDdot(
+    size_t N,
+    cl_mem dotProduct, size_t offDP,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasDdot unchanged and in the same order; its status is returned. */
+    return clblasDdot(
+        N,
+        dotProduct, offDP,
+        X, offx, incx,
+        Y, offy, incy,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+ 
+
+/**
+ * @brief dot product of two vectors containing float-complex elements
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+__inline clAmdBlasStatus clAmdBlasCdotu(
+    size_t N,
+    cl_mem dotProduct, size_t offDP,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasCdotu unchanged and in the same order; its status is returned. */
+    return clblasCdotu(
+        N,
+        dotProduct, offDP,
+        X, offx, incx,
+        Y, offy, incy,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+
+/**
+ * @brief dot product of two vectors containing double-complex elements
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+
+__inline clAmdBlasStatus clAmdBlasZdotu(
+    size_t N,
+    cl_mem dotProduct, size_t offDP,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasZdotu unchanged and in the same order; its status is returned. */
+    return clblasZdotu(
+        N,
+        dotProduct, offDP,
+        X, offx, incx,
+        Y, offy, incy,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+
+/**
+ * @brief dot product of two vectors containing float-complex elements conjugating the first vector
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+
+__inline clAmdBlasStatus clAmdBlasCdotc(
+    size_t N,
+    cl_mem dotProduct, size_t offDP,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasCdotc unchanged and in the same order; its status is returned. */
+    return clblasCdotc(
+        N,
+        dotProduct, offDP,
+        X, offx, incx,
+        Y, offy, incy,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+
+/**
+ * @brief dot product of two vectors containing double-complex elements conjugating the first vector
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+__inline clAmdBlasStatus clAmdBlasZdotc(
+    size_t N,
+    cl_mem dotProduct, size_t offDP,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasZdotc unchanged and in the same order; its status is returned. */
+    return clblasZdotc(
+        N,
+        dotProduct, offDP,
+        X, offx, incx,
+        Y, offy, incy,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup ROTG ROTG  - Constructs Givens plane rotation
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief construct Givens plane rotation on float elements
+ *
+ * @param[out] SA           Buffer object that contains SA
+ * @param[in] offSA         Offset to SA in \b SA buffer object.
+ *                          Counted in elements.
+ * @param[out] SB           Buffer object that contains SB
+ * @param[in] offSB         Offset to SB in \b SB buffer object.
+ *                          Counted in elements.
+ * @param[out] C            Buffer object that contains C
+ * @param[in] offC          Offset to C in \b C buffer object.
+ *                          Counted in elements.
+ * @param[out] S            Buffer object that contains S
+ * @param[in] offS          Offset to S in \b S buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidMemObject if any of the \b SA, \b SB, \b C or \b S
+ *     objects is invalid, or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ROTG
+ */
+__inline clAmdBlasStatus clAmdBlasSrotg(
+    cl_mem SA, size_t offSA,
+    cl_mem SB, size_t offSB,
+    cl_mem C, size_t offC,
+    cl_mem S, size_t offS,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasSrotg unchanged and in the same order; its status is returned. */
+    return clblasSrotg(
+        SA, offSA,
+        SB, offSB,
+        C, offC,
+        S, offS,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @example example_srotg.c
+ * Example of how to use the @ref clAmdBlasSrotg function.
+ */
+ 
+/**
+ * @brief construct Givens plane rotation on double elements
+ *
+ * @param[out] DA           Buffer object that contains DA
+ * @param[in] offDA         Offset to DA in \b DA buffer object.
+ *                          Counted in elements.
+ * @param[out] DB           Buffer object that contains DB
+ * @param[in] offDB         Offset to DB in \b DB buffer object.
+ *                          Counted in elements.
+ * @param[out] C            Buffer object that contains C
+ * @param[in] offC          Offset to C in \b C buffer object.
+ *                          Counted in elements.
+ * @param[out] S            Buffer object that contains S
+ * @param[in] offS          Offset to S in \b S buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSrotg() function otherwise.
+ *
+ * @ingroup ROTG
+ */
+__inline clAmdBlasStatus clAmdBlasDrotg(
+    cl_mem DA, size_t offDA,
+    cl_mem DB, size_t offDB,
+    cl_mem C, size_t offC,
+    cl_mem S, size_t offS,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasDrotg unchanged and in the same order; its status is returned. */
+    return clblasDrotg(
+        DA, offDA,
+        DB, offDB,
+        C, offC,
+        S, offS,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+    
+/**
+ * @brief construct Givens plane rotation on float-complex elements
+ *
+ * @param[out] CA           Buffer object that contains CA
+ * @param[in] offCA         Offset to CA in \b CA buffer object.
+ *                          Counted in elements.
+ * @param[out] CB           Buffer object that contains CB
+ * @param[in] offCB         Offset to CB in \b CB buffer object.
+ *                          Counted in elements.
+ * @param[out] C            Buffer object that contains C. C is real.
+ * @param[in] offC          Offset to C in \b C buffer object.
+ *                          Counted in elements.
+ * @param[out] S            Buffer object that contains S
+ * @param[in] offS          Offset to S in \b S buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSrotg() function otherwise.
+ *
+ * @ingroup ROTG
+ */
+__inline clAmdBlasStatus clAmdBlasCrotg(
+    cl_mem CA, size_t offCA,
+    cl_mem CB, size_t offCB,
+    cl_mem C, size_t offC,
+    cl_mem S, size_t offS,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasCrotg unchanged and in the same order; its status is returned. */
+    return clblasCrotg(
+        CA, offCA,
+        CB, offCB,
+        C, offC,
+        S, offS,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+    
+/**
+ * @brief construct Givens plane rotation on double-complex elements
+ *
+ * @param[out] CA           Buffer object that contains CA
+ * @param[in] offCA         Offset to CA in \b CA buffer object.
+ *                          Counted in elements.
+ * @param[out] CB           Buffer object that contains CB
+ * @param[in] offCB         Offset to CB in \b CB buffer object.
+ *                          Counted in elements.
+ * @param[out] C            Buffer object that contains C. C is real.
+ * @param[in] offC          Offset to C in \b C buffer object.
+ *                          Counted in elements.
+ * @param[out] S            Buffer object that contains S
+ * @param[in] offS          Offset to S in \b S buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasDrotg() function otherwise.
+ *
+ * @ingroup ROTG
+ */
+__inline clAmdBlasStatus clAmdBlasZrotg(
+    cl_mem CA, size_t offCA,
+    cl_mem CB, size_t offCB,
+    cl_mem C, size_t offC,
+    cl_mem S, size_t offS,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasZrotg unchanged and in the same order; its status is returned. */
+    return clblasZrotg(
+        CA, offCA,
+        CB, offCB,
+        C, offC,
+        S, offS,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+ 
+/*@}*/
+
+/**
+ * @defgroup ROTMG ROTMG  - Constructs the modified Givens rotation
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief construct the modified Givens rotation on float elements
+ *
+ * @param[out] SD1          Buffer object that contains SD1
+ * @param[in] offSD1        Offset to SD1 in \b SD1 buffer object.
+ *                          Counted in elements.
+ * @param[out] SD2          Buffer object that contains SD2
+ * @param[in] offSD2        Offset to SD2 in \b SD2 buffer object.
+ *                          Counted in elements.
+ * @param[out] SX1          Buffer object that contains SX1
+ * @param[in] offSX1        Offset to SX1 in \b SX1 buffer object.
+ *                          Counted in elements.
+ * @param[in] SY1           Buffer object that contains SY1
+ * @param[in] offSY1        Offset to SY1 in \b SY1 buffer object.
+ *                          Counted in elements.
+ * @param[out] SPARAM       Buffer object that contains SPARAM array of minimum length 5:
+ *                          - SPARAM(0) = SFLAG
+ *                          - SPARAM(1) = SH11
+ *                          - SPARAM(2) = SH21
+ *                          - SPARAM(3) = SH12
+ *                          - SPARAM(4) = SH22
+ * @param[in] offSparam     Offset to SPARAM in \b SPARAM buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidMemObject if any of the \b SX1, \b SY1, \b SD1, \b SD2
+ *     or \b SPARAM objects is invalid, or is an image object rather than a
+ *     buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ROTMG
+ */
+__inline clAmdBlasStatus clAmdBlasSrotmg(
+    cl_mem SD1, size_t offSD1,
+    cl_mem SD2, size_t offSD2,
+    cl_mem SX1, size_t offSX1,
+    const cl_mem SY1, size_t offSY1,
+    cl_mem SPARAM, size_t offSparam,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin forwarding wrapper: every argument is passed through to
+     * clblasSrotmg unchanged and in the same order; its status is returned. */
+    return clblasSrotmg(
+        SD1, offSD1,
+        SD2, offSD2,
+        SX1, offSX1,
+        SY1, offSY1,
+        SPARAM, offSparam,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @example example_srotmg.c
+ * Example of how to use the @ref clAmdBlasSrotmg function.
+ */
+ 
+/**
+ * @brief construct the modified Givens rotation on double elements
+ *
+ * @param[out] DD1          Buffer object that contains DD1
+ * @param[in] offDD1        Offset to DD1 in \b DD1 buffer object.
+ *                          Counted in elements.
+ * @param[out] DD2          Buffer object that contains DD2
+ * @param[in] offDD2        Offset to DD2 in \b DD2 buffer object.
+ *                          Counted in elements.
+ * @param[out] DX1          Buffer object that contains DX1
+ * @param[in] offDX1        Offset to DX1 in \b DX1 buffer object.
+ *                          Counted in elements.
+ * @param[in] DY1           Buffer object that contains DY1
+ * @param[in] offDY1        Offset to DY1 in \b DY1 buffer object.
+ *                          Counted in elements.
+ * @param[out] DPARAM       Buffer object that contains DPARAM array of minimum length 5:
+ *                          - DPARAM(0) = DFLAG
+ *                          - DPARAM(1) = DH11
+ *                          - DPARAM(2) = DH21
+ *                          - DPARAM(3) = DH12
+ *                          - DPARAM(4) = DH22
+ * @param[in] offDparam     Offset to DPARAM in \b DPARAM buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSrotmg() function otherwise.
+ *
+ * @ingroup ROTMG
+ */
+__inline clAmdBlasStatus
+clAmdBlasDrotmg(
+    cl_mem DD1,
+    size_t offDD1,
+    cl_mem DD2,
+    size_t offDD2,
+    cl_mem DX1,
+    size_t offDX1,
+    const cl_mem DY1,
+    size_t offDY1,
+    cl_mem DPARAM,
+    size_t offDparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasDrotmg( DD1, offDD1, DD2, offDD2, DX1, offDX1, DY1, offDY1, DPARAM, offDparam,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+ 
+/*@}*/
+
+
+/**
+ * @defgroup ROT ROT  - Apply Givens rotation
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief applies a plane rotation for float elements
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasSrot()
+ * and returns its status.
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] C         C specifies the cosine, cos.
+ * @param[in] S         S specifies the sine, sin.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ROT
+ */
+__inline clAmdBlasStatus
+clAmdBlasSrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_float C,
+    cl_float S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasSrot( N, X, offx, incx, Y, offy, incy, C, S,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_srot.c
+ * Example of how to use the @ref clAmdBlasSrot function.
+ */
+ 
+/**
+ * @brief applies a plane rotation for double elements
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasDrot()
+ * and returns its status.
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] C         C specifies the cosine, cos.
+ * @param[in] S         S specifies the sine, sin.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSrot() function otherwise.
+ *
+ * @ingroup ROT
+ */
+__inline clAmdBlasStatus
+clAmdBlasDrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_double C,
+    cl_double S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasDrot( N, X, offx, incx, Y, offy, incy, C, S,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief applies a plane rotation for float-complex elements
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasCsrot()
+ * and returns its status.
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] C         C specifies the cosine, cos. This number is real.
+ * @param[in] S         S specifies the sine, sin. This number is real.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSrot() function otherwise.
+ *
+ * @ingroup ROT
+ */
+__inline clAmdBlasStatus
+clAmdBlasCsrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_float C,
+    cl_float S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasCsrot( N, X, offx, incx, Y, offy, incy, C, S,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief applies a plane rotation for double-complex elements
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasZdrot()
+ * and returns its status.
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] C         C specifies the cosine, cos. This number is real.
+ * @param[in] S         S specifies the sine, sin. This number is real.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSrot() function otherwise.
+ *
+ * @ingroup ROT
+ */
+__inline clAmdBlasStatus
+clAmdBlasZdrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_double C,
+    cl_double S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasZdrot( N, X, offx, incx, Y, offy, incy, C, S,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+ 
+/*@}*/
+ 
+/**
+ * @defgroup ROTM ROTM  - Apply modified Givens rotation for points in the plane
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief modified Givens rotation for float elements
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasSrotm()
+ * and returns its status.
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] SPARAM    Buffer object that contains SPARAM array of minimum length 5
+ *                      SPARAM(1)=SFLAG
+ *                      SPARAM(2)=SH11
+ *                      SPARAM(3)=SH21
+ *                      SPARAM(4)=SH12
+ *                      SPARAM(5)=SH22
+ * @param[in] offSparam Offset of first element of array \b SPARAM in buffer object.
+ *                      Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b X, \b Y or \b SPARAM object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ROTM
+ */
+__inline clAmdBlasStatus
+clAmdBlasSrotm(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    const cl_mem SPARAM,
+    size_t offSparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasSrotm( N, X, offx, incx, Y, offy, incy, SPARAM, offSparam,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_srotm.c
+ * Example of how to use the @ref clAmdBlasSrotm function.
+ */
+
+/**
+ * @brief modified Givens rotation for double elements
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasDrotm()
+ * and returns its status.
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] DPARAM    Buffer object that contains DPARAM array of minimum length 5
+ *                      DPARAM(1)=DFLAG
+ *                      DPARAM(2)=DH11
+ *                      DPARAM(3)=DH21
+ *                      DPARAM(4)=DH12
+ *                      DPARAM(5)=DH22
+ * @param[in] offDparam Offset of first element of array \b DPARAM in buffer object.
+ *                      Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSrotm() function otherwise.
+ *
+ * @ingroup ROTM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDrotm(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    const cl_mem DPARAM,
+    size_t offDparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasDrotm( N, X, offx, incx, Y, offy, incy, DPARAM, offDparam,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+/**
+ * @defgroup NRM2 NRM2  - Euclidean norm of a vector
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief computes the Euclidean norm of vector containing float elements
+ *
+ *  NRM2 = sqrt( X' * X )
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasSnrm2()
+ * and returns its status.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] NRM2         Buffer object that will contain the NRM2 value
+ * @param[in] offNRM2       Offset to NRM2 value in \b NRM2 buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object that can hold a minimum of (2*N) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if any of \b X or \b NRM2 or \b scratchBuff object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup NRM2
+ */
+__inline clAmdBlasStatus
+clAmdBlasSnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasSnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_snrm2.c
+ * Example of how to use the @ref clAmdBlasSnrm2 function.
+ */
+ 
+/**
+ * @brief computes the Euclidean norm of vector containing double elements
+ *
+ *  NRM2 = sqrt( X' * X )
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasDnrm2()
+ * and returns its status.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] NRM2         Buffer object that will contain the NRM2 value
+ * @param[in] offNRM2       Offset to NRM2 value in \b NRM2 buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object that can hold a minimum of (2*N) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSnrm2() function otherwise.
+ *
+ * @ingroup NRM2
+ */
+__inline clAmdBlasStatus
+clAmdBlasDnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasDnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief computes the Euclidean norm of vector containing float-complex elements
+ *
+ *  NRM2 = sqrt( X**H * X )
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasScnrm2()
+ * and returns its status.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] NRM2         Buffer object that will contain the NRM2 value.
+ *                          Note that the answer of Scnrm2 is a real value.
+ * @param[in] offNRM2       Offset to NRM2 value in \b NRM2 buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object that can hold a minimum of (2*N) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSnrm2() function otherwise.
+ *
+ * @ingroup NRM2
+ */
+__inline clAmdBlasStatus
+clAmdBlasScnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasScnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+    
+/**
+ * @brief computes the Euclidean norm of vector containing double-complex elements
+ *
+ *  NRM2 = sqrt( X**H * X )
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasDznrm2()
+ * and returns its status.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] NRM2         Buffer object that will contain the NRM2 value.
+ *                          Note that the answer of Dznrm2 is a real value.
+ * @param[in] offNRM2       Offset to NRM2 value in \b NRM2 buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object that can hold a minimum of (2*N) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSnrm2() function otherwise.
+ *
+ * @ingroup NRM2
+ */
+__inline clAmdBlasStatus
+clAmdBlasDznrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasDznrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+ 
+/*@}*/
+
+/**
+ * @defgroup iAMAX iAMAX  - Index of max absolute value
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief index of max absolute value in a float array
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasiSamax()
+ * and returns its status.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] iMax         Buffer object storing the index of first absolute max.
+ *                          The index will be of type unsigned int
+ * @param[in] offiMax       Offset for storing index in the buffer iMax
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem object to store intermediate results
+ *                          It should be able to hold minimum of (2*N) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if any of \b iMax, \b X or \b scratchBuff object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that the passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup iAMAX
+ */
+__inline clAmdBlasStatus
+clAmdBlasiSamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasiSamax( N, iMax, offiMax, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_isamax.c
+ * Example of how to use the @ref clAmdBlasiSamax function.
+ */
+
+
+/**
+ * @brief index of max absolute value in a double array
+ *
+ * Thin inline wrapper: forwards every argument unchanged to clblasiDamax()
+ * and returns its status.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] iMax         Buffer object storing the index of first absolute max.
+ *                          The index will be of type unsigned int
+ * @param[in] offiMax       Offset for storing index in the buffer iMax
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem object to store intermediate results
+ *                          It should be able to hold minimum of (2*N) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasiSamax() function otherwise.
+ *
+ * @ingroup iAMAX
+ */
+__inline clAmdBlasStatus
+clAmdBlasiDamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasiDamax( N, iMax, offiMax, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief index of max absolute value in a complex float array
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] iMax         Buffer object storing the index of first absolute max.
+ *                          The index will be of type unsigned int
+ * @param[in] offiMax       Offset for storing index in the buffer iMax
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temprory cl_mem object to store intermediate results
+                            It should be able to hold minimum of (2*N) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasiSamax() function otherwise.
+ *
+ * @ingroup iAMAX 
+ */
+__inline clAmdBlasStatus
+clAmdBlasiCamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument is forwarded unchanged, in declaration
+     * order, to clblasiCamax(); its result is returned directly. */
+    return clblasiCamax(
+        N,
+        iMax, offiMax,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief index of max absolute value in a complex double array
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] iMax         Buffer object storing the index of first absolute max.
+ *                          The index will be of type unsigned int
+ * @param[in] offiMax       Offset for storing index in the buffer iMax
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem object to store intermediate results.
+ *                          It should be able to hold a minimum of (2*N) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasiSamax() function otherwise.
+ *
+ * @ingroup iAMAX 
+ */
+__inline clAmdBlasStatus
+clAmdBlasiZamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument is forwarded unchanged, in declaration
+     * order, to clblasiZamax(); its result is returned directly. */
+    return clblasiZamax(
+        N,
+        iMax, offiMax,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/*@}*/
+
+/**
+ * @defgroup ASUM ASUM  - Sum of absolute values
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief absolute sum of values of a vector containing float elements
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] asum         Buffer object that will contain the absolute sum value
+ * @param[in] offAsum       Offset to absolute sum in \b asum buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if any of \b X or \b asum or \b scratchBuff object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ASUM
+ */
+__inline clAmdBlasStatus
+clAmdBlasSasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument is forwarded unchanged, in declaration
+     * order, to clblasSasum(); its result is returned directly. */
+    return clblasSasum(
+        N,
+        asum, offAsum,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @example example_sasum.c
+ * Example of how to use the @ref clAmdBlasSasum function.
+ */
+
+/**
+ * @brief absolute sum of values of a vector containing double elements
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] asum         Buffer object that will contain the absolute sum value
+ * @param[in] offAsum       Offset to absolute sum in \b asum buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSasum() function otherwise.
+ *
+ * @ingroup ASUM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument is forwarded unchanged, in declaration
+     * order, to clblasDasum(); its result is returned directly. */
+    return clblasDasum(
+        N,
+        asum, offAsum,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+
+/**
+ * @brief absolute sum of values of a vector containing float-complex elements
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] asum         Buffer object that will contain the absolute sum value
+ * @param[in] offAsum       Offset to absolute sum in \b asum buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSasum() function otherwise.
+ *
+ * @ingroup ASUM
+ */
+__inline clAmdBlasStatus
+clAmdBlasScasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument is forwarded unchanged, in declaration
+     * order, to clblasScasum(); its result is returned directly. */
+    return clblasScasum(
+        N,
+        asum, offAsum,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+
+/**
+ * @brief absolute sum of values of a vector containing double-complex elements
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] asum         Buffer object that will contain the absolute sum value
+ * @param[in] offAsum       Offset to absolute sum in \b asum buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSasum() function otherwise.
+ *
+ * @ingroup ASUM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDzasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument is forwarded unchanged, in declaration
+     * order, to clblasDzasum(); its result is returned directly. */
+    return clblasDzasum(
+        N,
+        asum, offAsum,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/*@}*/
+
+/**
+ * @defgroup BLAS2 BLAS-2 functions
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * matrix-vector operations.
+ */
+/*@{*/
+/*@}*/
+
+
+/**
+ * @defgroup GEMV GEMV  - General matrix-Vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. Must not be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasSgemvEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b M or \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix size or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b A, \b x, or \b y object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasSgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* This signature has no offset parameter for A, so a literal 0 is
+     * supplied in the slot where clAmdBlasSgemvEx() passes offA. All other
+     * arguments are forwarded unchanged to clblasSgemv(). */
+    return clblasSgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, 0, lda,
+        x, offx, incx,
+        beta,
+        y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @example example_sgemv.c
+ * Example of how to use the @ref clAmdBlasSgemv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clAmdBlasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasDgemvEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasDgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* This signature has no offset parameter for A, so a literal 0 is
+     * supplied in the slot where clAmdBlasDgemvEx() passes offA. All other
+     * arguments are forwarded unchanged to clblasDgemv(). */
+    return clblasDgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, 0, lda,
+        x, offx, incx,
+        beta,
+        y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ * float complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clAmdBlasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasCgemvEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - the same error codes as the clAmdBlasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasCgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* This signature has no offset parameter for A, so a literal 0 is
+     * supplied in the slot where clAmdBlasCgemvEx() passes offA. All other
+     * arguments are forwarded unchanged to clblasCgemv(). */
+    return clblasCgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, 0, lda,
+        x, offx, incx,
+        beta,
+        y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ * double complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clAmdBlasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasZgemvEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasZgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* This signature has no offset parameter for A, so a literal 0 is
+     * supplied in the slot where the other GEMV wrappers that take offA
+     * pass it. All other arguments are forwarded unchanged to
+     * clblasZgemv(). */
+    return clblasZgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, 0, lda,
+        x, offx, incx,
+        beta,
+        y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ *        float elements. Extended version.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clAmdBlasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasSgemvEx(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument, including offA, is forwarded unchanged
+     * and in declaration order to clblasSgemv(); its result is returned
+     * directly. */
+    return clblasSgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, offA, lda,
+        x, offx, incx,
+        beta,
+        y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @example example_sgemv.c
+ * This is an example of how to use the @ref clAmdBlasSgemvEx function.
+ */
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ *        double elements. Extended version.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of \b A in the buffer
+ *                      object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clAmdBlasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clAmdBlasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasDgemvEx(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument, including offA, is forwarded unchanged
+     * and in declaration order to clblasDgemv(); its result is returned
+     * directly. */
+    return clblasDgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, offA, lda,
+        x, offx, incx,
+        beta,
+        y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ *        float complex elements. Extended version.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clAmdBlasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clAmdBlasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasCgemvEx(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pure pass-through: every argument goes to clblasCgemv() in the same order; its result is returned directly. */
+	return clblasCgemv( order, transA, M, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ *        double complex elements. Extended version.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clAmdBlasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clAmdBlasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasZgemvEx(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pure pass-through: every argument goes to clblasZgemv() in the same order; its result is returned directly. */
+	return clblasZgemv( order, transA, M, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+/**
+ * @defgroup SYMV SYMV  - Symmetric matrix-Vector multiplication
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a symmetric matrix and float elements.
+ *
+ * Matrix-vector products:
+ * - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b x. It cannot be zero.
+ * @param[in] beta      The factor of vector \b y.
+ * @param[out] y        Buffer object storing vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * This function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasSsymvEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b A, \b x, or \b y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsymv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Forwards to clblasSsymv(); this signature has no offset for A, so the literal 0 fills the slot that clAmdBlasSsymvEx() passes as offA. */
+	return clblasSsymv( order, uplo, N, alpha, A, 0, lda, x, offx, incx, beta, y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_ssymv.c
+ * This is an example of how to use the @ref clAmdBlasSsymv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a symmetric matrix and double elements.
+ *
+ * Matrix-vector products:
+ * - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b x. It cannot be zero.
+ * @param[in] beta      The factor of vector \b y.
+ * @param[out] y        Buffer object storing vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * This function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasDsymvEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSsymv() function otherwise.
+ *
+ * @ingroup SYMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsymv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Forwards to clblasDsymv(); this signature has no offset for A, so the literal 0 fills the slot that clAmdBlasDsymvEx() passes as offA. */
+	return clblasDsymv( order, uplo, N, alpha, A, 0, lda, x, offx, incx, beta, y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Matrix-vector product with a symmetric matrix and float elements.
+ *        Extended version.
+ *
+ * Matrix-vector products:
+ * - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b x. It cannot be zero.
+ * @param[in] beta      The factor of vector \b y.
+ * @param[out] y        Buffer object storing vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clAmdBlasSsymv() function otherwise.
+ *
+ * @ingroup SYMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsymvEx(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pass-through to clblasSsymv(): all arguments, including the caller's offA, are forwarded unchanged. */
+	return clblasSsymv( order, uplo, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_ssymv.c
+ * This is an example of how to use the @ref clAmdBlasSsymv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a symmetric matrix and double elements.
+ *        Extended version.
+ *
+ * Matrix-vector products:
+ * - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b x. It cannot be zero.
+ * @param[in] beta      The factor of vector \b y.
+ * @param[out] y        Buffer object storing vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clAmdBlasSsymv() function otherwise.
+ *
+ * @ingroup SYMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsymvEx(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pass-through to clblasDsymv(): all arguments, including the caller's offA, are forwarded unchanged. */
+	return clblasDsymv( order, uplo, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup HEMV HEMV  - Hermitian matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a hermitian matrix and float-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[out] Y        Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b A, \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasChemv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pass-through to clblasChemv(): arguments are forwarded unchanged and in order; the callee's result is returned directly. */
+	return clblasChemv( order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Matrix-vector product with a hermitian matrix and double-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[out] Y        Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasChemv() function otherwise.
+ *
+ * @ingroup HEMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasZhemv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pass-through to clblasZhemv(): arguments are forwarded unchanged and in order; the callee's result is returned directly. */
+	return clblasZhemv( order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_zhemv.cpp
+ * Example of how to use the @ref clAmdBlasZhemv function.
+ */
+/*@}*/
+
+
+
+/**
+ * @defgroup TRMV TRMV  - Triangular matrix vector multiply
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a triangular matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than \b N
+ * @param[out] X				Buffer object storing vector \b X.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TRMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasStrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pass-through to clblasStrmv(): every argument, scratchBuff included, is forwarded unchanged; the callee's result is returned directly. */
+	return clblasStrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_strmv.c
+ * Example of how to use the @ref clAmdBlasStrmv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a triangular matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than \b N
+ * @param[out] X				Buffer object storing vector \b X.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasStrmv() function otherwise.
+ *
+ * @ingroup TRMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasDtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pass-through to clblasDtrmv(): every argument, scratchBuff included, is forwarded unchanged; the callee's result is returned directly. */
+	return clblasDtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Matrix-vector product with a triangular matrix and
+ * float complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than \b N
+ * @param[out] X				Buffer object storing vector \b X.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasStrmv() function.
+ * @ingroup TRMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasCtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Pass-through to clblasCtrmv(): every argument, scratchBuff included, is forwarded unchanged; the callee's result is returned directly. */
+	return clblasCtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Matrix-vector product with a triangular matrix and
+ * double complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than \b N.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the product.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasDtrmv() function.
+ * @ingroup TRMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasZtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasZtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+
+/*@}*/
+
+/**
+ * @defgroup TRSV TRSV  - Triangular matrix vector Solve
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Solves a triangular system of equations with float elements.
+ *
+ * Triangular systems solved (the solution overwrites \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than \b N.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the solution.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either the \b A or \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context to which a passed command
+ *     queue belongs was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TRSV
+ */
+__inline clAmdBlasStatus
+clAmdBlasStrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasStrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_strsv.c
+ * Example of how to use the @ref clAmdBlasStrsv function.
+ */
+
+
+/**
+ * @brief Solves a triangular system of equations with double elements.
+ *
+ * Triangular systems solved (the solution overwrites \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than \b N.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the solution.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasStrsv() function otherwise.
+ *
+ * @ingroup TRSV
+ */
+__inline clAmdBlasStatus
+clAmdBlasDtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasDtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Solves a triangular system of equations with float-complex elements.
+ *
+ * Triangular systems solved (the solution overwrites \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than \b N.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the solution.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasStrsv() function.
+ *
+ * @ingroup TRSV
+ */
+__inline clAmdBlasStatus
+clAmdBlasCtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasCtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Solves a triangular system of equations with double-complex elements.
+ *
+ * Triangular systems solved (the solution overwrites \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than \b N.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the solution.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasDtrsv() function.
+ *
+ * @ingroup TRSV
+ */
+__inline clAmdBlasStatus
+clAmdBlasZtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasZtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+/**
+ * @defgroup GER GER   - General matrix rank 1 operation
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Vector-vector product with float elements that
+ * performs the rank 1 update of matrix \b A.
+ *
+ * Vector-vector products:
+ *   - \f$ A \leftarrow \alpha X Y^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Specifies the scalar alpha.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - a leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if A, X, or Y object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image is available;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context to which a passed command queue
+ *     belongs was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GER
+ */
+__inline clAmdBlasStatus
+clAmdBlasSger(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasSger( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_sger.c
+ * Example of how to use the @ref clAmdBlasSger function.
+ */
+
+
+/**
+ * @brief Vector-vector product with double elements that
+ * performs the rank 1 update of matrix \b A.
+ *
+ * Vector-vector products:
+ *   - \f$ A \leftarrow \alpha X Y^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Specifies the scalar alpha.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSger() function otherwise.
+ *
+ * @ingroup GER
+ */
+__inline clAmdBlasStatus
+clAmdBlasDger(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasDger( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+/*@}*/
+
+/**
+ * @defgroup GERU GERU  - General matrix rank 1 operation (unconjugated)
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Vector-vector product with float complex elements that
+ * performs the rank 1 update of matrix \b A.
+ *
+ * Vector-vector products:
+ *   - \f$ A \leftarrow \alpha X Y^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Specifies the scalar alpha.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - a leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if A, X, or Y object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image is available;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context to which a passed command queue
+ *     belongs was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GERU
+ */
+__inline clAmdBlasStatus
+clAmdBlasCgeru(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A ,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasCgeru( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Vector-vector product with double complex elements that
+ * performs the rank 1 update of matrix \b A.
+ *
+ * Vector-vector products:
+ *   - \f$ A \leftarrow \alpha X Y^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Specifies the scalar alpha.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasCgeru() function otherwise.
+ *
+ * @ingroup GERU
+ */
+__inline clAmdBlasStatus
+clAmdBlasZgeru(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasZgeru( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+/*@}*/
+
+/**
+ * @defgroup GERC GERC  - General matrix rank 1 operation (conjugated)
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Vector-vector product with float complex elements that
+ * performs the rank 1 update of matrix \b A.
+ *
+ * Vector-vector products:
+ *   - \f$ A \leftarrow \alpha X Y^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Specifies the scalar alpha.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - a leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if A, X, or Y object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image is available;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context to which a passed command queue
+ *     belongs was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GERC
+ */
+__inline clAmdBlasStatus
+clAmdBlasCgerc(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A ,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasCgerc( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Vector-vector product with double complex elements that
+ * performs the rank 1 update of matrix \b A.
+ *
+ * Vector-vector products:
+ *   - \f$ A \leftarrow \alpha X Y^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Specifies the scalar alpha.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasCgerc() function otherwise.
+ *
+ * @ingroup GERC
+ */
+__inline clAmdBlasStatus
+clAmdBlasZgerc(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasZgerc( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+
+/*@}*/
+
+/**
+ * @defgroup SYR SYR   - Symmetric rank 1 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * symmetric rank 1 update operations.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Symmetric rank 1 operation with a general triangular matrix and
+ * float elements.
+ *
+ * Symmetric rank 1 operation:
+ *   - \f$ A \leftarrow \alpha x x^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] A 	    Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of first element of matrix \b A in buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either the \b A or \b X object is
+ *     invalid, or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYR
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsyr(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_float alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasSsyr() and return its status. */
+    return clblasSsyr(
+        order, uplo, N,
+        alpha,
+        X, offx, incx,
+        A, offa, lda,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @brief Symmetric rank 1 operation with a general triangular matrix and
+ * double elements.
+ *
+ * Symmetric rank 1 operation:
+ *   - \f$ A \leftarrow \alpha x x^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of the rank 1 update \f$ x x^T \f$.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] A		Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of first element of matrix \b A in buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSsyr() function otherwise.
+ *
+ * @ingroup SYR
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsyr(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_double alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasDsyr() and return its status. */
+    return clblasDsyr(
+        order, uplo, N,
+        alpha,
+        X, offx, incx,
+        A, offa, lda,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+/*@}*/
+
+
+/**
+ * @defgroup HER HER   - Hermitian rank 1 operation 
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * Hermitian rank 1 operations.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Hermitian rank 1 operation with a general triangular matrix and
+ * float-complex elements.
+ *
+ * Hermitian rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of \f$ X X^H \f$ (a scalar float value)
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] A		Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either the \b A or \b X object is
+ *     invalid, or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HER
+ */
+__inline clAmdBlasStatus
+clAmdBlasCher(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_float alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasCher() and return its status. */
+    return clblasCher(
+        order, uplo, N,
+        alpha,
+        X, offx, incx,
+        A, offa, lda,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @example example_cher.c
+ * Example of how to use the @ref clAmdBlasCher function.
+ */
+
+/**
+ * @brief Hermitian rank 1 operation with a general triangular matrix and
+ * double-complex elements.
+ *
+ * Hermitian rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of \f$ X X^H \f$ (a scalar double value)
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] A		Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasCher() function otherwise.
+ *
+ * @ingroup HER
+ */
+__inline clAmdBlasStatus
+clAmdBlasZher(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_double alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasZher() and return its status. */
+    return clblasZher(
+        order, uplo, N,
+        alpha,
+        X, offx, incx,
+        A, offa, lda,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*@}*/
+
+/**
+ * @defgroup SYR2 SYR2  - Symmetric rank 2 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * symmetric rank 2 update operations.
+  * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Symmetric rank 2 operation with a general triangular matrix and
+ * float elements.
+ *
+ * Symmetric rank 2 operation:
+ *   - \f$ A \leftarrow \alpha x y^T + \alpha y x^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of \f$ x y^T \f$ and \f$ y x^T \f$.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[out] A 	    Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of first element of matrix \b A in buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if any of the \b A, \b X, or \b Y objects
+ *     is invalid, or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYR2
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsyr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_float alpha,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasSsyr2() and return its status. */
+    return clblasSsyr2(
+        order, uplo, N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        A, offa, lda,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @brief Symmetric rank 2 operation with a general triangular matrix and
+ * double elements.
+ *
+ * Symmetric rank 2 operation:
+ *   - \f$ A \leftarrow \alpha x y^T + \alpha y x^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of \f$ x y^T \f$ and \f$ y x^T \f$.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[out] A 	    Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of first element of matrix \b A in buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if any of the \b A, \b X, or \b Y objects
+ *     is invalid, or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYR2
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsyr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_double alpha,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasDsyr2() and return its status. */
+    return clblasDsyr2(
+        order, uplo, N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        A, offa, lda,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*@}*/
+
+/**
+ * @defgroup HER2 HER2  - Hermitian rank 2 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * hermitian rank 2 update operations.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Hermitian rank 2 operation with a general triangular matrix and
+ * float-complex elements.
+ *
+ * Hermitian rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + \overline{ \alpha } Y X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of \f$ X Y^H \f$; its conjugate scales \f$ Y X^H \f$.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[out] A		Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if any of the \b A, \b X, or \b Y objects
+ *     is invalid, or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HER2
+ */
+__inline clAmdBlasStatus
+clAmdBlasCher2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_float2 alpha,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasCher2() and return its status. */
+    return clblasCher2(
+        order, uplo, N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        A, offa, lda,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+
+/**
+ * @brief Hermitian rank 2 operation with a general triangular matrix and
+ * double-complex elements.
+ *
+ * Hermitian rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + \overline{ \alpha } Y X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of \f$ X Y^H \f$; its conjugate scales \f$ Y X^H \f$.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[out] A		Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasCher2() function otherwise.
+ *
+ * @ingroup HER2
+ */
+__inline clAmdBlasStatus
+clAmdBlasZher2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_double2 alpha,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasZher2() and return its status. */
+    return clblasZher2(
+        order, uplo, N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        A, offa, lda,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @example example_zher2.c
+ * Example of how to use the @ref clAmdBlasZher2 function.
+ */
+
+/*@}*/
+
+/**
+ * @defgroup TPMV TPMV  - Triangular packed matrix-vector multiply
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a packed triangular matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b AP is to be transposed.
+ * @param[in] diag				Specify whether matrix \b AP is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b AP.
+ * @param[in] AP				Buffer object storing matrix \b AP in packed format.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b AP.
+ * @param[out] X				Buffer object storing vector \b X.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero;
+ *   - \b clAmdBlasInvalidMemObject if either the \b AP or \b X object is
+ *     invalid, or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasStpmv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose trans, clblasDiag diag,
+    size_t N,
+    const cl_mem AP, size_t offa,
+    cl_mem X, size_t offx, int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasStpmv() and return its status. */
+    return clblasStpmv(
+        order, uplo,
+        trans, diag,
+        N,
+        AP, offa,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @example example_stpmv.c
+ * Example of how to use the @ref clAmdBlasStpmv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a packed triangular matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b AP is to be transposed.
+ * @param[in] diag				Specify whether matrix \b AP is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b AP.
+ * @param[in] AP				Buffer object storing matrix \b AP in packed format.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b AP.
+ * @param[out] X				Buffer object storing vector \b X.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasStpmv() function otherwise.
+ *
+ * @ingroup TPMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasDtpmv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose trans, clblasDiag diag,
+    size_t N,
+    const cl_mem AP, size_t offa,
+    cl_mem X, size_t offx, int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasDtpmv() and return its status. */
+    return clblasDtpmv(
+        order, uplo,
+        trans, diag,
+        N,
+        AP, offa,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @brief Matrix-vector product with a packed triangular matrix and
+ * float-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b AP is to be transposed.
+ * @param[in] diag				Specify whether matrix \b AP is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b AP.
+ * @param[in] AP				Buffer object storing matrix \b AP in packed format.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b AP.
+ * @param[out] X				Buffer object storing vector \b X.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasStpmv() function.
+ * @ingroup TPMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasCtpmv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose trans, clblasDiag diag,
+    size_t N,
+    const cl_mem AP, size_t offa,
+    cl_mem X, size_t offx, int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* clAmdBlas-named wrapper: hand every argument, unchanged and in the
+     * same order, to clblasCtpmv() and return its status. */
+    return clblasCtpmv(
+        order, uplo,
+        trans, diag,
+        N,
+        AP, offa,
+        X, offx, incx,
+        scratchBuff,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/**
+ * @brief Matrix-vector product with a packed triangular matrix and
+ * double-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b AP is to be transposed.
+ * @param[in] diag				Specify whether matrix \b AP is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b AP.
+ * @param[in] AP				Buffer object storing matrix \b AP in packed format.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b AP.
+ * @param[out] X				Buffer object storing vector \b X.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasDtpmv() function.
+ * @ingroup TPMV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasZtpmv() */
+clAmdBlasZtpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff, /* not touched by the wrapper; handed straight to clblasZtpmv() */
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasZtpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+/*@}*/
+
+
+
+/**
+ * @defgroup TPSV TPSV  - Triangular packed matrix vector solve 
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief solving triangular packed matrix problems with float elements.
+ *
+ * Triangular solves (the right-hand side held in \b X is overwritten by the solution):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A in packed format.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[out] X                Buffer object storing vector \b X.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPSV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasStpsv() */
+clAmdBlasStpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasStpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+
+/**
+ * @example example_stpsv.c
+ * Example of how to use the @ref clAmdBlasStpsv function.
+ */
+
+/**
+ * @brief solving triangular packed matrix problems with double elements.
+ *
+ * Triangular solves (the right-hand side held in \b X is overwritten by the solution):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A in packed format.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[out] X                Buffer object storing vector \b X.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPSV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasDtpsv() */
+clAmdBlasDtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasDtpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+
+/**
+ * @brief solving triangular packed matrix problems with float complex elements.
+ *
+ * Triangular solves (the right-hand side held in \b X is overwritten by the solution):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A in packed format.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[out] X                Buffer object storing vector \b X.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPSV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasCtpsv() */
+clAmdBlasCtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasCtpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+
+/**
+ * @brief solving triangular packed matrix problems with double complex elements.
+ *
+ * Triangular solves (the right-hand side held in \b X is overwritten by the solution):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A in packed format.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[out] X                Buffer object storing vector \b X.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPSV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasZtpsv() */
+clAmdBlasZtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasZtpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+/*@}*/
+
+
+/**
+ * @defgroup SPMV SPMV  - Symmetric packed matrix vector multiply
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a symmetric packed-matrix and float elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/columns order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b AP.
+ * @param[in] alpha     The factor of matrix \b AP.
+ * @param[in] AP        Buffer object storing matrix \b AP.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b AP.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[out] Y        Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the matrix sizes or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b AP, \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SPMV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasSspmv() */
+clAmdBlasSspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasSspmv( order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+
+/**
+ * @example example_sspmv.c
+ * This is an example of how to use the @ref clAmdBlasSspmv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a symmetric packed-matrix and double elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/columns order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b AP.
+ * @param[in] alpha     The factor of matrix \b AP.
+ * @param[in] AP        Buffer object storing matrix \b AP.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b AP.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[out] Y        Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSspmv() function otherwise.
+ *
+ * @ingroup SPMV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasDspmv() */
+clAmdBlasDspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasDspmv( order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+/*@}*/
+
+
+
+/**
+ * @defgroup HPMV HPMV  - Hermitian packed matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a packed hermitian matrix and float-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/columns order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b AP.
+ * @param[in] alpha     The factor of matrix \b AP.
+ * @param[in] AP        Buffer object storing packed matrix \b AP.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b AP.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[out] Y        Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the matrix sizes or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b AP, \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HPMV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasChpmv() */
+clAmdBlasChpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float2 alpha, /* two-component (complex) scalar, passed by value */
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float2 beta, /* two-component (complex) scalar, passed by value */
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasChpmv( order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+
+/**
+ * @example example_chpmv.c
+ * This is an example of how to use the @ref clAmdBlasChpmv function.
+ */
+
+
+/**
+ * @brief Matrix-vector product with a packed hermitian matrix and double-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/columns order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b AP.
+ * @param[in] alpha     The factor of matrix \b AP.
+ * @param[in] AP        Buffer object storing packed matrix \b AP.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b AP.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[out] Y        Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasChpmv() function otherwise.
+ *
+ * @ingroup HPMV
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasZhpmv() */
+clAmdBlasZhpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double2 alpha, /* two-component (complex) scalar, passed by value */
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double2 beta, /* two-component (complex) scalar, passed by value */
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasZhpmv( order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+/*@}*/
+
+
+/**
+ * @defgroup SPR SPR   - Symmetric packed matrix rank 1 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * symmetric rank 1 update operations on packed matrix
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Symmetric rank 1 operation with a general triangular packed-matrix and
+ * float elements.
+ *
+ * Symmetric rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] AP 	    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset of first element of matrix \b AP in buffer object.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero;
+ *   - \b clAmdBlasInvalidMemObject if either the \b AP or \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SPR
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasSspr() */
+clAmdBlasSspr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasSspr( order, uplo, N, alpha, X, offx, incx, AP, offa,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+	
+/**
+ * @example example_sspr.c
+ * Example of how to use the @ref clAmdBlasSspr function.
+ */
+
+/**
+ * @brief Symmetric rank 1 operation with a general triangular packed-matrix and
+ * double elements.
+ *
+ * Symmetric rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] AP 	    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset of first element of matrix \b AP in buffer object.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSspr() function otherwise.
+ *
+ * @ingroup SPR
+ */
+__inline clAmdBlasStatus /* clAmdBlas-named wrapper around clblasDspr() */
+clAmdBlasDspr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{ /* Pure pass-through: all arguments are forwarded unchanged, in declaration order. */
+	return clblasDspr( order, uplo, N, alpha, X, offx, incx, AP, offa,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); /* callee's status is returned as-is */
+}
+
+	/*@}*/
+
+/**
+ * @defgroup HPR HPR   - Hermitian packed matrix rank 1 update
+ *
+ * The Level 2 Basic Linear Algebra Subprogram functions that perform
+ * hermitian rank 1 operations on packed matrix
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Hermitian rank 1 operation with a general triangular packed-matrix and
+ * float-complex elements.
+ *
+ * Hermitian rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The scalar factor \f$ \alpha \f$ of the rank 1 update
+ *                      (a scalar float value).
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] AP    Buffer object storing matrix \b AP.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b AP.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero
+ *   - \b clAmdBlasInvalidMemObject if either the \b AP or \b X object is
+ *     invalid, or an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasChpr() and returns that call's result.
+ *
+ * @ingroup HPR
+ */
+__inline clAmdBlasStatus
+clAmdBlasChpr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int  incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+	return clblasChpr( order, uplo, N, alpha, X, offx, incx, AP, offa,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_chpr.c
+ * Example of how to use the @ref clAmdBlasChpr function.
+ */
+
+/**
+ * @brief Hermitian rank 1 operation with a general triangular packed-matrix and
+ * double-complex elements.
+ *
+ * Hermitian rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The scalar factor \f$ \alpha \f$ of the rank 1 update
+ *                      (a scalar double value).
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] AP    Buffer object storing matrix \b AP.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b AP.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasChpr() function otherwise.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasZhpr() and returns that call's result.
+ *
+ * @ingroup HPR
+ */
+__inline clAmdBlasStatus
+clAmdBlasZhpr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+	return clblasZhpr( order, uplo, N, alpha, X, offx, incx, AP, offa,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+	/*@}*/
+
+/**
+ * @defgroup SPR2 SPR2  - Symmetric packed matrix rank 2 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * symmetric rank 2 update operations on packed matrices
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Symmetric rank 2 operation with a general triangular packed-matrix and
+ * float elements.
+ *
+ * Symmetric rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^T + \alpha Y X^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The scalar factor \f$ \alpha \f$ of the rank 2 update.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset of first element of matrix \b AP in buffer object.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero
+ *   - \b clAmdBlasInvalidMemObject if either \b AP, \b X, or \b Y object is
+ *     invalid, or an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasSspr2() and returns that call's result.
+ *
+ * @ingroup SPR2
+ */
+__inline clAmdBlasStatus
+clAmdBlasSspr2(
+	clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+	return clblasSspr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_sspr2.c
+ * Example of how to use the @ref clAmdBlasSspr2 function.
+ */
+
+/**
+ * @brief Symmetric rank 2 operation with a general triangular packed-matrix and
+ * double elements.
+ *
+ * Symmetric rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^T + \alpha Y X^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The scalar factor \f$ \alpha \f$ of the rank 2 update.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset of first element of matrix \b AP in buffer object.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSspr2() function otherwise.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasDspr2() and returns that call's result.
+ *
+ * @ingroup SPR2
+ */
+__inline clAmdBlasStatus
+clAmdBlasDspr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+	cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+	return clblasDspr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+/**
+ * @defgroup HPR2 HPR2  - Hermitian packed matrix rank 2 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * hermitian rank 2 update operations on packed matrices
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Hermitian rank 2 operation with a general triangular packed-matrix and
+ * float-complex elements.
+ *
+ * Hermitian rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + \overline{\alpha} Y X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The scalar factor \f$ \alpha \f$ of the rank 2 update.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b AP.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero
+ *   - \b clAmdBlasInvalidMemObject if either \b AP, \b X, or \b Y object is
+ *     invalid, or an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasChpr2() and returns that call's result.
+ *
+ * @ingroup HPR2
+ */
+__inline clAmdBlasStatus
+clAmdBlasChpr2(
+	clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+	return clblasChpr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Hermitian rank 2 operation with a general triangular packed-matrix and
+ * double-complex elements.
+ *
+ * Hermitian rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + \overline{\alpha} Y X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The scalar factor \f$ \alpha \f$ of the rank 2 update.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b AP.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasChpr2() function otherwise.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasZhpr2() and returns that call's result.
+ *
+ * @ingroup HPR2
+ */
+__inline clAmdBlasStatus
+clAmdBlasZhpr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+	cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+	return clblasZhpr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_zhpr2.c
+ * Example of how to use the @ref clAmdBlasZhpr2 function.
+ */
+/*@}*/
+
+
+
+/**
+ * @defgroup GBMV GBMV  - General banded matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a general rectangular banded matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *   - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in banded matrix \b A.
+ * @param[in] N         Number of columns in banded matrix \b A.
+ * @param[in] KL        Number of sub-diagonals in banded matrix \b A.
+ * @param[in] KU        Number of super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of banded matrix \b A.
+ * @param[in] A         Buffer object storing banded matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in banded matrix \b A.
+ * @param[in] lda       Leading dimension of banded matrix \b A. It cannot be less
+ *                      than ( \b KL + \b KU + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] beta      The factor of the vector \b Y.
+ * @param[in,out] Y     Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b M or \b N is zero, or
+ *     - \b KL is greater than \b M - 1, or
+ *     - \b KU is greater than \b N - 1, or
+ *     - either \b incx or \b incy is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix size or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b A, \b X, or \b Y object is
+ *     invalid, or an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasSgbmv() and returns that call's result.
+ *
+ * @ingroup GBMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasSgbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasSgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_sgbmv.c
+ * Example of how to use the @ref clAmdBlasSgbmv function.
+ */
+
+
+/**
+ * @brief Matrix-vector product with a general rectangular banded matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *   - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in banded matrix \b A.
+ * @param[in] N         Number of columns in banded matrix \b A.
+ * @param[in] KL        Number of sub-diagonals in banded matrix \b A.
+ * @param[in] KU        Number of super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of banded matrix \b A.
+ * @param[in] A         Buffer object storing banded matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in banded matrix \b A.
+ * @param[in] lda       Leading dimension of banded matrix \b A. It cannot be less
+ *                      than ( \b KL + \b KU + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] beta      The factor of the vector \b Y.
+ * @param[in,out] Y     Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSgbmv() function otherwise.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasDgbmv() and returns that call's result.
+ *
+ * @ingroup GBMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasDgbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasDgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+
+/**
+ * @brief Matrix-vector product with a general rectangular banded matrix and
+ * float-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *   - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in banded matrix \b A.
+ * @param[in] N         Number of columns in banded matrix \b A.
+ * @param[in] KL        Number of sub-diagonals in banded matrix \b A.
+ * @param[in] KU        Number of super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of banded matrix \b A.
+ * @param[in] A         Buffer object storing banded matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in banded matrix \b A.
+ * @param[in] lda       Leading dimension of banded matrix \b A. It cannot be less
+ *                      than ( \b KL + \b KU + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] beta      The factor of the vector \b Y.
+ * @param[in,out] Y     Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasSgbmv() function.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasCgbmv() and returns that call's result.
+ *
+ * @ingroup GBMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasCgbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasCgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+
+/**
+ * @brief Matrix-vector product with a general rectangular banded matrix and
+ * double-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *   - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in banded matrix \b A.
+ * @param[in] N         Number of columns in banded matrix \b A.
+ * @param[in] KL        Number of sub-diagonals in banded matrix \b A.
+ * @param[in] KU        Number of super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of banded matrix \b A.
+ * @param[in] A         Buffer object storing banded matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in banded matrix \b A.
+ * @param[in] lda       Leading dimension of banded matrix \b A. It cannot be less
+ *                      than ( \b KL + \b KU + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] beta      The factor of the vector \b Y.
+ * @param[in,out] Y     Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasDgbmv() function.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasZgbmv() and returns that call's result.
+ *
+ * @ingroup GBMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasZgbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasZgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+/*@}*/
+
+
+/**
+ * @defgroup TBMV TBMV  - Triangular banded matrix vector multiply
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a triangular banded matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order             Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in banded matrix \b A.
+ * @param[in] K                 Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda               Leading dimension of matrix \b A. It cannot be less
+ *                              than ( \b K + 1 )
+ * @param[in,out] X             Buffer object storing vector \b X.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff       Temporary cl_mem scratch buffer object which can hold a
+ *                              minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - \b K is greater than \b N - 1, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b X object is
+ *     invalid, or an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @note This function only forwards its arguments, unchanged and in the same
+ *       order, to clblasStbmv() and returns that call's result.
+ *
+ * @ingroup TBMV
+ */
+__inline clAmdBlasStatus
+clAmdBlasStbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	return clblasStbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+/**
+ * @example example_stbmv.c
+ * Example of how to use the @ref clAmdBlasStbmv function.
+ */
+
+
+/**
+ * @brief Matrix-vector product with a triangular banded matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in banded matrix \b A.
+ * @param[in] K					Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than ( \b K + 1 )
+ * @param[in,out] X			Buffer object storing vector \b X; it is read as input and overwritten with the result.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasStbmv() function otherwise.
+ *
+ * @ingroup TBMV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasDtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasDtbmv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasDtbmv(order, uplo, trans, diag, N, K, A, offa, lda,
+                       X, offx, incx, scratchBuff,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+
+
+/**
+ * @brief Matrix-vector product with a triangular banded matrix and
+ * float-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in banded matrix \b A.
+ * @param[in] K					Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than ( \b K + 1 )
+ * @param[in,out] X			Buffer object storing vector \b X; it is read as input and overwritten with the result.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasStbmv() function.
+ *
+ * @ingroup TBMV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasCtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasCtbmv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasCtbmv(order, uplo, trans, diag, N, K, A, offa, lda,
+                       X, offx, incx, scratchBuff,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+
+
+/**
+ * @brief Matrix-vector product with a triangular banded matrix and
+ * double-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in banded matrix \b A.
+ * @param[in] K					Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than ( \b K + 1 )
+ * @param[in,out] X			Buffer object storing vector \b X; it is read as input and overwritten with the result.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasDtbmv() function.
+ *
+ * @ingroup TBMV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasZtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasZtbmv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasZtbmv(order, uplo, trans, diag, N, K, A, offa, lda,
+                       X, offx, incx, scratchBuff,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+/*@}*/
+
+
+/**
+ * @defgroup SBMV SBMV  - Symmetric banded matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a symmetric banded matrix and float elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/columns order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in banded matrix \b A.
+ * @param[in] K			Number of sub-diagonals/super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A			Buffer object storing matrix \b A.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda		Leading dimension of matrix \b A. It cannot be less
+ *						than ( \b K + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N, \b incx or \b incy is zero, or
+ *     - \b K is greater than \b N - 1, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if the \b A, \b X or \b Y object is
+ *     invalid, or is an image object rather than a buffer;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SBMV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasSsbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasSsbmv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasSsbmv(order, uplo, N, K, alpha, A, offa, lda,
+                       X, offx, incx, beta, Y, offy, incy,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+/**
+ * @example example_ssbmv.c
+ * This is an example of how to use the @ref clAmdBlasSsbmv function.
+ */
+ 
+ 
+/**
+ * @brief Matrix-vector product with a symmetric banded matrix and double elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/columns order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in banded matrix \b A.
+ * @param[in] K			Number of sub-diagonals/super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A			Buffer object storing matrix \b A.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda		Leading dimension of matrix \b A. It cannot be less
+ *						than ( \b K + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSsbmv() function otherwise.
+ *
+ * @ingroup SBMV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasDsbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasDsbmv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasDsbmv(order, uplo, N, K, alpha, A, offa, lda,
+                       X, offx, incx, beta, Y, offy, incy,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup HBMV HBMV  - Hermitian banded matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a Hermitian banded matrix and float-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/columns order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in banded matrix \b A.
+ * @param[in] K			Number of sub-diagonals/super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A			Buffer object storing matrix \b A.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda		Leading dimension of matrix \b A. It cannot be less
+ *						than ( \b K + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b N, \b incx or \b incy is zero, or
+ *     - \b K is greater than \b N - 1, or
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if the \b A, \b X or \b Y object is
+ *     invalid, or is an image object rather than a buffer;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HBMV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasChbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasChbmv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasChbmv(order, uplo, N, K, alpha, A, offa, lda,
+                       X, offx, incx, beta, Y, offy, incy,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @example example_chbmv.c
+ * This is an example of how to use the @ref clAmdBlasChbmv function.
+ */
+ 
+ 
+/**
+ * @brief Matrix-vector product with a Hermitian banded matrix and double-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/columns order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in banded matrix \b A.
+ * @param[in] K			Number of sub-diagonals/super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A			Buffer object storing matrix \b A.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda		Leading dimension of matrix \b A. It cannot be less
+ *						than ( \b K + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasChbmv() function otherwise.
+ *
+ * @ingroup HBMV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasZhbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasZhbmv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasZhbmv(order, uplo, N, K, alpha, A, offa, lda,
+                       X, offx, incx, beta, Y, offy, incy,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup TBSV TBSV  - Solving triangular banded matrix
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Solves a triangular banded system of linear equations with float elements.
+ *
+ * Systems solved (the solution overwrites the right-hand side \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in banded matrix \b A.
+ * @param[in] K					Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than ( \b K + 1 )
+ * @param[in,out] X			Buffer object storing vector \b X; it holds the right-hand side on input and the solution on output.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - K is greater than \b N - 1
+ *     - the leading dimension is invalid;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TBSV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasStbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasStbsv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasStbsv(order, uplo, trans, diag, N, K, A, offa, lda,
+                       X, offx, incx,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+/**
+ * @example example_stbsv.c
+ * This is an example of how to use the @ref clAmdBlasStbsv function.
+ */
+ 
+ 
+/**
+ * @brief Solves a triangular banded system of linear equations with double elements.
+ *
+ * Systems solved (the solution overwrites the right-hand side \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in banded matrix \b A.
+ * @param[in] K					Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than ( \b K + 1 )
+ * @param[in,out] X			Buffer object storing vector \b X; it holds the right-hand side on input and the solution on output.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasStbsv() function otherwise.
+ *
+ * @ingroup TBSV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasDtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasDtbsv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasDtbsv(order, uplo, trans, diag, N, K, A, offa, lda,
+                       X, offx, incx,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+    
+/**
+ * @brief Solves a triangular banded system of linear equations with float-complex elements.
+ *
+ * Systems solved (the solution overwrites the right-hand side \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in banded matrix \b A.
+ * @param[in] K					Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than ( \b K + 1 )
+ * @param[in,out] X			Buffer object storing vector \b X; it holds the right-hand side on input and the solution on output.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasStbsv() function.
+ *
+ * @ingroup TBSV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasCtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasCtbsv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasCtbsv(order, uplo, trans, diag, N, K, A, offa, lda,
+                       X, offx, incx,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+    
+/**
+ * @brief Solves a triangular banded system of linear equations with double-complex elements.
+ *
+ * Systems solved (the solution overwrites the right-hand side \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order				Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b A is to be transposed.
+ * @param[in] diag				Specify whether matrix \b A is unit triangular.
+ * @param[in] N					Number of rows/columns in banded matrix \b A.
+ * @param[in] K					Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A					Buffer object storing matrix \b A.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda				Leading dimension of matrix \b A. It cannot be less
+ *								than ( \b K + 1 )
+ * @param[in,out] X			Buffer object storing vector \b X; it holds the right-hand side on input and the solution on output.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasDtbsv() function.
+ *
+ * @ingroup TBSV
+ */
+static __inline clAmdBlasStatus
+clAmdBlasZtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /*
+     * Pass-through: every argument is forwarded unchanged, in the same order,
+     * to clblasZtbsv() and its status is returned as-is.
+     *
+     * The function is static so each translation unit gets a private copy;
+     * a bare __inline definition has no out-of-line definition under C99
+     * inline rules and can leave callers with an unresolved symbol.
+     */
+    return clblasZtbsv(order, uplo, trans, diag, N, K, A, offa, lda,
+                       X, offx, incx,
+                       numCommandQueues, commandQueues,
+                       numEventsInWaitList, eventWaitList, events);
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup BLAS3 BLAS-3 functions
+ *
+ * The Level 3 Basic Linear Algebra Subprograms are functions that perform
+ * matrix-matrix operations.
+ */
+/*@{*/
+/*@}*/
+
+/**
+ * @defgroup GEMM GEMM - General matrix-matrix multiplication
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with float
+ * elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b K when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b K
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when
+ *                      it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasSgemmEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b M, \b N or \b K is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if A, B, or C object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image available;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasSgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    cl_float alpha,
+    const cl_mem A, size_t lda,
+    const cl_mem B, size_t ldb,
+    cl_float beta,
+    cl_mem C, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* No offset parameters here: A, B and C offsets are passed as 0. */
+    const clAmdBlasStatus status = clblasSgemm(
+        order, transA, transB, M, N, K,
+        alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/**
+ * @example example_sgemm.c
+ * This is an example of how to use the @ref clAmdBlasSgemm function.
+ */
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with double
+ * elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasDgemmEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSgemm() function otherwise.
+ *
+ * @ingroup GEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    cl_double alpha,
+    const cl_mem A, size_t lda,
+    const cl_mem B, size_t ldb,
+    cl_double beta,
+    cl_mem C, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Double-precision scalars; A, B and C offsets are passed as 0. */
+    const clAmdBlasStatus status = clblasDgemm(
+        order, transA, transB, M, N, K,
+        alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with float
+ * complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset, in elements, of matrix \b A in its buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset, in elements, of matrix \b B in its buffer object.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offC      Offset, in elements, of matrix \b C in its buffer object.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasCgemmEx() instead.
+ *
+ * @return The same result as the clAmdBlasSgemm() function.
+ *
+ * @ingroup GEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasCgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    FloatComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Forward every argument, including the three buffer offsets. */
+    const clAmdBlasStatus status = clblasCgemm(
+        order, transA, transB, M, N, K,
+        alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with double
+ * complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasZgemmEx() instead.
+ *
+ * @return The same result as the clAmdBlasDgemm() function.
+ *
+ * @ingroup GEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasZgemm(
+    clAmdBlasOrder order,
+    clAmdBlasTranspose transA,
+    clAmdBlasTranspose transB,
+    size_t M, size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t lda,
+    const cl_mem B, size_t ldb,
+    DoubleComplex beta,
+    cl_mem C, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* No offset parameters here: A, B and C offsets are passed as 0. */
+    const clAmdBlasStatus status = clblasZgemm(
+        order, transA, transB, M, N, K,
+        alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with float
+ *        elements. Extended version.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b K when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clAmdBlasColumnMajor.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b K
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in]  offC     Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when
+ *                      it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as clAmdBlasSgemm() otherwise.
+ *
+ * @ingroup GEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasSgemmEx(
+    clAmdBlasOrder order,
+    clAmdBlasTranspose transA,
+    clAmdBlasTranspose transB,
+    size_t M, size_t N, size_t K,
+    cl_float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    cl_float beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Extended variant: the caller-supplied offsets are passed through
+     * to clblasSgemm() together with all other arguments. */
+    const clAmdBlasStatus status = clblasSgemm(
+        order, transA, transB, M, N, K,
+        alpha, A, offA, lda,
+        B, offB, ldb,
+        beta, C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/**
+ * @example example_sgemm.c
+ * This is an example of how to use the @ref clAmdBlasSgemmEx function.
+ */
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with double
+ *        elements. Extended version.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offC      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clAmdBlasSgemm() function otherwise.
+ *
+ * @ingroup GEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDgemmEx(
+    clAmdBlasOrder order,
+    clAmdBlasTranspose transA,
+    clAmdBlasTranspose transB,
+    size_t M, size_t N, size_t K,
+    cl_double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    cl_double beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Extended variant: the caller-supplied offsets are passed through
+     * to clblasDgemm() together with all other arguments. */
+    const clAmdBlasStatus status = clblasDgemm(
+        order, transA, transB, M, N, K,
+        alpha, A, offA, lda,
+        B, offB, ldb,
+        beta, C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with float
+ *        complex elements. Extended version.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offC      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clAmdBlasSgemm() function otherwise.
+ *
+ * @ingroup GEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasCgemmEx(
+    clAmdBlasOrder order,
+    clAmdBlasTranspose transA,
+    clAmdBlasTranspose transB,
+    size_t M, size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    FloatComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Extended variant: the caller-supplied offsets are passed through
+     * to clblasCgemm() together with all other arguments. */
+    const clAmdBlasStatus status = clblasCgemm(
+        order, transA, transB, M, N, K,
+        alpha, A, offA, lda,
+        B, offB, ldb,
+        beta, C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with double
+ *        complex elements. Extended version.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offC      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clAmdBlasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clAmdBlasSgemm() function otherwise.
+ *
+ * @ingroup GEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasZgemmEx(
+    clAmdBlasOrder order,
+    clAmdBlasTranspose transA,
+    clAmdBlasTranspose transB,
+    size_t M, size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    DoubleComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Extended variant: the caller-supplied offsets are passed through
+     * to clblasZgemm() together with all other arguments. */
+    const clAmdBlasStatus status = clblasZgemm(
+        order, transA, transB, M, N, K,
+        alpha, A, offA, lda,
+        B, offB, ldb,
+        beta, C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+
+/*@}*/
+
+/**
+ * @defgroup TRMM TRMM - Triangular matrix-matrix multiplication
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with float elements.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N when it is set
+ *                      to \b clAmdBlasRight.
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasStrmmEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if A or B object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image available;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TRMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasStrmm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M, size_t N,
+    cl_float alpha,
+    const cl_mem A, size_t lda,
+    cl_mem B, size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* The 0 after A and after B fills what is presumably the offset slot. */
+    const clAmdBlasStatus status = clblasStrmm(
+        order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb,
+        numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+
+/**
+ * @example example_strmm.c
+ * This is an example of how to use the @ref clAmdBlasStrmm function.
+ */
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with double elements.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasDtrmmEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasStrmm() function otherwise.
+ *
+ * @ingroup TRMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDtrmm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_mem B,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/*
+	 * This legacy signature carries no buffer offsets, so zero is supplied
+	 * in the slots that clAmdBlasDtrmmEx() fills with offA and offB.
+	 */
+	const size_t offA = 0;
+	const size_t offB = 0;
+
+	return clblasDtrmm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with float complex
+ * elements.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasCtrmmEx() instead.
+ *
+ * @return The same result as the clAmdBlasStrmm() function.
+ *
+ * @ingroup TRMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasCtrmm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_mem B,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/*
+	 * This legacy signature carries no buffer offsets, so zero is supplied
+	 * in the slots that clAmdBlasCtrmmEx() fills with offA and offB.
+	 */
+	const size_t offA = 0;
+	const size_t offB = 0;
+
+	return clblasCtrmm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with double complex
+ * elements.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasZtrmmEx() instead.
+ *
+ * @return The same result as the clAmdBlasDtrmm() function.
+ *
+ * @ingroup TRMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasZtrmm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_mem B,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/*
+	 * This legacy signature carries no buffer offsets, so zero is supplied
+	 * in the slots that clAmdBlasZtrmmEx() fills with offA and offB.
+	 */
+	const size_t offA = 0;
+	const size_t offB = 0;
+
+	return clblasZtrmm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with float elements.
+ *        Extended version.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N when it is set
+ *                      to \b clAmdBlasRight.
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as clAmdBlasStrmm() otherwise.
+ *
+ * @ingroup TRMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasStrmmEx(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure forwarder: every argument reaches clblasStrmm() unchanged and in order. */
+	const clAmdBlasStatus status = clblasStrmm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @example example_strmm.c
+ * This is an example of how to use the @ref clAmdBlasStrmmEx function.
+ */
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with double elements.
+ *        Extended version.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clAmdBlasStrmm() function otherwise.
+ *
+ * @ingroup TRMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDtrmmEx(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure forwarder: every argument reaches clblasDtrmm() unchanged and in order. */
+	const clAmdBlasStatus status = clblasDtrmm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with float complex
+ *        elements. Extended version.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as clAmdBlasStrmm() otherwise.
+ *
+ * @ingroup TRMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasCtrmmEx(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure forwarder: every argument reaches clblasCtrmm() unchanged and in order. */
+	const clAmdBlasStatus status = clblasCtrmm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with double complex
+ *        elements. Extended version.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clAmdBlasStrmm() function otherwise.
+ *
+ * @ingroup TRMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasZtrmmEx(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure forwarder: every argument reaches clblasZtrmm() unchanged and in order. */
+	const clAmdBlasStatus status = clblasZtrmm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/*@}*/
+
+/**
+ * @defgroup TRSM TRSM - Solving triangular systems of equations
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ * sides and float elements.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N
+ *                      when it is set to \b clAmdBlasRight.
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasStrsmEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - any of the leading dimensions is invalid, or
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if the \b A or \b B object is invalid,
+ *     or is an image object rather than a buffer object;
+ *   - \b clAmdBlasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image available;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TRSM
+ */
+__inline clAmdBlasStatus
+clAmdBlasStrsm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_mem B,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/*
+	 * This legacy signature carries no buffer offsets; zero is passed in the
+	 * two slots that follow A and B in clblasStrsm().
+	 * NOTE(review): presumably these are the offA/offB element offsets
+	 * documented for the extended TRSM variants - confirm against clBLAS.h.
+	 */
+	const size_t offA = 0;
+	const size_t offB = 0;
+
+	return clblasStrsm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+}
+
+
+/**
+ * @example example_strsm.c
+ * This is an example of how to use the @ref clAmdBlasStrsm function.
+ */
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ * sides and double elements.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasDtrsmEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasStrsm() function otherwise.
+ *
+ * @ingroup TRSM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDtrsm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_mem B,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/*
+	 * This legacy signature carries no buffer offsets; zero is passed in the
+	 * two slots that follow A and B in clblasDtrsm().
+	 * NOTE(review): presumably these are the offA/offB element offsets
+	 * documented for the extended TRSM variants - confirm against clBLAS.h.
+	 */
+	const size_t offA = 0;
+	const size_t offB = 0;
+
+	return clblasDtrsm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+}
+
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ * sides and float complex elements.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasCtrsmEx() instead.
+ *
+ * @return The same result as the clAmdBlasStrsm() function.
+ *
+ * @ingroup TRSM
+ */
+__inline clAmdBlasStatus
+clAmdBlasCtrsm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_mem B,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/*
+	 * This legacy signature carries no buffer offsets; zero is passed in the
+	 * two slots that follow A and B in clblasCtrsm().
+	 * NOTE(review): presumably these are the offA/offB element offsets
+	 * documented for the extended TRSM variants - confirm against clBLAS.h.
+	 */
+	const size_t offA = 0;
+	const size_t offB = 0;
+
+	return clblasCtrsm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ * sides and double complex elements.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasZtrsmEx() instead.
+ *
+ * @return The same result as the clAmdBlasDtrsm() function.
+ *
+ * @ingroup TRSM
+ */
+__inline clAmdBlasStatus
+clAmdBlasZtrsm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_mem B,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/*
+	 * This legacy signature carries no buffer offsets; zero is passed in the
+	 * two slots that follow A and B in clblasZtrsm().
+	 * NOTE(review): presumably these are the offA/offB element offsets
+	 * documented for the extended TRSM variants - confirm against clBLAS.h.
+	 */
+	const size_t offA = 0;
+	const size_t offB = 0;
+
+	return clblasZtrsm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ *        sides and float elements. Extended version.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N
+ *                      when it is set to \b clAmdBlasRight.
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as clAmdBlasStrsm() otherwise.
+ *
+ * @ingroup TRSM
+ */
+__inline clAmdBlasStatus
+clAmdBlasStrsmEx(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure pass-through: every argument, offA and offB included, is handed
+	 * to clblasStrsm() unchanged and in the same order. */
+	const clAmdBlasStatus status = clblasStrsm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+
+/**
+ * @example example_strsm.c
+ * This is an example of how to use the @ref clAmdBlasStrsmEx function.
+ */
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ *        sides and double elements. Extended version.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clAmdBlasStrsm() function otherwise.
+ *
+ * @ingroup TRSM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDtrsmEx(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure pass-through: every argument, offA and offB included, is handed
+	 * to clblasDtrsm() unchanged and in the same order. */
+	const clAmdBlasStatus status = clblasDtrsm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ *        sides and float complex elements. Extended version.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as clAmdBlasStrsm() otherwise.
+ *
+ * @ingroup TRSM
+ */
+__inline clAmdBlasStatus
+clAmdBlasCtrsmEx(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure pass-through: every argument, offA and offB included, is handed
+	 * to clblasCtrsm() unchanged and in the same order. */
+	const clAmdBlasStatus status = clblasCtrsm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ *        sides and double complex elements. Extended version.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clAmdBlasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clAmdBlasStrsm() function otherwise.
+ *
+ * @ingroup TRSM
+ */
+__inline clAmdBlasStatus
+clAmdBlasZtrsmEx(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    clAmdBlasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure pass-through: every argument, offA and offB included, is handed
+	 * to clblasZtrsm() unchanged and in the same order. */
+	const clAmdBlasStatus status = clblasZtrsm(
+			order, side, uplo, transA, diag,
+			M, N, alpha,
+			A, offA, lda,
+			B, offB, ldb,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/*@}*/
+
+/**
+ * @defgroup SYRK SYRK - Symmetric rank-k update of a matrix
+ * @ingroup BLAS3
+ */
+
+/*@{*/
+
+/**
+ * @brief Rank-k update of a symmetric matrix with float elements.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasSsyrkEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b K is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b C object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released.
+ *
+ * @ingroup SYRK
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsyrk(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_float beta,
+    cl_mem C,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* This signature has no offset parameters: zero is passed in the two
+	 * offset positions (after A and after C) of clblasSsyrk(). */
+	const clAmdBlasStatus status = clblasSsyrk(
+			order, uplo, transA,
+			N, K, alpha,
+			A, 0, lda,
+			beta,
+			C, 0, ldc,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @example example_ssyrk.c
+ * This is an example of how to use the @ref clAmdBlasSsyrk function.
+ */
+
+/**
+ * @brief Rank-k update of a symmetric matrix with double elements.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasDsyrkEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsyrk(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t lda,
+    cl_double beta,
+    cl_mem C,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* This signature has no offset parameters: zero is passed in the two
+	 * offset positions (after A and after C) of clblasDsyrk(). */
+	const clAmdBlasStatus status = clblasDsyrk(
+			order, uplo, transA,
+			N, K, alpha,
+			A, 0, lda,
+			beta,
+			C, 0, ldc,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @brief Rank-k update of a symmetric matrix with complex float elements.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasCsyrkEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if \b transA is set to \ref clAmdBlasConjTrans;
+ *   - the same error codes as the clAmdBlasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+__inline clAmdBlasStatus
+clAmdBlasCsyrk(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    FloatComplex beta,
+    cl_mem C,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* This signature has no offset parameters: zero is passed in the two
+	 * offset positions (after A and after C) of clblasCsyrk(). */
+	const clAmdBlasStatus status = clblasCsyrk(
+			order, uplo, transA,
+			N, K, alpha,
+			A, 0, lda,
+			beta,
+			C, 0, ldc,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @brief Rank-k update of a symmetric matrix with complex double elements.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for use in new
+ * applications. Use the superseding function clAmdBlasZsyrkEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if \b transA is set to \ref clAmdBlasConjTrans;
+ *   - the same error codes as the clAmdBlasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+__inline clAmdBlasStatus
+clAmdBlasZsyrk(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* This signature has no offset parameters: zero is passed in the two
+	 * offset positions (after A and after C) of clblasZsyrk(). */
+	const clAmdBlasStatus status = clblasZsyrk(
+			order, uplo, transA,
+			N, K, alpha,
+			A, 0, lda,
+			beta,
+			C, 0, ldc,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @brief Rank-k update of a symmetric matrix with float elements.
+ *        Extended version.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offC exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clAmdBlasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsyrkEx(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_float beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure pass-through: every argument, offA and offC included, is handed
+	 * to clblasSsyrk() unchanged and in the same order. */
+	const clAmdBlasStatus status = clblasSsyrk(
+			order, uplo, transA,
+			N, K, alpha,
+			A, offA, lda,
+			beta,
+			C, offC, ldc,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @example example_ssyrk.c
+ * This is an example of how to use the @ref clAmdBlasSsyrkEx function.
+ */
+
+/**
+ * @brief Rank-k update of a symmetric matrix with double elements.
+ *        Extended version.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offC exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clAmdBlasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsyrkEx(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_double beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	/* Pure pass-through: every argument, offA and offC included, is handed
+	 * to clblasDsyrk() unchanged and in the same order. */
+	const clAmdBlasStatus status = clblasDsyrk(
+			order, uplo, transA,
+			N, K, alpha,
+			A, offA, lda,
+			beta,
+			C, offC, ldc,
+			numCommandQueues, commandQueues,
+			numEventsInWaitList, eventWaitList, events );
+
+	return status;
+}
+
+/**
+ * @brief Rank-k update of a symmetric matrix with complex float elements.
+ *        Extended version.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offC exceeds the size
+ *        of the respective buffer object;
+ *   - \b clAmdBlasInvalidValue if \b transA is set to \ref clAmdBlasConjTrans.
+ *   - the same error codes as the clAmdBlasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+__inline clAmdBlasStatus
+clAmdBlasCsyrkEx(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    FloatComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument is handed to clblasCsyrk() unchanged and
+     * in the same order, and the status it reports is returned as-is. */
+    return clblasCsyrk(
+        order, uplo, transA, N, K,
+        alpha, A, offA, lda,
+        beta, C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Rank-k update of a symmetric matrix with complex double elements.
+ *        Extended version.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *         point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA or \b offC exceeds the size
+ *        of the respective buffer object;
+ *   - \b clAmdBlasInvalidValue if \b transA is set to \ref clAmdBlasConjTrans.
+ *   - the same error codes as the clAmdBlasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+__inline clAmdBlasStatus
+clAmdBlasZsyrkEx(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    DoubleComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument is handed to clblasZsyrk() unchanged and
+     * in the same order, and the status it reports is returned as-is. */
+    return clblasZsyrk(
+        order, uplo, transA, N, K,
+        alpha, A, offA, lda,
+        beta, C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/*@}*/
+
+/**
+ * @defgroup SYR2K SYR2K - Symmetric rank-2k update to a matrix
+ * @ingroup BLAS3
+ */
+
+/*@{*/
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with float elements.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be less
+ *                       than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] ldb        Leading dimension of matrix \b B. It cannot be less
+ *                       than \b K if \b B matches to the op(\b B) matrix
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasSsyr2kEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b K is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b A, \b B or \b C object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYR2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsyr2k(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB,
+    size_t N, size_t K,
+    cl_float alpha,
+    const cl_mem A, size_t lda,
+    const cl_mem B, size_t ldb,
+    cl_float beta,
+    cl_mem C, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Offset-free variant: forwards to clblasSsyr2k() with 0 in each of the
+     * three positions where clAmdBlasSsyr2kEx() passes offA, offB and offC;
+     * every other argument goes through unchanged. */
+    return clblasSsyr2k(
+        order, uplo, transAB, N, K,
+        alpha,
+        A, 0, lda,
+        B, 0, ldb,
+        beta,
+        C, 0, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @example example_ssyr2k.c
+ * This is an example of how to use the @ref clAmdBlasSsyr2k function.
+ */
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with double elements.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasDsyr2kEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsyr2k(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB,
+    size_t N, size_t K,
+    cl_double alpha,
+    const cl_mem A, size_t lda,
+    const cl_mem B, size_t ldb,
+    cl_double beta,
+    cl_mem C, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Offset-free variant: forwards to clblasDsyr2k() with 0 in each of the
+     * three positions where clAmdBlasDsyr2kEx() passes offA, offB and offC;
+     * every other argument goes through unchanged. */
+    return clblasDsyr2k(
+        order, uplo, transAB, N, K,
+        alpha,
+        A, 0, lda,
+        B, 0, ldb,
+        beta,
+        C, 0, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with complex float elements.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasCsyr2kEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if \b transAB is set to \ref clAmdBlasConjTrans.
+ *   - the same error codes as the clAmdBlasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasCsyr2k(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t lda,
+    const cl_mem B, size_t ldb,
+    FloatComplex beta,
+    cl_mem C, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Offset-free variant: forwards to clblasCsyr2k() with 0 in each of the
+     * three positions where clAmdBlasCsyr2kEx() passes offA, offB and offC;
+     * every other argument goes through unchanged. */
+    return clblasCsyr2k(
+        order, uplo, transAB, N, K,
+        alpha,
+        A, 0, lda,
+        B, 0, ldb,
+        beta,
+        C, 0, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with complex double elements.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * The function is obsolete and is not recommended for using in new
+ * applications. Use the superseding function clAmdBlasZsyr2kEx() instead.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if \b transAB is set to \ref clAmdBlasConjTrans.
+ *   - the same error codes as the clAmdBlasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasZsyr2k(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t lda,
+    const cl_mem B, size_t ldb,
+    DoubleComplex beta,
+    cl_mem C, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Offset-free variant: forwards to clblasZsyr2k() with 0 in each of the
+     * three positions where clAmdBlasZsyr2kEx() passes offA, offB and offC;
+     * every other argument goes through unchanged. */
+    return clblasZsyr2k(
+        order, uplo, transAB, N, K,
+        alpha,
+        A, 0, lda,
+        B, 0, ldb,
+        beta,
+        C, 0, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with float elements.
+ *        Extended version.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be less
+ *                       than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] offB       Offset of the first element of the matrix \b B in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldb        Leading dimension of matrix \b B. It cannot be less
+ *                       than \b K if \b B matches to the op(\b B) matrix
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clAmdBlasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsyr2kEx(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB,
+    size_t N, size_t K,
+    cl_float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    cl_float beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument, including the three buffer offsets, is
+     * handed to clblasSsyr2k() unchanged and in the same order. */
+    return clblasSsyr2k(
+        order, uplo, transAB, N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @example example_ssyr2k.c
+ * This is an example of how to use the @ref clAmdBlasSsyr2kEx function.
+ */
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with double elements.
+ *        Extended version.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] offB       Offset of the first element of the matrix \b B in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clAmdBlasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsyr2kEx(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB,
+    size_t N, size_t K,
+    cl_double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    cl_double beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument, including the three buffer offsets, is
+     * handed to clblasDsyr2k() unchanged and in the same order. */
+    return clblasDsyr2k(
+        order, uplo, transAB, N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with complex float elements.
+ *        Extended version.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] offB       Offset of the first element of the matrix \b B in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - \b clAmdBlasInvalidValue if \b transAB is set to \ref clAmdBlasConjTrans.
+ *   - the same error codes as the clAmdBlasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasCsyr2kEx(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    FloatComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument, including the three buffer offsets, is
+     * handed to clblasCsyr2k() unchanged and in the same order. */
+    return clblasCsyr2k(
+        order, uplo, transAB, N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with complex double elements.
+ *        Extended version.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] offB       Offset of the first element of the matrix \b B in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clAmdBlasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - \b clAmdBlasInvalidValue if \b transAB is set to \ref clAmdBlasConjTrans.
+ *   - the same error codes as the clAmdBlasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasZsyr2kEx(
+    clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    DoubleComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Thin wrapper: every argument, including the three buffer offsets, is
+     * handed to clblasZsyr2k() unchanged and in the same order. */
+    return clblasZsyr2k(
+        order, uplo, transAB, N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup SYMM SYMM  - Symmetric matrix-matrix multiply
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-matrix product of symmetric rectangular matrices with float
+ * elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side		The side of the product on which the symmetric matrix \b A appears.
+ * @param[in] uplo		The triangle in matrix being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clAmdBlasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when
+ *                      it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events			  Event objects per each command queue that identify
+ *								  a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if A, B, or C object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image is available;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasSsymm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Thin wrapper: passes every argument through unchanged, in order, to clblasSsymm() and returns its status. */
+	return clblasSsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_ssymm.c
+ * This is an example of how to use the @ref clAmdBlasSsymm function.
+ */
+
+
+/**
+ * @brief Matrix-matrix product of symmetric rectangular matrices with double
+ * elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side		The side on which the symmetric matrix \b A appears in the product.
+ * @param[in] uplo		The triangle of matrix \b A being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clAmdBlasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when
+ *                      it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events			  Event objects per each command queue that identify
+ *								  a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasSsymm() function otherwise.
+ *
+ * @ingroup SYMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasDsymm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Thin wrapper: passes every argument through unchanged, in order, to clblasDsymm() and returns its status. */
+	return clblasDsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Matrix-matrix product of symmetric rectangular matrices with
+ * float-complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side		The side on which the symmetric matrix \b A appears in the product.
+ * @param[in] uplo		The triangle of matrix \b A being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clAmdBlasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when
+ *                      it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events			  Event objects per each command queue that identify
+ *								  a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasSsymm() function.
+ *
+ * @ingroup SYMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasCsymm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Thin wrapper: passes every argument through unchanged, in order, to clblasCsymm() and returns its status. */
+	return clblasCsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @brief Matrix-matrix product of symmetric rectangular matrices with
+ * double-complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side		The side on which the symmetric matrix \b A appears in the product.
+ * @param[in] uplo		The triangle of matrix \b A being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clAmdBlasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when
+ *                      it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events			  Event objects per each command queue that identify
+ *								  a particular kernel execution instance.
+ *
+ * @return The same result as the clAmdBlasDsymm() function.
+ *
+ * @ingroup SYMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasZsymm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Thin wrapper: passes every argument through unchanged, in order, to clblasZsymm() and returns its status. */
+	return clblasZsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup HEMM HEMM  - Hermitian matrix-matrix multiplication
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-matrix product of hermitian rectangular matrices with
+ * float-complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side		The side on which the hermitian matrix \b A appears in the product.
+ * @param[in] uplo		The triangle of matrix \b A being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clAmdBlasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when
+ *                      it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if A, B, or C object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image is available;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
+ *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasChemm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Thin wrapper: passes every argument through unchanged, in order, to clblasChemm() and returns its status. */
+	return clblasChemm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_chemm.cpp
+ * This is an example of how to use the @ref clAmdBlasChemm function.
+ */
+
+
+/**
+ * @brief Matrix-matrix product of hermitian rectangular matrices with
+ * double-complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side		The side on which the hermitian matrix \b A appears in the product.
+ * @param[in] uplo		The triangle of matrix \b A being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clAmdBlasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clAmdBlasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M
+ *                      when it is set to \b clAmdBlasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clAmdBlasRowMajor,\n or less than \b M when
+ *                      it is set to \b clAmdBlasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasChemm() function otherwise.
+ *
+ * @ingroup HEMM
+ */
+__inline clAmdBlasStatus
+clAmdBlasZhemm(
+    clAmdBlasOrder order,
+    clAmdBlasSide side,
+    clAmdBlasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Thin wrapper: passes every argument through unchanged, in order, to clblasZhemm() and returns its status. */
+	return clblasZhemm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup HERK HERK  - Hermitian rank-k update to a matrix
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Rank-k update of a hermitian matrix with float-complex elements.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^H + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^H A + \beta C \f$
+ *
+ * where \b C is a hermitian matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offa       Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offc       Offset in number of elements for the first element in matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b K is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b A or \b C object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released.
+ *
+ * @ingroup HERK
+ */
+__inline clAmdBlasStatus
+clAmdBlasCherk(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Thin wrapper: passes every argument (including the real-valued alpha and beta) through unchanged to clblasCherk() and returns its status. */
+	return clblasCherk( order, uplo, transA, N, K, alpha, A, offa, lda, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_cherk.cpp
+ * This is an example of how to use the @ref clAmdBlasCherk function.
+ */
+
+
+/**
+ * @brief Rank-k update of a hermitian matrix with double-complex elements.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^H + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^H A + \beta C \f$
+ *
+ * where \b C is a hermitian matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offa       Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offc       Offset in number of elements for the first element in matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasCherk() function otherwise.
+ *
+ * @ingroup HERK
+ */
+__inline clAmdBlasStatus
+clAmdBlasZherk(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{ /* Thin wrapper: passes every argument (including the real-valued alpha and beta) through unchanged to clblasZherk() and returns its status. */
+	return clblasZherk( order, uplo, transA, N, K, alpha, A, offa, lda, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+
+/**
+ * @defgroup HER2K HER2K  - Hermitian rank-2k update to a matrix
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Rank-2k update of a hermitian matrix with float-complex elements.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^H B + conj( \alpha ) B^H A + \beta C \f$
+ *
+ * where \b C is a hermitian matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] trans      How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offa       Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise. Vice-versa for transpose case.
+ * @param[in] B          Buffer object storing the matrix \b B.
+ * @param[in] offb       Offset in number of elements for the first element in matrix \b B.
+ * @param[in] ldb        Leading dimension of matrix \b B. It cannot be
+ *                       less than \b K if \b B is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise. Vice-versa for transpose case.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offc       Offset in number of elements for the first element in matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
+ *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b K is zero, or
+ *     - any of the leading dimensions is invalid;
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clAmdBlasInvalidMemObject if either \b A , \b B or \b C object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clAmdBlasInvalidContext if the context that a passed command queue
+ *     belongs to was released.
+ *
+ * @ingroup HER2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasCher2k(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose trans,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Passes every argument through unchanged to clblasCher2k() and returns its result. */
+	return clblasCher2k( order, uplo, trans, N, K, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/**
+ * @example example_cher2k.c
+ * This is an example of how to use the @ref clAmdBlasCher2k function.
+ */
+
+
+/**
+ * @brief Rank-2k update of a hermitian matrix with double-complex elements.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^H B + conj( \alpha ) B^H A + \beta C \f$
+ *
+ * where \b C is a hermitian matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] trans      How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offa       Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise. Vice-versa for transpose case.
+ * @param[in] B          Buffer object storing the matrix \b B.
+ * @param[in] offb       Offset in number of elements for the first element in matrix \b B.
+ * @param[in] ldb        Leading dimension of matrix \b B. It cannot be
+ *                       less than \b K if \b B is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise. Vice-versa for transpose case.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offc       Offset in number of elements for the first element in matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clAmdBlasSuccess on success;
+ *   - \b clAmdBlasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clAmdBlasCher2k() function otherwise.
+ *
+ * @ingroup HER2K
+ */
+__inline clAmdBlasStatus
+clAmdBlasZher2k(
+    clAmdBlasOrder order,
+    clAmdBlasUplo uplo,
+    clAmdBlasTranspose trans,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{	/* Passes every argument through unchanged to clblasZher2k() and returns its result. */
+	return clblasZher2k( order, uplo, trans, N, K, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+			numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+/*@}*/
+
+
+
+
+#ifdef __cplusplus
+}      /* extern "C" { */
+#endif
+
+#endif /* CLAMDBLAS_H_ */
diff --git a/src/clAmdBlas.version.h b/src/clAmdBlas.version.h
new file mode 100644
index 0000000..26b8703
--- /dev/null
+++ b/src/clAmdBlas.version.h
@@ -0,0 +1,22 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/* the configured version and settings for clblas
+ */
+#define clAmdBlasVersionMajor 2
+#define clAmdBlasVersionMinor 0
+#define clAmdBlasVersionPatch 0
diff --git a/src/clBLAS-complex.h b/src/clBLAS-complex.h
new file mode 100644
index 0000000..391868e
--- /dev/null
+++ b/src/clBLAS-complex.h
@@ -0,0 +1,53 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef CLBLAS_COMPLEX_H_
+#define CLBLAS_COMPLEX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_float2 FloatComplex;
+typedef cl_double2 DoubleComplex;
+
+static __inline FloatComplex
+floatComplex(float real, float imag)
+{
+    FloatComplex packed;    /* element 0 receives the real part, element 1 the imaginary part */
+    packed.s[1] = imag;
+    packed.s[0] = real;
+    return packed;
+}
+
+static __inline DoubleComplex
+doubleComplex(double real, double imag)
+{
+    DoubleComplex packed;   /* element 0 receives the real part, element 1 the imaginary part */
+    packed.s[1] = imag;
+    packed.s[0] = real;
+    return packed;
+}
+
+#define CREAL(v) ((v).s[0])    /* real part: element 0 of the complex value */
+#define CIMAG(v) ((v).s[1])    /* imaginary part: element 1 of the complex value */
+
+#ifdef __cplusplus
+}      /* extern "C" { */
+#endif
+
+#endif /* CLBLAS_COMPLEX_H_ */
diff --git a/src/clBLAS.def b/src/clBLAS.def
new file mode 100644
index 0000000..5111ff2
--- /dev/null
+++ b/src/clBLAS.def
@@ -0,0 +1,215 @@
+;/***********************************************************************
+;**	Copyright (C) 2010 Advanced Micro Devices, Inc. All Rights Reserved.
+;***********************************************************************/
+
+LIBRARY	clBLAS
+
+EXPORTS
+	clblasGetVersion
+	clblasSetup
+	clblasTeardown
+
+	clblasSgemv
+	clblasDgemv
+	clblasCgemv
+	clblasZgemv
+
+	clblasSsymv
+	clblasDsymv
+
+	clblasSgemm
+	clblasDgemm
+	clblasCgemm
+	clblasZgemm
+	;GEMMV2 is not exported
+    ;clblasSgemmV2
+	;clblasDgemmV2
+	;clblasCgemmV2
+	;clblasZgemmV2
+
+	clblasStrmm
+	clblasDtrmm
+	clblasCtrmm
+	clblasZtrmm
+
+	clblasStrsm
+	clblasDtrsm
+	clblasCtrsm
+	clblasZtrsm
+
+	clblasSsyr2k
+	clblasDsyr2k
+	clblasCsyr2k
+	clblasZsyr2k	
+
+	clblasSsyrk
+	clblasDsyrk
+	clblasCsyrk
+	clblasZsyrk
+
+	;GEMMV2 is not exported
+	;clblasSgemmExV2
+	;clblasDgemmExV2
+	;clblasCgemmExV2
+	;clblasZgemmExV2
+
+	clblasStrmv
+	clblasDtrmv
+	clblasCtrmv
+	clblasZtrmv
+	
+	clblasStrsv
+	clblasDtrsv
+	clblasCtrsv
+	clblasZtrsv
+	
+	clblasStpsv
+    clblasDtpsv
+    clblasCtpsv
+    clblasZtpsv
+    
+	clblasSsymm
+	clblasDsymm
+	clblasCsymm
+	clblasZsymm
+	
+	clblasSger
+	clblasDger
+	
+	clblasCgeru
+	clblasZgeru
+	
+	clblasCgerc
+	clblasZgerc
+	
+	clblasSsyr
+	clblasDsyr
+	
+	clblasCher
+	clblasZher
+	
+	clblasSsyr2
+	clblasDsyr2
+
+	clblasChemv
+	clblasZhemv
+	
+	clblasCher2
+	clblasZher2	
+
+	clblasChemm
+	clblasZhemm
+	
+	clblasCherk
+	clblasZherk
+	
+	clblasStpmv
+	clblasDtpmv
+	clblasCtpmv
+	clblasZtpmv
+	
+	clblasSspmv
+	clblasDspmv
+	
+	clblasChpmv
+	clblasZhpmv
+	
+	clblasSspr
+	clblasDspr
+	
+	clblasChpr
+	clblasZhpr
+	
+	clblasSspr2
+	clblasDspr2
+
+	clblasChpr2
+	clblasZhpr2
+	
+	clblasSgbmv
+	clblasDgbmv
+	clblasCgbmv
+	clblasZgbmv
+	
+	clblasStbmv
+	clblasDtbmv
+	clblasCtbmv
+	clblasZtbmv
+	
+	clblasSsbmv
+	clblasDsbmv
+	
+	clblasChbmv
+	clblasZhbmv
+	
+	clblasStbsv
+	clblasDtbsv
+	clblasCtbsv
+	clblasZtbsv
+	
+	clblasCher2k
+	clblasZher2k
+
+	clblasSswap
+	clblasDswap
+	clblasCswap
+	clblasZswap
+	
+	clblasSscal
+	clblasDscal
+	clblasCscal
+	clblasZscal
+	
+	clblasCsscal
+	clblasZdscal
+	
+	clblasScopy
+	clblasDcopy
+	clblasCcopy
+	clblasZcopy
+	
+	clblasSaxpy
+	clblasDaxpy
+	clblasCaxpy
+	clblasZaxpy
+	
+	clblasSdot
+	clblasDdot
+	clblasCdotu
+	clblasZdotu
+	clblasCdotc
+	clblasZdotc
+
+    clblasSrotg
+	clblasDrotg
+	clblasCrotg
+	clblasZrotg
+	
+	clblasSrotmg
+	clblasDrotmg
+	
+	clblasSrot
+	clblasDrot
+	clblasCsrot
+	clblasZdrot
+	
+	clblasSrotm
+	clblasDrotm
+	
+	clblasSnrm2
+	clblasDnrm2
+	clblasScnrm2
+	clblasDznrm2
+
+    clblasSasum
+    clblasDasum
+    clblasScasum
+    clblasDzasum
+
+    clblasiSamax
+    clblasiDamax
+    clblasiCamax
+    clblasiZamax
+	
+	clblasAddScratchImage
+	clblasRemoveScratchImage
diff --git a/src/clBLAS.h b/src/clBLAS.h
new file mode 100644
index 0000000..6d219c3
--- /dev/null
+++ b/src/clBLAS.h
@@ -0,0 +1,9650 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+#ifndef CLBLAS_H_
+#define CLBLAS_H_
+
+/**
+ * @mainpage OpenCL BLAS
+ *
+ * This is an implementation of
+ * <A HREF="http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms">
+ * Basic Linear Algebra Subprograms</A>, levels 1, 2 and 3 using
+ * <A HREF="http://www.khronos.org/opencl/">OpenCL</A> and optimized for
+ * the AMD GPU hardware.
+ */
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <clBLAS-complex.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup OVERVIEW Overview
+ *
+ * This library provides an implementation of the Basic Linear Algebra Subprograms levels 1, 2 and 3,
+ * using OpenCL and optimized for AMD GPU hardware. It provides BLAS-1 functions
+ * SWAP, SCAL, COPY, AXPY, DOT, DOTU, DOTC, ROTG, ROTMG, ROT, ROTM, iAMAX, ASUM and NRM2,
+ * BLAS-2 functions GEMV, SYMV, TRMV, TRSV, HEMV, SYR, SYR2, HER, HER2, GER, GERU, GERC,
+ * TPMV, SPMV, HPMV, TPSV, SPR, SPR2, HPR, HPR2, GBMV, TBMV, SBMV, HBMV and TBSV
+ * and BLAS-3 functions GEMM, SYMM, TRMM, TRSM, HEMM, HERK, HER2K, SYRK and SYR2K.
+ *
+ * This library’s primary goal is to assist the end user to enqueue OpenCL
+ * kernels to process BLAS functions in an OpenCL-efficient manner, while
+ * keeping interfaces familiar to users who know how to use BLAS. All
+ * functions accept matrices through buffer objects.
+ *
+ * @section deprecated
+ * This library provided support for the creation of scratch images to achieve better performance
+ * on older <a href="http://developer.amd.com/gpu/AMDAPPSDK/Pages/default.aspx">AMD APP SDK's</a>.
+ * However, memory buffers now give the same performance as image objects in the current SDK's.
+ * Scratch image buffers are being deprecated and users are advised not to use scratch images in
+ * new applications.
+ */
+
+/**
+ * @defgroup TYPES clblas types
+ */
+/*@{*/
+
+
+/** Shows how matrices are placed in memory. */
+typedef enum clblasOrder_ {
+    clblasRowMajor,           /**< Every row is placed sequentially */
+    clblasColumnMajor         /**< Every column is placed sequentially */
+} clblasOrder;
+
+/** Used to specify whether the matrix is to be transposed or not. */
+typedef enum clblasTranspose_ {
+    clblasNoTrans,           /**< Operate with the matrix. */
+    clblasTrans,             /**< Operate with the transpose of the matrix. */
+    clblasConjTrans          /**< Operate with the conjugate transpose of
+                                     the matrix. */
+} clblasTranspose;
+
+/** Used by the Hermitian, symmetric and triangular matrix
+ * routines to specify whether the upper or lower triangle is being referenced.
+ */
+typedef enum clblasUplo_ {
+    clblasUpper,               /**< Upper triangle. */
+    clblasLower                /**< Lower triangle. */
+} clblasUplo;
+
+/** It is used by the triangular matrix routines to specify whether the
+ * matrix is unit triangular.
+ */
+typedef enum clblasDiag_ {
+    clblasUnit,               /**< Unit triangular. */
+    clblasNonUnit             /**< Non-unit triangular. */
+} clblasDiag;
+
+/** Indicates the side matrix A is located relative to matrix B during multiplication. */
+typedef enum clblasSide_ {
+    clblasLeft,        /**< Multiply general matrix by symmetric,
+                               Hermitian or triangular matrix on the left. */
+    clblasRight        /**< Multiply general matrix by symmetric,
+                               Hermitian or triangular matrix on the right. */
+} clblasSide;
+
+/**
+ *   @brief clblas error codes definition, incorporating OpenCL error
+ *   definitions.
+ *
+ *   This enumeration is a subset of the OpenCL error codes extended with some
+ *   additional extra codes.  For example, CL_OUT_OF_HOST_MEMORY, which is
+ *   defined in cl.h is aliased as clblasOutOfHostMemory.
+ */
+typedef enum clblasStatus_ {
+    clblasSuccess                         = CL_SUCCESS,
+    clblasInvalidValue                    = CL_INVALID_VALUE,
+    clblasInvalidCommandQueue             = CL_INVALID_COMMAND_QUEUE,
+    clblasInvalidContext                  = CL_INVALID_CONTEXT,
+    clblasInvalidMemObject                = CL_INVALID_MEM_OBJECT,
+    clblasInvalidDevice                   = CL_INVALID_DEVICE,
+    clblasInvalidEventWaitList            = CL_INVALID_EVENT_WAIT_LIST,
+    clblasOutOfResources                  = CL_OUT_OF_RESOURCES,
+    clblasOutOfHostMemory                 = CL_OUT_OF_HOST_MEMORY,
+    clblasInvalidOperation                = CL_INVALID_OPERATION,
+    clblasCompilerNotAvailable            = CL_COMPILER_NOT_AVAILABLE,
+    clblasBuildProgramFailure             = CL_BUILD_PROGRAM_FAILURE,
+    /* Extended error codes */
+    clblasNotImplemented         = -1024, /**< Functionality is not implemented */
+    clblasNotInitialized,                 /**< clblas library is not initialized yet */
+    clblasInvalidMatA,                    /**< Matrix A is not a valid memory object */
+    clblasInvalidMatB,                    /**< Matrix B is not a valid memory object */
+    clblasInvalidMatC,                    /**< Matrix C is not a valid memory object */
+    clblasInvalidVecX,                    /**< Vector X is not a valid memory object */
+    clblasInvalidVecY,                    /**< Vector Y is not a valid memory object */
+    clblasInvalidDim,                     /**< An input dimension (M,N,K) is invalid */
+    clblasInvalidLeadDimA,                /**< Leading dimension A must not be less than the size of the first dimension */
+    clblasInvalidLeadDimB,                /**< Leading dimension B must not be less than the size of the second dimension */
+    clblasInvalidLeadDimC,                /**< Leading dimension C must not be less than the size of the third dimension */
+    clblasInvalidIncX,                    /**< The increment for a vector X must not be 0 */
+    clblasInvalidIncY,                    /**< The increment for a vector Y must not be 0 */
+    clblasInsufficientMemMatA,            /**< The memory object for Matrix A is too small */
+    clblasInsufficientMemMatB,            /**< The memory object for Matrix B is too small */
+    clblasInsufficientMemMatC,            /**< The memory object for Matrix C is too small */
+    clblasInsufficientMemVecX,            /**< The memory object for Vector X is too small */
+    clblasInsufficientMemVecY             /**< The memory object for Vector Y is too small */
+} clblasStatus;
+
+
+/*@}*/
+
+/**
+ * @defgroup VERSION Version information
+ */
+/*@{*/
+
+/**
+ * @brief Get the clblas library version info.
+ *
+ * @param[out] major        Location to store library's major version.
+ * @param[out] minor        Location to store library's minor version.
+ * @param[out] patch        Location to store library's patch version.
+ *
+ * @returns always \b clblasSuccess.
+ *
+ * @ingroup VERSION
+ */
+clblasStatus
+clblasGetVersion(cl_uint* major, cl_uint* minor, cl_uint* patch);
+
+/*@}*/
+
+/**
+ * @defgroup INIT Initialize library
+ */
+/*@{*/
+
+/**
+ * @brief Initialize the clblas library.
+ *
+ * Must be called before any other clblas API function is invoked.
+ * @note This function is not thread-safe.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasOutOfHostMemory if there is not enough of memory to allocate
+ *     library's internal structures;
+ *   - \b clblasOutOfResources in case of requested resources scarcity.
+ *
+ * @ingroup INIT
+ */
+clblasStatus
+clblasSetup(void);
+
+/**
+ * @brief Finalize the usage of the clblas library.
+ *
+ * Frees all memory allocated for different computational kernel and other
+ * internal data.
+ * @note This function is not thread-safe.
+ *
+ * @ingroup INIT
+ */
+void
+clblasTeardown(void);
+
+/*@}*/
+
+/**
+ * @defgroup BLAS1 BLAS-1 functions
+ *
+ * The Level 1 Basic Linear Algebra Subprograms are functions that perform
+ * vector-vector operations.
+ */
+/*@{*/
+/*@}*/
+
+/**
+ * @defgroup SWAP SWAP  - Swap elements from 2 vectors
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief interchanges two vectors of float.
+ *
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SWAP
+ */
+clblasStatus
+clblasSswap(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_sswap.c
+ * Example of how to use the @ref clblasSswap function.
+ */
+
+ /**
+ * @brief interchanges two vectors of double.
+ *
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSswap() function otherwise.
+ *
+ * @ingroup SWAP
+ */
+clblasStatus
+clblasDswap(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief interchanges two vectors of complex-float elements.
+ *
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSswap() function otherwise.
+ *
+ * @ingroup SWAP
+ */
+clblasStatus
+clblasCswap(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief interchanges two vectors of double-complex elements.
+ *
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasDswap() function otherwise.
+ *
+ * @ingroup SWAP
+ */
+clblasStatus
+clblasZswap(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+
+/**
+ * @defgroup SCAL SCAL  - Scales a vector by a constant
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Scales a float vector by a float constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if the \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SCAL
+ */
+clblasStatus
+clblasSscal(
+    size_t N,
+    cl_float alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_sscal.c
+ * Example of how to use the @ref clblasSscal function.
+ */
+
+ /**
+ * @brief Scales a double vector by a double constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSscal() function otherwise.
+ *
+ * @ingroup SCAL
+ */
+clblasStatus
+clblasDscal(
+    size_t N,
+    cl_double alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Scales a complex-float vector by a complex-float constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSscal() function otherwise.
+ *
+ * @ingroup SCAL
+ */
+clblasStatus
+clblasCscal(
+    size_t N,
+    cl_float2 alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Scales a complex-double vector by a complex-double constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in,out] X     Buffer object storing vector \b X; scaled in place.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasDscal() function otherwise.
+ *
+ * @ingroup SCAL
+ */
+clblasStatus
+clblasZscal(
+    size_t N,
+    cl_double2 alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup SSCAL SSCAL  - Scales a complex vector by a real constant
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Scales a complex-float vector by a float constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in,out] X     Buffer object storing vector \b X; scaled in place.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if the \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SSCAL
+ */
+clblasStatus
+clblasCsscal(
+    size_t N,
+    cl_float alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_csscal.c
+ * Example of how to use the @ref clblasCsscal function.
+ */
+
+/**
+ * @brief Scales a complex-double vector by a double constant
+ *
+ *   - \f$ X \leftarrow \alpha X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in,out] X     Buffer object storing vector \b X; scaled in place.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasCsscal() function otherwise.
+ *
+ * @ingroup SSCAL
+ */
+clblasStatus
+clblasZdscal(
+    size_t N,
+    cl_double alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+ /*@}*/
+
+
+/**
+ * @defgroup COPY COPY  - Copies elements from vector X to vector Y
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Copies float elements from vector X to vector Y
+ *
+ *   - \f$ Y \leftarrow X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup COPY
+ */
+clblasStatus
+clblasScopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_scopy.c
+ * Example of how to use the @ref clblasScopy function.
+ */
+
+ /**
+ * @brief Copies double elements from vector X to vector Y
+ *
+ *   - \f$ Y \leftarrow X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasScopy() function otherwise.
+ *
+ * @ingroup COPY
+ */
+clblasStatus
+clblasDcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Copies complex-float elements from vector X to vector Y
+ *
+ *   - \f$ Y \leftarrow X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasScopy() function otherwise.
+ *
+ * @ingroup COPY
+ */
+clblasStatus
+clblasCcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Copies complex-double elements from vector X to vector Y
+ *
+ *   - \f$ Y \leftarrow X \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasDcopy() function otherwise.
+ *
+ * @ingroup COPY
+ */
+clblasStatus
+clblasZcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+ /*@}*/
+
+/**
+ * @defgroup AXPY AXPY  - Scale X and add to Y
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Scale vector X of float elements and add to Y
+ *
+ *   - \f$ Y \leftarrow \alpha X + Y \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] Y     Buffer object storing the vector \b Y; updated with
+ *                      the result.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup AXPY
+ */
+clblasStatus
+clblasSaxpy(
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_saxpy.c
+ * Example of how to use the @ref clblasSaxpy function.
+ */
+
+/**
+ * @brief Scale vector X of double elements and add to Y
+ *
+ *   - \f$ Y \leftarrow \alpha X + Y \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] Y     Buffer object storing the vector \b Y; updated with
+ *                      the result.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSaxpy() function otherwise.
+ *
+ * @ingroup AXPY
+ */
+clblasStatus
+clblasDaxpy(
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Scale vector X of complex-float elements and add to Y
+ *
+ *   - \f$ Y \leftarrow \alpha X + Y \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] Y     Buffer object storing the vector \b Y; updated with
+ *                      the result.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSaxpy() function otherwise.
+ *
+ * @ingroup AXPY
+ */
+clblasStatus
+clblasCaxpy(
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Scale vector X of complex-double elements and add to Y
+ *
+ *   - \f$ Y \leftarrow \alpha X + Y \f$
+ *
+ * @param[in] N         Number of elements in vector \b X.
+ * @param[in] alpha     The constant factor for vector \b X.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] Y     Buffer object storing the vector \b Y; updated with
+ *                      the result.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasDaxpy() function otherwise.
+ *
+ * @ingroup AXPY
+ */
+clblasStatus
+clblasZaxpy(
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+
+/**
+ * @defgroup DOT DOT  - Dot product of two vectors
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Dot product of two vectors containing float elements
+ *
+ *   - \f$ dotProduct \leftarrow X^T Y \f$
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b X, \b Y or \b dotProduct object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup DOT
+ */
+clblasStatus
+clblasSdot(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_sdot.c
+ * Example of how to use the @ref clblasSdot function.
+ */
+
+/**
+ * @brief Dot product of two vectors containing double elements
+ *
+ *   - \f$ dotProduct \leftarrow X^T Y \f$
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+clblasStatus
+clblasDdot(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Dot product of two vectors containing float-complex elements
+ *
+ *   - \f$ dotProduct \leftarrow X^T Y \f$
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+
+clblasStatus
+clblasCdotu(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Dot product of two vectors containing double-complex elements
+ *
+ *   - \f$ dotProduct \leftarrow X^T Y \f$
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+
+clblasStatus
+clblasZdotu(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Dot product of two vectors containing float-complex elements conjugating the first vector
+ *
+ *   - \f$ dotProduct \leftarrow X^H Y \f$
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+
+clblasStatus
+clblasCdotc(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief dot product of two vectors containing double-complex elements conjugating the first vector
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] dotProduct   Buffer object that will contain the dot-product value
+ * @param[in] offDP         Offset to dot-product in \b dotProduct buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y             Buffer object storing the vector \b Y.
+ * @param[in] offy          Offset of first element of vector \b Y in buffer object.
+ *                          Counted in elements.
+ * @param[in] incy          Increment for the elements of \b Y. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSdot() function otherwise.
+ *
+ * @ingroup DOT
+ */
+
+clblasStatus
+clblasZdotc(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+
+/**
+ * @defgroup ROTG ROTG  - Constructs Givens plane rotation
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief construct Givens plane rotation on float elements
+ *
+ * @param[out] SA           Buffer object that contains SA
+ * @param[in] offSA         Offset to SA in \b SA buffer object.
+ *                          Counted in elements.
+ * @param[out] SB           Buffer object that contains SB
+ * @param[in] offSB         Offset to SB in \b SB buffer object.
+ *                          Counted in elements.
+ * @param[out] C            Buffer object that contains C
+ * @param[in] offC          Offset to C in \b C buffer object.
+ *                          Counted in elements.
+ * @param[out] S            Buffer object that contains S
+ * @param[in] offS          Offset to S in \b S buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidMemObject if any of the \b SA, \b SB, \b C or \b S objects is
+ *     invalid, or is an image object rather than a buffer;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ROTG
+ */
+clblasStatus
+clblasSrotg(
+    cl_mem SA,
+    size_t offSA,
+    cl_mem SB,
+    size_t offSB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_srotg.c
+ * Example of how to use the @ref clblasSrotg function.
+ */
+
+/**
+ * @brief construct Givens plane rotation on double elements
+ *
+ * @param[out] DA           Buffer object that contains DA
+ * @param[in] offDA         Offset to DA in \b DA buffer object.
+ *                          Counted in elements.
+ * @param[out] DB           Buffer object that contains DB
+ * @param[in] offDB         Offset to DB in \b DB buffer object.
+ *                          Counted in elements.
+ * @param[out] C            Buffer object that contains C
+ * @param[in] offC          Offset to C in \b C buffer object.
+ *                          Counted in elements.
+ * @param[out] S            Buffer object that contains S
+ * @param[in] offS          Offset to S in \b S buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSrotg() function otherwise.
+ *
+ * @ingroup ROTG
+ */
+clblasStatus
+clblasDrotg(
+    cl_mem DA,
+    size_t offDA,
+    cl_mem DB,
+    size_t offDB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief construct Givens plane rotation on float-complex elements
+ *
+ * @param[out] CA           Buffer object that contains CA
+ * @param[in] offCA         Offset to CA in \b CA buffer object.
+ *                          Counted in elements.
+ * @param[out] CB           Buffer object that contains CB
+ * @param[in] offCB         Offset to CB in \b CB buffer object.
+ *                          Counted in elements.
+ * @param[out] C            Buffer object that contains C. C is real.
+ * @param[in] offC          Offset to C in \b C buffer object.
+ *                          Counted in elements.
+ * @param[out] S            Buffer object that contains S
+ * @param[in] offS          Offset to S in \b S buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSrotg() function otherwise.
+ *
+ * @ingroup ROTG
+ */
+clblasStatus
+clblasCrotg(
+    cl_mem CA,
+    size_t offCA,
+    cl_mem CB,
+    size_t offCB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief construct Givens plane rotation on double-complex elements
+ *
+ * @param[out] CA           Buffer object that contains CA
+ * @param[in] offCA         Offset to CA in \b CA buffer object.
+ *                          Counted in elements.
+ * @param[out] CB           Buffer object that contains CB
+ * @param[in] offCB         Offset to CB in \b CB buffer object.
+ *                          Counted in elements.
+ * @param[out] C            Buffer object that contains C. C is real.
+ * @param[in] offC          Offset to C in \b C buffer object.
+ *                          Counted in elements.
+ * @param[out] S            Buffer object that contains S
+ * @param[in] offS          Offset to S in \b S buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasDrotg() function otherwise.
+ *
+ * @ingroup ROTG
+ */
+clblasStatus
+clblasZrotg(
+    cl_mem CA,
+    size_t offCA,
+    cl_mem CB,
+    size_t offCB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup ROTMG ROTMG  - Constructs the modified Givens rotation
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief construct the modified Givens rotation on float elements
+ *
+ * @param[out] SD1          Buffer object that contains SD1
+ * @param[in] offSD1        Offset to SD1 in \b SD1 buffer object.
+ *                          Counted in elements.
+ * @param[out] SD2          Buffer object that contains SD2
+ * @param[in] offSD2        Offset to SD2 in \b SD2 buffer object.
+ *                          Counted in elements.
+ * @param[out] SX1          Buffer object that contains SX1
+ * @param[in] offSX1        Offset to SX1 in \b SX1 buffer object.
+ *                          Counted in elements.
+ * @param[in] SY1           Buffer object that contains SY1
+ * @param[in] offSY1        Offset to SY1 in \b SY1 buffer object.
+ *                          Counted in elements.
+ * @param[out] SPARAM       Buffer object that contains SPARAM array of minimum length 5:
+ *                          SPARAM(0) = SFLAG
+ *                          SPARAM(1) = SH11
+ *                          SPARAM(2) = SH21
+ *                          SPARAM(3) = SH12
+ *                          SPARAM(4) = SH22
+ *
+ * @param[in] offSparam     Offset to SPARAM in \b SPARAM buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidMemObject if any of the \b SX1, \b SY1, \b SD1, \b SD2 or \b SPARAM objects is
+ *     invalid, or is an image object rather than a buffer;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ROTMG
+ */
+clblasStatus
+clblasSrotmg(
+    cl_mem SD1,
+    size_t offSD1,
+    cl_mem SD2,
+    size_t offSD2,
+    cl_mem SX1,
+    size_t offSX1,
+    const cl_mem SY1,
+    size_t offSY1,
+    cl_mem SPARAM,
+    size_t offSparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_srotmg.c
+ * Example of how to use the @ref clblasSrotmg function.
+ */
+
+/**
+ * @brief construct the modified Givens rotation on double elements
+ *
+ * @param[out] DD1          Buffer object that contains DD1
+ * @param[in] offDD1        Offset to DD1 in \b DD1 buffer object.
+ *                          Counted in elements.
+ * @param[out] DD2          Buffer object that contains DD2
+ * @param[in] offDD2        Offset to DD2 in \b DD2 buffer object.
+ *                          Counted in elements.
+ * @param[out] DX1          Buffer object that contains DX1
+ * @param[in] offDX1        Offset to DX1 in \b DX1 buffer object.
+ *                          Counted in elements.
+ * @param[in] DY1           Buffer object that contains DY1
+ * @param[in] offDY1        Offset to DY1 in \b DY1 buffer object.
+ *                          Counted in elements.
+ * @param[out] DPARAM       Buffer object that contains DPARAM array of minimum length 5:
+ *                          DPARAM(0) = DFLAG
+ *                          DPARAM(1) = DH11
+ *                          DPARAM(2) = DH21
+ *                          DPARAM(3) = DH12
+ *                          DPARAM(4) = DH22
+ *
+ * @param[in] offDparam     Offset to DPARAM in \b DPARAM buffer object.
+ *                          Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSrotmg() function otherwise.
+ *
+ * @ingroup ROTMG
+ */
+clblasStatus
+clblasDrotmg(
+    cl_mem DD1,
+    size_t offDD1,
+    cl_mem DD2,
+    size_t offDD2,
+    cl_mem DX1,
+    size_t offDX1,
+    const cl_mem DY1,
+    size_t offDY1,
+    cl_mem DPARAM,
+    size_t offDparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+
+/**
+ * @defgroup ROT ROT  - Apply Givens rotation
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief applies a plane rotation for float elements
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] C         C specifies the cosine, cos.
+ * @param[in] S         S specifies the sine, sin.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either the \b X or the \b Y object is
+ *     invalid, or is an image object rather than a buffer;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ROT
+ */
+clblasStatus
+clblasSrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_float C,
+    cl_float S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_srot.c
+ * Example of how to use the @ref clblasSrot function.
+ */
+
+/**
+ * @brief applies a plane rotation for double elements
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] C         C specifies the cosine, cos.
+ * @param[in] S         S specifies the sine, sin.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSrot() function otherwise.
+ *
+ * @ingroup ROT
+ */
+clblasStatus
+clblasDrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_double C,
+    cl_double S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief applies a plane rotation for float-complex elements
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] C         C specifies the cosine, cos. This number is real
+ * @param[in] S         S specifies the sine, sin. This number is real
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSrot() function otherwise.
+ *
+ * @ingroup ROT
+ */
+clblasStatus
+clblasCsrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_float C,
+    cl_float S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief applies a plane rotation for double-complex elements
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] C         C specifies the cosine, cos. This number is real
+ * @param[in] S         S specifies the sine, sin. This number is real
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSrot() function otherwise.
+ *
+ * @ingroup ROT
+ */
+clblasStatus
+clblasZdrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_double C,
+    cl_double S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup ROTM ROTM  - Apply modified Givens rotation for points in the plane
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief modified Givens rotation for float elements
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] SPARAM    Buffer object that contains SPARAM array of minimum length 5
+ *                      SPARAM(1)=SFLAG
+ *                      SPARAM(2)=SH11
+ *                      SPARAM(3)=SH21
+ *                      SPARAM(4)=SH12
+ *                      SPARAM(5)=SH22
+ * @param[in] offSparam Offset of first element of array \b SPARAM in buffer object.
+ *                      Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if any of the \b X, \b Y or \b SPARAM objects is
+ *     invalid, or is an image object rather than a buffer;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ROTM
+ */
+clblasStatus
+clblasSrotm(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    const cl_mem SPARAM,
+    size_t offSparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_srotm.c
+ * Example of how to use the @ref clblasSrotm function.
+ */
+
+/**
+ * @brief modified Givens rotation for double elements
+ *
+ * @param[in] N         Number of elements in vector \b X and \b Y.
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[out] Y        Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] DPARAM    Buffer object that contains DPARAM array of minimum length 5
+ *                      DPARAM(1)=DFLAG
+ *                      DPARAM(2)=DH11
+ *                      DPARAM(3)=DH21
+ *                      DPARAM(4)=DH12
+ *                      DPARAM(5)=DH22
+ * @param[in] offDparam Offset of first element of array \b DPARAM in buffer object.
+ *                      Counted in elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSrotm() function otherwise.
+ *
+ * @ingroup ROTM
+ */
+clblasStatus
+clblasDrotm(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    const cl_mem DPARAM,
+    size_t offDparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup NRM2 NRM2  - Euclidean norm of a vector
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief computes the Euclidean norm of vector containing float elements
+ *
+ *  NRM2 = sqrt( X' * X )
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] NRM2         Buffer object that will contain the NRM2 value
+ * @param[in] offNRM2       Offset to NRM2 value in \b NRM2 buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object that can hold a minimum of (2*N) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if any of \b X or \b NRM2 or \b scratchBuff object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup NRM2
+ */
+clblasStatus
+clblasSnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_snrm2.c
+ * Example of how to use the @ref clblasSnrm2 function.
+ */
+
+/**
+ * @brief Computes the Euclidean norm of a vector containing double elements.
+ *
+ *  NRM2 = sqrt( X' * X )
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] NRM2         Buffer object that will contain the NRM2 value.
+ * @param[in] offNRM2       Offset to NRM2 value in \b NRM2 buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object that can hold
+ *                          a minimum of (2*N) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSnrm2() function otherwise.
+ *
+ * @ingroup NRM2
+ */
+clblasStatus
+clblasDnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Computes the Euclidean norm of a vector containing float-complex
+ *        elements.
+ *
+ *  NRM2 = sqrt( X**H * X )
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] NRM2         Buffer object that will contain the NRM2 value.
+ *                          Note that the result of Scnrm2 is a real value.
+ * @param[in] offNRM2       Offset to NRM2 value in \b NRM2 buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object that can hold
+ *                          a minimum of (2*N) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSnrm2() function otherwise.
+ *
+ * @ingroup NRM2
+ */
+clblasStatus
+clblasScnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Computes the Euclidean norm of a vector containing double-complex
+ *        elements.
+ *
+ *  NRM2 = sqrt( X**H * X )
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] NRM2         Buffer object that will contain the NRM2 value.
+ *                          Note that the result of Dznrm2 is a real value.
+ * @param[in] offNRM2       Offset to NRM2 value in \b NRM2 buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object that can hold
+ *                          a minimum of (2*N) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSnrm2() function otherwise.
+ *
+ * @ingroup NRM2
+ */
+clblasStatus
+clblasDznrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup iAMAX iAMAX  - Index of max absolute value
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Index of the max absolute value in a float array.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] iMax         Buffer object storing the index of the first absolute max.
+ *                          The index will be of type unsigned int.
+ * @param[in] offiMax       Offset for storing the index in the \b iMax buffer.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem object to store intermediate results.
+ *                          It should be able to hold a minimum of (2*N) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if any of the \b iMax, \b X or \b scratchBuff
+ *     objects is invalid, or is an image object rather than a buffer object;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that the passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup iAMAX
+ */
+clblasStatus
+clblasiSamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_isamax.c
+ * Example of how to use the @ref clblasiSamax function.
+ */
+
+
+/**
+ * @brief Index of the max absolute value in a double array.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] iMax         Buffer object storing the index of the first absolute max.
+ *                          The index will be of type unsigned int.
+ * @param[in] offiMax       Offset for storing the index in the \b iMax buffer.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem object to store intermediate results.
+ *                          It should be able to hold a minimum of (2*N) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasiSamax() function otherwise.
+ *
+ * @ingroup iAMAX
+ */
+clblasStatus
+clblasiDamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Index of the max absolute value in a complex float array.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] iMax         Buffer object storing the index of the first absolute max.
+ *                          The index will be of type unsigned int.
+ * @param[in] offiMax       Offset for storing the index in the \b iMax buffer.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem object to store intermediate results.
+ *                          It should be able to hold a minimum of (2*N) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasiSamax() function otherwise.
+ *
+ * @ingroup iAMAX
+ */
+clblasStatus
+clblasiCamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Index of the max absolute value in a complex double array.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] iMax         Buffer object storing the index of the first absolute max.
+ *                          The index will be of type unsigned int.
+ * @param[in] offiMax       Offset for storing the index in the \b iMax buffer.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem object to store intermediate results.
+ *                          It should be able to hold a minimum of (2*N) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasiSamax() function otherwise.
+ *
+ * @ingroup iAMAX
+ */
+clblasStatus
+clblasiZamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup ASUM ASUM  - Sum of absolute values
+ * @ingroup BLAS1
+ */
+/*@{*/
+
+/**
+ * @brief Sum of absolute values of a vector containing float elements.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] asum         Buffer object that will contain the absolute sum value.
+ * @param[in] offAsum       Offset to absolute sum in \b asum buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if any of the \b X, \b asum or \b scratchBuff
+ *     objects is invalid, or is an image object rather than a buffer object;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup ASUM
+ */
+
+clblasStatus
+clblasSasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_sasum.c
+ * Example of how to use the @ref clblasSasum function.
+ */
+
+/**
+ * @brief Sum of absolute values of a vector containing double elements.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] asum         Buffer object that will contain the absolute sum value.
+ * @param[in] offAsum       Offset to absolute sum in \b asum buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSasum() function otherwise.
+ *
+ * @ingroup ASUM
+ */
+
+clblasStatus
+clblasDasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Sum of absolute values of a vector containing float-complex elements.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] asum         Buffer object that will contain the absolute sum value.
+ * @param[in] offAsum       Offset to absolute sum in \b asum buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - the same error codes as the clblasSasum() function otherwise.
+ *
+ * @ingroup ASUM
+ */
+
+clblasStatus
+clblasScasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Sum of absolute values of a vector containing double-complex elements.
+ *
+ * @param[in] N             Number of elements in vector \b X.
+ * @param[out] asum         Buffer object that will contain the absolute sum value.
+ * @param[in] offAsum       Offset to absolute sum in \b asum buffer object.
+ *                          Counted in elements.
+ * @param[in] X             Buffer object storing vector \b X.
+ * @param[in] offx          Offset of first element of vector \b X in buffer object.
+ *                          Counted in elements.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object of minimum size N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support
+ *     floating point arithmetic with double precision;
+ *   - the same error codes as the clblasSasum() function otherwise.
+ *
+ * @ingroup ASUM
+ */
+
+clblasStatus
+clblasDzasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup BLAS2 BLAS-2 functions
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * matrix-vector operations.
+ */
+/*@{*/
+/*@}*/
+
+
+/**
+ * @defgroup GEMV GEMV  - General matrix-Vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ *        float elements. Extended version.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clblasColumnMajor.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if \b offA exceeds the size of the \b A buffer
+ *     object;
+ *   - another \b clblasStatus error code otherwise.
+ *
+ * @note NOTE(review): this list previously ended with "the same error codes
+ *       as the clblasSgemv() function otherwise", i.e. it deferred to this
+ *       function itself. The complete set of error codes is not enumerated
+ *       in this comment and should be confirmed against the implementation.
+ *
+ * @ingroup GEMV
+ */
+clblasStatus
+clblasSgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_sgemv.c
+ * This is an example of how to use the @ref clblasSgemv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ *        double elements. Extended version.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clblasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support
+ *     floating point arithmetic with double precision;
+ *   - \b clblasInvalidValue if \b offA exceeds the size of the \b A buffer
+ *     object;
+ *   - the same error codes as the clblasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+clblasStatus
+clblasDgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ *        float complex elements. Extended version.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clblasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[out] y        Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if \b offA exceeds the size of the \b A buffer
+ *     object;
+ *   - the same error codes as the clblasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+clblasStatus
+clblasCgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-vector product with a general rectangular matrix and
+ *        double complex elements. Extended version.
+ *
+ * Matrix-vector products:
+ *   - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *   - \f$ y \leftarrow \alpha A^T x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For a detailed description,
+ *                      see clblasSgemv().
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b x. It cannot be zero.
+ * @param[in] beta      The factor of the vector \b y.
+ * @param[in,out] y     Buffer object storing the vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support the
+ *     floating point arithmetic with double precision;
+ *   - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clblasSgemv() function otherwise.
+ *
+ * @ingroup GEMV
+ */
+clblasStatus
+clblasZgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup SYMV SYMV  - Symmetric matrix-Vector multiplication
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a symmetric matrix and float elements.
+ *
+ *
+ * Matrix-vector products:
+ * - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b x. It cannot be zero.
+ * @param[in] beta      The factor of vector \b y.
+ * @param[in,out] y     Buffer object storing vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clblasSgemv() function otherwise.
+ *
+ * @ingroup SYMV
+ */
+clblasStatus
+clblasSsymv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_ssymv.c
+ * This is an example of how to use the @ref clblasSsymv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a symmetric matrix and double elements.
+ *
+ *
+ * Matrix-vector products:
+ * - \f$ y \leftarrow \alpha A x + \beta y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in
+ *                      the buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] x         Buffer object storing vector \b x.
+ * @param[in] offx      Offset of first element of vector \b x in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b x. It cannot be zero.
+ * @param[in] beta      The factor of vector \b y.
+ * @param[in,out] y     Buffer object storing vector \b y.
+ * @param[in] offy      Offset of first element of vector \b y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer
+ *     object;
+ *   - the same error codes as the clblasSsymv() function otherwise.
+ *
+ * @ingroup SYMV
+ */
+clblasStatus
+clblasDsymv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+
+/**
+ * @defgroup HEMV HEMV  - Hermitian matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a hermitian matrix and float-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - any of the leading dimensions is invalid, or
+ *     - the matrix sizes or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b A, \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HEMV
+ */
+clblasStatus
+clblasChemv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-vector product with a hermitian matrix and double-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasChemv() function otherwise.
+ *
+ * @ingroup HEMV
+ */
+clblasStatus
+clblasZhemv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_zhemv.cpp
+ * Example of how to use the @ref clblasZhemv function.
+ */
+/*@}*/
+
+
+
+/**
+ * @defgroup TRMV TRMV  - Triangular matrix vector multiply
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a triangular matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order         Row/column order.
+ * @param[in] uplo          The triangle in matrix being referenced.
+ * @param[in] trans         How matrix \b A is to be transposed.
+ * @param[in] diag          Specify whether matrix \b A is unit triangular.
+ * @param[in] N             Number of rows/columns in matrix \b A.
+ * @param[in] A             Buffer object storing matrix \b A.
+ * @param[in] offa          Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda           Leading dimension of matrix \b A. It cannot be less
+ *                          than \b N.
+ * @param[in,out] X         Buffer object storing vector \b X.
+ * @param[in] offx          Offset in number of elements for first element in vector \b X.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object which can hold a
+ *                          minimum of (1 + (N-1)*abs(incx)) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if either \b A or \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TRMV
+ */
+clblasStatus
+clblasStrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_strmv.c
+ * Example of how to use the @ref clblasStrmv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a triangular matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order         Row/column order.
+ * @param[in] uplo          The triangle in matrix being referenced.
+ * @param[in] trans         How matrix \b A is to be transposed.
+ * @param[in] diag          Specify whether matrix \b A is unit triangular.
+ * @param[in] N             Number of rows/columns in matrix \b A.
+ * @param[in] A             Buffer object storing matrix \b A.
+ * @param[in] offa          Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda           Leading dimension of matrix \b A. It cannot be less
+ *                          than \b N.
+ * @param[in,out] X         Buffer object storing vector \b X.
+ * @param[in] offx          Offset in number of elements for first element in vector \b X.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object which can hold a
+ *                          minimum of (1 + (N-1)*abs(incx)) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasStrmv() function otherwise.
+ *
+ * @ingroup TRMV
+ */
+clblasStatus
+clblasDtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-vector product with a triangular matrix and
+ * float complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order         Row/column order.
+ * @param[in] uplo          The triangle in matrix being referenced.
+ * @param[in] trans         How matrix \b A is to be transposed.
+ * @param[in] diag          Specify whether matrix \b A is unit triangular.
+ * @param[in] N             Number of rows/columns in matrix \b A.
+ * @param[in] A             Buffer object storing matrix \b A.
+ * @param[in] offa          Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda           Leading dimension of matrix \b A. It cannot be less
+ *                          than \b N.
+ * @param[in,out] X         Buffer object storing vector \b X.
+ * @param[in] offx          Offset in number of elements for first element in vector \b X.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object which can hold a
+ *                          minimum of (1 + (N-1)*abs(incx)) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasStrmv() function.
+ * @ingroup TRMV
+ */
+clblasStatus
+clblasCtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-vector product with a triangular matrix and
+ * double complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order         Row/column order.
+ * @param[in] uplo          The triangle in matrix being referenced.
+ * @param[in] trans         How matrix \b A is to be transposed.
+ * @param[in] diag          Specify whether matrix \b A is unit triangular.
+ * @param[in] N             Number of rows/columns in matrix \b A.
+ * @param[in] A             Buffer object storing matrix \b A.
+ * @param[in] offa          Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda           Leading dimension of matrix \b A. It cannot be less
+ *                          than \b N.
+ * @param[in,out] X         Buffer object storing vector \b X.
+ * @param[in] offx          Offset in number of elements for first element in vector \b X.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff   Temporary cl_mem scratch buffer object which can hold a
+ *                          minimum of (1 + (N-1)*abs(incx)) elements.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasDtrmv() function.
+ * @ingroup TRMV
+ */
+clblasStatus
+clblasZtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/*@}*/
+
+/**
+ * @defgroup TRSV TRSV  - Triangular matrix vector Solve
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Solving triangular matrix problems with float elements.
+ *
+ * Triangular systems being solved:
+ *   - \f$ A X \leftarrow  X \f$
+ *   - \f$ A^T X \leftarrow  X \f$
+ *
+ * @param[in] order         Row/column order.
+ * @param[in] uplo          The triangle in matrix being referenced.
+ * @param[in] trans         How matrix \b A is to be transposed.
+ * @param[in] diag          Specify whether matrix \b A is unit triangular.
+ * @param[in] N             Number of rows/columns in matrix \b A.
+ * @param[in] A             Buffer object storing matrix \b A.
+ * @param[in] offa          Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda           Leading dimension of matrix \b A. It cannot be less
+ *                          than \b N.
+ * @param[in,out] X         Buffer object storing vector \b X.
+ * @param[in] offx          Offset in number of elements for first element in vector \b X.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if either \b A or \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TRSV
+ */
+clblasStatus
+clblasStrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_strsv.c
+ * Example of how to use the @ref clblasStrsv function.
+ */
+
+
+/**
+ * @brief Solving triangular matrix problems with double elements.
+ *
+ * Triangular systems being solved:
+ *   - \f$ A X \leftarrow  X \f$
+ *   - \f$ A^T X \leftarrow  X \f$
+ *
+ * @param[in] order         Row/column order.
+ * @param[in] uplo          The triangle in matrix being referenced.
+ * @param[in] trans         How matrix \b A is to be transposed.
+ * @param[in] diag          Specify whether matrix \b A is unit triangular.
+ * @param[in] N             Number of rows/columns in matrix \b A.
+ * @param[in] A             Buffer object storing matrix \b A.
+ * @param[in] offa          Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda           Leading dimension of matrix \b A. It cannot be less
+ *                          than \b N.
+ * @param[in,out] X         Buffer object storing vector \b X.
+ * @param[in] offx          Offset in number of elements for first element in vector \b X.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasStrsv() function otherwise.
+ *
+ * @ingroup TRSV
+ */
+clblasStatus
+clblasDtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Solving triangular matrix problems with float-complex elements.
+ *
+ * Triangular systems being solved:
+ *   - \f$ A X \leftarrow  X \f$
+ *   - \f$ A^T X \leftarrow  X \f$
+ *
+ * @param[in] order         Row/column order.
+ * @param[in] uplo          The triangle in matrix being referenced.
+ * @param[in] trans         How matrix \b A is to be transposed.
+ * @param[in] diag          Specify whether matrix \b A is unit triangular.
+ * @param[in] N             Number of rows/columns in matrix \b A.
+ * @param[in] A             Buffer object storing matrix \b A.
+ * @param[in] offa          Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda           Leading dimension of matrix \b A. It cannot be less
+ *                          than \b N.
+ * @param[in,out] X         Buffer object storing vector \b X.
+ * @param[in] offx          Offset in number of elements for first element in vector \b X.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasStrsv() function.
+ *
+ * @ingroup TRSV
+ */
+clblasStatus
+clblasCtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Solving triangular matrix problems with double-complex elements.
+ *
+ * Triangular systems being solved:
+ *   - \f$ A X \leftarrow  X \f$
+ *   - \f$ A^T X \leftarrow  X \f$
+ *
+ * @param[in] order         Row/column order.
+ * @param[in] uplo          The triangle in matrix being referenced.
+ * @param[in] trans         How matrix \b A is to be transposed.
+ * @param[in] diag          Specify whether matrix \b A is unit triangular.
+ * @param[in] N             Number of rows/columns in matrix \b A.
+ * @param[in] A             Buffer object storing matrix \b A.
+ * @param[in] offa          Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda           Leading dimension of matrix \b A. It cannot be less
+ *                          than \b N.
+ * @param[in,out] X         Buffer object storing vector \b X.
+ * @param[in] offx          Offset in number of elements for first element in vector \b X.
+ * @param[in] incx          Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasDtrsv() function.
+ *
+ * @ingroup TRSV
+ */
+clblasStatus
+clblasZtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup GER GER   - General matrix rank 1 operation
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Rank 1 update of a general matrix with float elements, using
+ * the product of two vectors.
+ *
+ * Rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X Y^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor \f$ \alpha \f$ applied to the vector product.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - a leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if the \b A, \b X, or \b Y object is invalid,
+ *     or is an image object rather than a buffer object;
+ *   - \b clblasOutOfResources if an image-based function implementation is
+ *     used and no suitable scratch image is available;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GER
+ */
+clblasStatus
+clblasSger(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_sger.c
+ * Example of how to use the @ref clblasSger function.
+ */
+
+
+/**
+ * @brief Rank 1 update of a general matrix with double elements, using
+ * the product of two vectors.
+ *
+ * Rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X Y^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor \f$ \alpha \f$ applied to the vector product.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasSger() function otherwise.
+ *
+ * @ingroup GER
+ */
+clblasStatus
+clblasDger(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+/**
+ * @defgroup GERU GERU  - General matrix rank 1 operation (unconjugated)
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Rank 1 update of a general matrix with float complex elements,
+ * using the unconjugated product of two vectors.
+ *
+ * Rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X Y^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor \f$ \alpha \f$ applied to the vector product.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - a leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if the \b A, \b X, or \b Y object is invalid,
+ *     or is an image object rather than a buffer object;
+ *   - \b clblasOutOfResources if an image-based function implementation is
+ *     used and no suitable scratch image is available;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GERU
+ */
+clblasStatus
+clblasCgeru(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A ,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Rank 1 update of a general matrix with double complex elements,
+ * using the unconjugated product of two vectors.
+ *
+ * Rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X Y^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor \f$ \alpha \f$ applied to the vector product.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasCgeru() function otherwise.
+ *
+ * @ingroup GERU
+ */
+clblasStatus
+clblasZgeru(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+/**
+ * @defgroup GERC GERC  - General matrix rank 1 operation (conjugated)
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Rank 1 update of a general matrix with float complex elements,
+ * using the product of one vector and the conjugate transpose of another.
+ *
+ * Rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor \f$ \alpha \f$ applied to the vector product.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - a leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if the \b A, \b X, or \b Y object is invalid,
+ *     or is an image object rather than a buffer object;
+ *   - \b clblasOutOfResources if an image-based function implementation is
+ *     used and no suitable scratch image is available;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GERC
+ */
+
+clblasStatus
+clblasCgerc(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A ,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Rank 1 update of a general matrix with double complex elements,
+ * using the product of one vector and the conjugate transpose of another.
+ *
+ * Rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor \f$ \alpha \f$ applied to the vector product.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A. On exit, A is
+ *                      overwritten by the updated matrix.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasCgerc() function otherwise.
+ *
+ * @ingroup GERC
+ */
+clblasStatus
+clblasZgerc(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/*@}*/
+
+/**
+ * @defgroup SYR SYR   - Symmetric rank 1 update
+ *
+ * Level 2 Basic Linear Algebra Subprograms functions that perform
+ * symmetric rank 1 update operations.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Symmetric rank 1 operation with a general triangular matrix and
+ * float elements.
+ *
+ * Symmetric rank 1 operation:
+ *   - \f$ A \leftarrow \alpha x x^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor applied to the product \f$ x x^T \f$.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of first element of matrix \b A in buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if the \b A or \b X object is invalid,
+ *     or is an image object rather than a buffer object;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYR
+ */
+clblasStatus
+clblasSsyr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+
+/**
+ * @brief Symmetric rank 1 operation with a general triangular matrix and
+ * double elements.
+ *
+ * Symmetric rank 1 operation:
+ *   - \f$ A \leftarrow \alpha x x^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor applied to the product \f$ x x^T \f$.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of first element of matrix \b A in buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasSsyr() function otherwise.
+ *
+ * @ingroup SYR
+ */
+
+clblasStatus
+clblasDsyr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/*@}*/
+
+
+/**
+ * @defgroup HER HER   - Hermitian rank 1 operation
+ *
+ * Level 2 Basic Linear Algebra Subprograms functions that perform
+ * Hermitian rank 1 operations.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Hermitian rank 1 operation with a general triangular matrix and
+ * float-complex elements.
+ *
+ * Hermitian rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor applied to \f$ X X^H \f$ (a scalar float value).
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if the \b A or \b X object is invalid,
+ *     or is an image object rather than a buffer object;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HER
+ */
+clblasStatus
+clblasCher(
+	clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/**
+ * @example example_cher.c
+ * Example of how to use the @ref clblasCher function.
+ */
+
+/**
+ * @brief Hermitian rank 1 operation with a general triangular matrix and
+ * double-complex elements.
+ *
+ * Hermitian rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor applied to \f$ X X^H \f$ (a scalar double value).
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasCher() function otherwise.
+ *
+ * @ingroup HER
+ */
+clblasStatus
+clblasZher(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/*@}*/
+
+/**
+ * @defgroup SYR2 SYR2  - Symmetric rank 2 update
+ *
+ * Level 2 Basic Linear Algebra Subprograms functions that perform
+ * symmetric rank 2 update operations.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Symmetric rank 2 operation with a general triangular matrix and
+ * float elements.
+ *
+ * Symmetric rank 2 operation:
+ *   - \f$ A \leftarrow \alpha x y^T + \alpha y x^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     Scalar factor applied to both vector products.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A     Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of first element of matrix \b A in buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if the \b A, \b X, or \b Y object is
+ *     invalid, or is an image object rather than a buffer object;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYR2
+ */
+
+clblasStatus
+clblasSsyr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int  incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+
+/**
+ * @brief Symmetric rank 2 operation with a general triangular matrix and
+ * double elements.
+ *
+ * Symmetric rank 2 operation:
+ *   - \f$ A \leftarrow \alpha x y^T + \alpha y x^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A  Buffer object storing matrix \b A; updated in place.
+ * @param[in] offa      Offset of first element of matrix \b A in buffer object.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if either \b A, \b X, or \b Y object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYR2
+ */
+
+clblasStatus
+clblasDsyr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+
+/*@}*/
+
+/**
+ * @defgroup HER2 HER2  - Hermitian rank 2 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * Hermitian rank 2 update operations.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Hermitian rank 2 operation with a general triangular matrix and
+ * float-complex elements.
+ *
+ * Hermitian rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + \overline{ \alpha } Y X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A  Buffer object storing matrix \b A; updated in place.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if either \b A, \b X, or \b Y object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HER2
+ */
+clblasStatus
+clblasCher2(
+	clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+
+
+/**
+ * @brief Hermitian rank 2 operation with a general triangular matrix and
+ * double-complex elements.
+ *
+ * Hermitian rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + \overline{ \alpha } Y X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] A  Buffer object storing matrix \b A; updated in place.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasCher2() function otherwise.
+ *
+ * @ingroup HER2
+ */
+clblasStatus
+clblasZher2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+	cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+
+/**
+ * @example example_zher2.c
+ * Example of how to use the @ref clblasZher2 function.
+ */
+
+/*@}*/
+
+/**
+ * @defgroup TPMV TPMV  - Triangular packed matrix-vector multiply
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a packed triangular matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b AP is to be transposed.
+ * @param[in] diag				Specify whether matrix \b AP is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b AP.
+ * @param[in] AP				Buffer object storing matrix \b AP in packed format.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b AP.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the result.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero;
+ *   - \b clblasInvalidMemObject if either \b AP or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPMV
+ */
+clblasStatus
+clblasStpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_stpmv.c
+ * Example of how to use the @ref clblasStpmv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a packed triangular matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b AP is to be transposed.
+ * @param[in] diag				Specify whether matrix \b AP is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b AP.
+ * @param[in] AP				Buffer object storing matrix \b AP in packed format.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b AP.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the result.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasStpmv() function otherwise.
+ *
+ * @ingroup TPMV
+ */
+clblasStatus
+clblasDtpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-vector product with a packed triangular matrix and
+ * float-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b AP is to be transposed.
+ * @param[in] diag				Specify whether matrix \b AP is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b AP.
+ * @param[in] AP				Buffer object storing matrix \b AP in packed format.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b AP.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the result.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasStpmv() function.
+ * @ingroup TPMV
+ */
+clblasStatus
+clblasCtpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-vector product with a packed triangular matrix and
+ * double-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo				The triangle in matrix being referenced.
+ * @param[in] trans				How matrix \b AP is to be transposed.
+ * @param[in] diag				Specify whether matrix \b AP is unit triangular.
+ * @param[in] N					Number of rows/columns in matrix \b AP.
+ * @param[in] AP				Buffer object storing matrix \b AP in packed format.
+ * @param[in] offa				Offset in number of elements for first element in matrix \b AP.
+ * @param[in,out] X			Buffer object storing vector \b X; overwritten with the result.
+ * @param[in] offx				Offset in number of elements for first element in vector \b X.
+ * @param[in] incx				Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff		Temporary cl_mem scratch buffer object which can hold a
+ *								minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasDtpmv() function.
+ * @ingroup TPMV
+ */
+clblasStatus
+clblasZtpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+
+/**
+ * @defgroup TPSV TPSV  - Triangular packed matrix vector solve
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief solving triangular packed matrix problems with float elements.
+ *
+ * Triangular solves (the solution overwrites \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A in packed format.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[in,out] X             Buffer object storing vector \b X; overwritten with the solution.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero (the packed matrix has no leading
+ *       dimension to validate);
+ *   - \b clblasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPSV
+ */
+
+clblasStatus
+clblasStpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_stpsv.c
+ * Example of how to use the @ref clblasStpsv function.
+ */
+
+/**
+ * @brief solving triangular packed matrix problems with double elements.
+ *
+ * Triangular solves (the solution overwrites \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A in packed format.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[in,out] X             Buffer object storing vector \b X; overwritten with the solution.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero (the packed matrix has no leading
+ *       dimension to validate);
+ *   - \b clblasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPSV
+ */
+
+clblasStatus
+clblasDtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief solving triangular packed matrix problems with float complex elements.
+ *
+ * Triangular solves (the solution overwrites \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A in packed format.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[in,out] X             Buffer object storing vector \b X; overwritten with the solution.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero (the packed matrix has no leading
+ *       dimension to validate);
+ *   - \b clblasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPSV
+ */
+
+clblasStatus
+clblasCtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief solving triangular packed matrix problems with double complex elements.
+ *
+ * Triangular solves (the solution overwrites \b X):
+ *   - \f$ X \leftarrow  A^{-1} X \f$
+ *   - \f$ X \leftarrow  A^{-T} X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo              The triangle in matrix being referenced.
+ * @param[in] trans             How matrix \b A is to be transposed.
+ * @param[in] diag              Specify whether matrix \b A is unit triangular.
+ * @param[in] N                 Number of rows/columns in matrix \b A.
+ * @param[in] A                 Buffer object storing matrix \b A in packed format.
+ * @param[in] offa              Offset in number of elements for first element in matrix \b A.
+ * @param[in,out] X             Buffer object storing vector \b X; overwritten with the solution.
+ * @param[in] offx              Offset in number of elements for first element in vector \b X.
+ * @param[in] incx              Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero (the packed matrix has no leading
+ *       dimension to validate);
+ *   - \b clblasInvalidMemObject if either \b A or \b X object is
+ *     Invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TPSV
+ */
+
+clblasStatus
+clblasZtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+/**
+ * @defgroup SPMV SPMV  - Symmetric packed matrix vector multiply
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a symmetric packed-matrix and float elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b AP.
+ * @param[in] alpha     The factor of matrix \b AP.
+ * @param[in] AP        Buffer object storing matrix \b AP.
+ * @param[in] offa		Offset in number of elements for first element in matrix \b AP.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y; updated in place.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the matrix sizes or the vector sizes along with the increments lead to
+ *       accessing outsize of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b AP, \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SPMV
+ */
+clblasStatus
+clblasSspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_sspmv.c
+ * This is an example of how to use the @ref clblasSspmv function.
+ */
+
+/**
+ * @brief Matrix-vector product with a symmetric packed matrix and double elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b AP.
+ * @param[in] alpha     The factor of matrix \b AP.
+ * @param[in] AP        Buffer object storing matrix \b AP.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b AP.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasSspmv() function otherwise.
+ *
+ * @ingroup SPMV
+ */
+clblasStatus
+clblasDspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+
+/**
+ * @defgroup HPMV HPMV  - Hermitian packed matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a packed Hermitian matrix and float-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b AP.
+ * @param[in] alpha     The factor of matrix \b AP.
+ * @param[in] AP        Buffer object storing packed matrix \b AP.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b AP.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero, or
+ *     - the matrix sizes or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b AP, \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs to
+ *     was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HPMV
+ */
+clblasStatus
+clblasChpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_chpmv.c
+ * This is an example of how to use the @ref clblasChpmv function.
+ */
+
+
+/**
+ * @brief Matrix-vector product with a packed Hermitian matrix and double-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in matrix \b AP.
+ * @param[in] alpha     The factor of matrix \b AP.
+ * @param[in] AP        Buffer object storing packed matrix \b AP.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b AP.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasChpmv() function otherwise.
+ *
+ * @ingroup HPMV
+ */
+clblasStatus
+clblasZhpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+/**
+ * @defgroup SPR SPR   - Symmetric packed matrix rank 1 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * symmetric rank 1 update operations on packed matrices.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Symmetric rank 1 operation with a symmetric packed matrix and
+ * float elements.
+ *
+ * Symmetric rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset of first element of matrix \b AP in buffer object.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero;
+ *   - \b clblasInvalidMemObject if either the \b AP or \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SPR
+ */
+clblasStatus
+clblasSspr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/**
+ * @example example_sspr.c
+ * Example of how to use the @ref clblasSspr function.
+ */
+
+/**
+ * @brief Symmetric rank 1 operation with a symmetric packed matrix and
+ * double elements.
+ *
+ * Symmetric rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset of first element of matrix \b AP in buffer object.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasSspr() function otherwise.
+ *
+ * @ingroup SPR
+ */
+
+clblasStatus
+clblasDspr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/*@}*/
+
+/**
+ * @defgroup HPR HPR   - Hermitian packed matrix rank 1 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * Hermitian rank 1 update operations on packed matrices.
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Hermitian rank 1 operation with a Hermitian packed matrix and
+ * float-complex elements.
+ *
+ * Hermitian rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A (a scalar float value).
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] AP    Buffer object storing matrix \b AP.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b AP.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - \b incx is zero;
+ *   - \b clblasInvalidMemObject if either the \b AP or \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HPR
+ */
+clblasStatus
+clblasChpr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int  incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/**
+ * @example example_chpr.c
+ * Example of how to use the @ref clblasChpr function.
+ */
+
+/**
+ * @brief Hermitian rank 1 operation with a Hermitian packed matrix and
+ * double-complex elements.
+ *
+ * Hermitian rank 1 operation:
+ *   - \f$ A \leftarrow \alpha X X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A (a scalar double value).
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in,out] AP    Buffer object storing matrix \b AP.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b AP.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasChpr() function otherwise.
+ *
+ * @ingroup HPR
+ */
+clblasStatus
+clblasZhpr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/*@}*/
+
+/**
+ * @defgroup SPR2 SPR2  - Symmetric packed matrix rank 2 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * symmetric rank 2 update operations on packed matrices
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Symmetric rank 2 operation with a symmetric packed matrix and
+ * float elements.
+ *
+ * Symmetric rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^T + \alpha Y X^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset of first element of matrix \b AP in buffer object.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero;
+ *   - \b clblasInvalidMemObject if either \b AP, \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SPR2
+ */
+
+clblasStatus
+clblasSspr2(
+	clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/**
+ * @example example_sspr2.c
+ * Example of how to use the @ref clblasSspr2 function.
+ */
+
+/**
+ * @brief Symmetric rank 2 operation with a symmetric packed matrix and
+ * double elements.
+ *
+ * Symmetric rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^T + \alpha Y X^T + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset of first element of matrix \b AP in buffer object.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasSspr2() function otherwise.
+ *
+ * @ingroup SPR2
+ */
+
+clblasStatus
+clblasDspr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+	cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+/*@}*/
+
+/**
+ * @defgroup HPR2 HPR2  - Hermitian packed matrix rank 2 update
+ *
+ * The Level 2 Basic Linear Algebra Subprograms are functions that perform
+ * hermitian rank 2 update operations on packed matrices
+ * @ingroup BLAS2
+ */
+
+/*@{*/
+/**
+ * @brief Hermitian rank 2 operation with a Hermitian packed matrix and
+ * float-complex elements.
+ *
+ * Hermitian rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + \overline{\alpha} Y X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b AP.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b N is zero, or
+ *     - either \b incx or \b incy is zero;
+ *   - \b clblasInvalidMemObject if either \b AP, \b X, or \b Y object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HPR2
+ */
+clblasStatus
+clblasChpr2(
+	clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+
+
+/**
+ * @brief Hermitian rank 2 operation with a Hermitian packed matrix and
+ * double-complex elements.
+ *
+ * Hermitian rank 2 operation:
+ *   - \f$ A \leftarrow \alpha X Y^H + \overline{\alpha} Y X^H + A \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of columns in matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for the first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] Y         Buffer object storing vector \b Y.
+ * @param[in] offy      Offset in number of elements for the first element in vector \b Y.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in,out] AP    Buffer object storing packed-matrix \b AP.
+ * @param[in] offa      Offset in number of elements for the first element in matrix \b AP.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasChpr2() function otherwise.
+ *
+ * @ingroup HPR2
+ */
+clblasStatus
+clblasZhpr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+	cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events);
+
+/**
+ * @example example_zhpr2.c
+ * Example of how to use the @ref clblasZhpr2 function.
+ */
+/*@}*/
+
+
+
+/**
+ * @defgroup GBMV GBMV  - General banded matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a general rectangular banded matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *   - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in banded matrix \b A.
+ * @param[in] N         Number of columns in banded matrix \b A.
+ * @param[in] KL        Number of sub-diagonals in banded matrix \b A.
+ * @param[in] KU        Number of super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of banded matrix \b A.
+ * @param[in] A         Buffer object storing banded matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in banded matrix \b A.
+ * @param[in] lda       Leading dimension of banded matrix \b A. It cannot be less
+ *                      than ( \b KL + \b KU + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] beta      The factor of the vector \b Y.
+ * @param[in,out] Y     Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b M or \b N is zero, or
+ *     - \b KL is greater than \b M - 1, or
+ *     - \b KU is greater than \b N - 1, or
+ *     - either \b incx or \b incy is zero, or
+ *     - any of the leading dimensions is invalid, or
+ *     - the matrix size or the vector sizes along with the increments lead to
+ *       accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if any of the \b A, \b X, or \b Y objects is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup GBMV
+ */
+clblasStatus
+clblasSgbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_sgbmv.c
+ * Example of how to use the @ref clblasSgbmv function.
+ */
+
+
+/**
+ * @brief Matrix-vector product with a general rectangular banded matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *   - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in banded matrix \b A.
+ * @param[in] N         Number of columns in banded matrix \b A.
+ * @param[in] KL        Number of sub-diagonals in banded matrix \b A.
+ * @param[in] KU        Number of super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of banded matrix \b A.
+ * @param[in] A         Buffer object storing banded matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in banded matrix \b A.
+ * @param[in] lda       Leading dimension of banded matrix \b A. It cannot be less
+ *                      than ( \b KL + \b KU + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] beta      The factor of the vector \b Y.
+ * @param[in,out] Y     Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasSgbmv() function otherwise.
+ *
+ * @ingroup GBMV
+ */
+clblasStatus
+clblasDgbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Matrix-vector product with a general rectangular banded matrix and
+ * float-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *   - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in banded matrix \b A.
+ * @param[in] N         Number of columns in banded matrix \b A.
+ * @param[in] KL        Number of sub-diagonals in banded matrix \b A.
+ * @param[in] KU        Number of super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of banded matrix \b A.
+ * @param[in] A         Buffer object storing banded matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in banded matrix \b A.
+ * @param[in] lda       Leading dimension of banded matrix \b A. It cannot be less
+ *                      than ( \b KL + \b KU + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] beta      The factor of the vector \b Y.
+ * @param[in,out] Y     Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasSgbmv() function.
+ *
+ * @ingroup GBMV
+ */
+clblasStatus
+clblasCgbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Matrix-vector product with a general rectangular banded matrix and
+ * double-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *   - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] M         Number of rows in banded matrix \b A.
+ * @param[in] N         Number of columns in banded matrix \b A.
+ * @param[in] KL        Number of sub-diagonals in banded matrix \b A.
+ * @param[in] KU        Number of super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of banded matrix \b A.
+ * @param[in] A         Buffer object storing banded matrix \b A.
+ * @param[in] offa      Offset in number of elements for the first element in banded matrix \b A.
+ * @param[in] lda       Leading dimension of banded matrix \b A. It cannot be less
+ *                      than ( \b KL + \b KU + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] beta      The factor of the vector \b Y.
+ * @param[in,out] Y     Buffer object storing the vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of \b Y. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasDgbmv() function.
+ *
+ * @ingroup GBMV
+ */
+clblasStatus
+clblasZgbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+/**
+ * @defgroup TBMV TBMV  - Triangular banded matrix vector multiply
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a triangular banded matrix and
+ * float elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order       Row/column order.
+ * @param[in] uplo        The triangle in matrix being referenced.
+ * @param[in] trans       How matrix \b A is to be transposed.
+ * @param[in] diag        Specify whether matrix \b A is unit triangular.
+ * @param[in] N           Number of rows/columns in banded matrix \b A.
+ * @param[in] K           Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A           Buffer object storing matrix \b A.
+ * @param[in] offa        Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda         Leading dimension of matrix \b A. It cannot be less
+ *                        than ( \b K + 1 )
+ * @param[in,out] X       Buffer object storing vector \b X.
+ * @param[in] offx        Offset in number of elements for first element in vector \b X.
+ * @param[in] incx        Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a
+ *                        minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - \b K is greater than \b N - 1, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if either \b A or \b X object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TBMV
+ */
+clblasStatus
+clblasStbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_stbmv.c
+ * Example of how to use the @ref clblasStbmv function.
+ */
+
+
+/**
+ * @brief Matrix-vector product with a triangular banded matrix and
+ * double elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order       Row/column order.
+ * @param[in] uplo        The triangle in matrix being referenced.
+ * @param[in] trans       How matrix \b A is to be transposed.
+ * @param[in] diag        Specify whether matrix \b A is unit triangular.
+ * @param[in] N           Number of rows/columns in banded matrix \b A.
+ * @param[in] K           Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A           Buffer object storing matrix \b A.
+ * @param[in] offa        Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda         Leading dimension of matrix \b A. It cannot be less
+ *                        than ( \b K + 1 )
+ * @param[in,out] X       Buffer object storing vector \b X.
+ * @param[in] offx        Offset in number of elements for first element in vector \b X.
+ * @param[in] incx        Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a
+ *                        minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasStbmv() function otherwise.
+ *
+ * @ingroup TBMV
+ */
+clblasStatus
+clblasDtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Matrix-vector product with a triangular banded matrix and
+ * float-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order       Row/column order.
+ * @param[in] uplo        The triangle in matrix being referenced.
+ * @param[in] trans       How matrix \b A is to be transposed.
+ * @param[in] diag        Specify whether matrix \b A is unit triangular.
+ * @param[in] N           Number of rows/columns in banded matrix \b A.
+ * @param[in] K           Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A           Buffer object storing matrix \b A.
+ * @param[in] offa        Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda         Leading dimension of matrix \b A. It cannot be less
+ *                        than ( \b K + 1 )
+ * @param[in,out] X       Buffer object storing vector \b X.
+ * @param[in] offx        Offset in number of elements for first element in vector \b X.
+ * @param[in] incx        Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a
+ *                        minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasStbmv() function.
+ *
+ * @ingroup TBMV
+ */
+clblasStatus
+clblasCtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Matrix-vector product with a triangular banded matrix and
+ * double-complex elements.
+ *
+ * Matrix-vector products:
+ *   - \f$ X \leftarrow  A X \f$
+ *   - \f$ X \leftarrow  A^T X \f$
+ *
+ * @param[in] order       Row/column order.
+ * @param[in] uplo        The triangle in matrix being referenced.
+ * @param[in] trans       How matrix \b A is to be transposed.
+ * @param[in] diag        Specify whether matrix \b A is unit triangular.
+ * @param[in] N           Number of rows/columns in banded matrix \b A.
+ * @param[in] K           Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A           Buffer object storing matrix \b A.
+ * @param[in] offa        Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda         Leading dimension of matrix \b A. It cannot be less
+ *                        than ( \b K + 1 )
+ * @param[in,out] X       Buffer object storing vector \b X.
+ * @param[in] offx        Offset in number of elements for first element in vector \b X.
+ * @param[in] incx        Increment for the elements of \b X. Must not be zero.
+ * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a
+ *                        minimum of (1 + (N-1)*abs(incx)) elements
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasDtbmv() function.
+ *
+ * @ingroup TBMV
+ */
+clblasStatus
+clblasZtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+/**
+ * @defgroup SBMV SBMV  - Symmetric banded matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a symmetric banded matrix and float elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in banded matrix \b A.
+ * @param[in] K         Number of sub-diagonals/super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than ( \b K + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - any of \b N, \b incx, or \b incy is zero, or
+ *     - \b K is greater than \b N - 1, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if any of the \b A, \b X, or \b Y objects is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SBMV
+ */
+clblasStatus
+clblasSsbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_ssbmv.c
+ * This is an example of how to use the @ref clblasSsbmv function.
+ */
+
+
+/**
+ * @brief Matrix-vector product with a symmetric banded matrix and double elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in banded matrix \b A.
+ * @param[in] K         Number of sub-diagonals/super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than ( \b K + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasSsbmv() function otherwise.
+ *
+ * @ingroup SBMV
+ */
+clblasStatus
+clblasDsbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+
+/**
+ * @defgroup HBMV HBMV  - Hermitian banded matrix-vector multiplication
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-vector product with a Hermitian banded matrix and
+ * float-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in banded matrix \b A.
+ * @param[in] K         Number of sub-diagonals/super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than ( \b K + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - any of \b N, \b incx, or \b incy is zero, or
+ *     - \b K is greater than \b N - 1, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if any of the \b A, \b X, or \b Y objects is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context that a passed command queue
+ *     belongs to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HBMV
+ */
+clblasStatus
+clblasChbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_chbmv.c
+ * This is an example of how to use the @ref clblasChbmv function.
+ */
+
+
+/**
+ * @brief Matrix-vector product with a Hermitian banded matrix and
+ * double-complex elements.
+ *
+ * Matrix-vector products:
+ * - \f$ Y \leftarrow \alpha A X + \beta Y \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] N         Number of rows and columns in banded matrix \b A.
+ * @param[in] K         Number of sub-diagonals/super-diagonals in banded matrix \b A.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than ( \b K + 1 )
+ * @param[in] X         Buffer object storing vector \b X.
+ * @param[in] offx      Offset of first element of vector \b X in buffer object.
+ *                      Counted in elements.
+ * @param[in] incx      Increment for the elements of vector \b X. It cannot be zero.
+ * @param[in] beta      The factor of vector \b Y.
+ * @param[in,out] Y     Buffer object storing vector \b Y.
+ * @param[in] offy      Offset of first element of vector \b Y in buffer object.
+ *                      Counted in elements.
+ * @param[in] incy      Increment for the elements of vector \b Y. It cannot be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasChbmv() function otherwise.
+ *
+ * @ingroup HBMV
+ */
+clblasStatus
+clblasZhbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+
+/**
+ * @defgroup TBSV TBSV  - Solving triangular banded matrix
+ * @ingroup BLAS2
+ */
+/*@{*/
+
+/**
+ * @brief Solving triangular banded matrix problems with float elements.
+ *
+ * Systems of equations being solved:
+ *   - \f$ A X \leftarrow  X \f$
+ *   - \f$ A^T X \leftarrow  X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix \b A is unit triangular.
+ * @param[in] N         Number of rows/columns in banded matrix \b A.
+ * @param[in] K         Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than ( \b K + 1 ).
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b incx is zero, or
+ *     - \b K is greater than \b N - 1, or
+ *     - the leading dimension is invalid;
+ *   - \b clblasInvalidMemObject if either \b A or \b X object is
+ *     invalid, or an image object rather than a buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup TBSV
+ */
+ clblasStatus
+clblasStbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_stbsv.c
+ * This is an example of how to use the @ref clblasStbsv function.
+ */
+
+
+/**
+ * @brief Solving triangular banded matrix problems with double elements.
+ *
+ * Systems of equations being solved:
+ *   - \f$ A X \leftarrow  X \f$
+ *   - \f$ A^T X \leftarrow  X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix \b A is unit triangular.
+ * @param[in] N         Number of rows/columns in banded matrix \b A.
+ * @param[in] K         Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than ( \b K + 1 ).
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasStbsv() function otherwise.
+ *
+ * @ingroup TBSV
+ */
+clblasStatus
+clblasDtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Solving triangular banded matrix problems with float-complex elements.
+ *
+ * Systems of equations being solved:
+ *   - \f$ A X \leftarrow  X \f$
+ *   - \f$ A^T X \leftarrow  X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix \b A is unit triangular.
+ * @param[in] N         Number of rows/columns in banded matrix \b A.
+ * @param[in] K         Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than ( \b K + 1 ).
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasStbsv() function.
+ *
+ * @ingroup TBSV
+ */
+clblasStatus
+clblasCtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Solving triangular banded matrix problems with double-complex elements.
+ *
+ * Systems of equations being solved:
+ *   - \f$ A X \leftarrow  X \f$
+ *   - \f$ A^T X \leftarrow  X \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] trans     How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix \b A is unit triangular.
+ * @param[in] N         Number of rows/columns in banded matrix \b A.
+ * @param[in] K         Number of sub-diagonals/super-diagonals in triangular banded matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset in number of elements for first element in matrix \b A.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than ( \b K + 1 ).
+ * @param[out] X        Buffer object storing vector \b X.
+ * @param[in] offx      Offset in number of elements for first element in vector \b X.
+ * @param[in] incx      Increment for the elements of \b X. Must not be zero.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return The same result as the clblasDtbsv() function.
+ *
+ * @ingroup TBSV
+ */
+clblasStatus
+clblasZtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+
+/**
+ * @defgroup BLAS3 BLAS-3 functions
+ *
+ * The Level 3 Basic Linear Algebra Subprograms are functions that perform
+ * matrix-matrix operations.
+ */
+/*@{*/
+/*@}*/
+
+/**
+ * @defgroup GEMM GEMM - General matrix-matrix multiplication
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with float
+ *        elements. Extended version.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b K when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when the
+ *                      parameter is set to \b clblasColumnMajor.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b K
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offC      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when
+ *                      it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as clblasSgemm() otherwise.
+ *
+ * @ingroup GEMM
+ */
+clblasStatus
+clblasSgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_sgemm.c
+ * This is an example of how to use the @ref clblasSgemm function.
+ */
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with double
+ *        elements. Extended version.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offC      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clblasSgemm() function otherwise.
+ *
+ * @ingroup GEMM
+ */
+clblasStatus
+clblasDgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with float
+ *        complex elements. Extended version.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offC      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clblasSgemm() function otherwise.
+ *
+ * @ingroup GEMM
+ */
+clblasStatus
+clblasCgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    FloatComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-matrix product of general rectangular matrices with double
+ *        complex elements. Extended version.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A B^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] transB    How matrix \b B is to be transposed.
+ * @param[in] M         Number of rows in matrix \b A.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] K         Number of columns in matrix \b A and rows in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[out] C        Buffer object storing matrix \b C.
+ * @param[in] offC      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. For detailed description,
+ *                      see clblasSgemm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clblasSgemm() function otherwise.
+ *
+ * @ingroup GEMM
+ */
+clblasStatus
+clblasZgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup TRMM TRMM - Triangular matrix-matrix multiplication
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with float elements.
+ *        Extended version.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clblasLeft,\n or less than \b N when it is set
+ *                      to \b clblasRight.
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as clblasStrmm() otherwise.
+ *
+ * @ingroup TRMM
+ */
+clblasStatus
+clblasStrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_strmm.c
+ * This is an example of how to use the @ref clblasStrmm function.
+ */
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with double elements.
+ *        Extended version.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clblasStrmm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clblasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clblasStrmm() function otherwise.
+ *
+ * @ingroup TRMM
+ */
+clblasStatus
+clblasDtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with float complex
+ *        elements. Extended version.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clblasStrmm().
+ * @param[out] B        Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clblasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as clblasStrmm() otherwise.
+ *
+ * @ingroup TRMM
+ */
+clblasStatus
+clblasCtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Multiplying a matrix by a triangular matrix with double complex
+ *        elements. Extended version.
+ *
+ * Matrix-triangular matrix products:
+ *   - \f$ B \leftarrow \alpha A B \f$
+ *   - \f$ B \leftarrow \alpha A^T B \f$
+ *   - \f$ B \leftarrow \alpha B A \f$
+ *   - \f$ B \leftarrow \alpha B A^T \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clblasStrmm().
+ * @param[in,out] B     Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clblasStrmm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clblasStrmm() function otherwise.
+ *
+ * @ingroup TRMM
+ */
+clblasStatus
+clblasZtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup TRSM TRSM - Solving triangular systems of equations
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ *        sides and float elements. Extended version.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clblasLeft,\n or less than \b N
+ *                      when it is set to \b clblasRight.
+ * @param[in,out] B     Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as clblasStrsm() otherwise.
+ *
+ * @ingroup TRSM
+ */
+clblasStatus
+clblasStrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_strsm.c
+ * This is an example of how to use the @ref clblasStrsm function.
+ */
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ *        sides and double elements. Extended version.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clblasStrsm().
+ * @param[in,out] B     Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clblasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clblasStrsm() function otherwise.
+ *
+ * @ingroup TRSM
+ */
+clblasStatus
+clblasDtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ *        sides and float complex elements. Extended version.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clblasStrsm().
+ * @param[in,out] B     Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clblasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as clblasStrsm() otherwise.
+ *
+ * @ingroup TRSM
+ */
+clblasStatus
+clblasCtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Solving triangular systems of equations with multiple right-hand
+ *        sides and double complex elements. Extended version.
+ *
+ * Solving triangular systems of equations:
+ *   - \f$ B \leftarrow \alpha A^{-1} B \f$
+ *   - \f$ B \leftarrow \alpha A^{-T} B \f$
+ *   - \f$ B \leftarrow \alpha B A^{-1} \f$
+ *   - \f$ B \leftarrow \alpha B A^{-T} \f$
+ *
+ * where \b A is an upper or lower triangular matrix.
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side of triangular matrix.
+ * @param[in] uplo      The triangle in matrix being referenced.
+ * @param[in] transA    How matrix \b A is to be transposed.
+ * @param[in] diag      Specify whether matrix is unit triangular.
+ * @param[in] M         Number of rows in matrix \b B.
+ * @param[in] N         Number of columns in matrix \b B.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offA      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. For detailed
+ *                      description, see clblasStrsm().
+ * @param[in,out] B     Buffer object storing matrix \b B.
+ * @param[in] offB      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. For detailed
+ *                      description, see clblasStrsm().
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA or \b offB exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clblasStrsm() function otherwise.
+ *
+ * @ingroup TRSM
+ */
+clblasStatus
+clblasZtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup SYRK SYRK - Symmetric rank-k update of a matrix
+ * @ingroup BLAS3
+ */
+
+/*@{*/
+
+/**
+ * @brief Rank-k update of a symmetric matrix with float elements.
+ *        Extended version.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA or \b offC exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clblasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+clblasStatus
+clblasSsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_float beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_ssyrk.c
+ * This is an example of how to use the @ref clblasSsyrk function.
+ */
+
+/**
+ * @brief Rank-k update of a symmetric matrix with double elements.
+ *        Extended version.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clblasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA or \b offC exceeds the size
+ *        of the respective buffer object;
+ *   - the same error codes as the clblasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+clblasStatus
+clblasDsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_double beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Rank-k update of a symmetric matrix with complex float elements.
+ *        Extended version.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clblasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA or \b offC exceeds the size
+ *        of the respective buffer object;
+ *   - \b clblasInvalidValue if \b transA is set to \ref clblasConjTrans;
+ *   - the same error codes as the clblasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+clblasStatus
+clblasCsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    FloatComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Rank-k update of a symmetric matrix with complex double elements.
+ *        Extended version.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clblasSsyrk().
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA or \b offC exceeds the size
+ *        of the respective buffer object;
+ *   - \b clblasInvalidValue if \b transA is set to \ref clblasConjTrans;
+ *   - the same error codes as the clblasSsyrk() function otherwise.
+ *
+ * @ingroup SYRK
+ */
+clblasStatus
+clblasZsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/*@}*/
+
+/**
+ * @defgroup SYR2K SYR2K - Symmetric rank-2k update to a matrix
+ * @ingroup BLAS3
+ */
+
+/*@{*/
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with float elements.
+ *        Extended version.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be less
+ *                       than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] offB       Offset of the first element of the matrix \b B in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldb        Leading dimension of matrix \b B. It cannot be less
+ *                       than \b K if \b B matches to the op(\b B) matrix
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clblasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+clblasStatus
+clblasSsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transAB,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @example example_ssyr2k.c
+ * This is an example of how to use the @ref clblasSsyr2k function.
+ */
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with double elements.
+ *        Extended version.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clblasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] offB       Offset of the first element of the matrix \b B in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clblasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - the same error codes as the clblasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+clblasStatus
+clblasDsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transAB,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with complex float elements.
+ *        Extended version.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clblasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] offB       Offset of the first element of the matrix \b B in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clblasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects, one per command queue, that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - \b clblasInvalidValue if \b transAB is set to \ref clblasConjTrans;
+ *   - the same error codes as the clblasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+clblasStatus
+clblasCsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transAB,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    FloatComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Rank-2k update of a symmetric matrix with complex double elements.
+ *        Extended version.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
+ *
+ * where \b C is a symmetric matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transAB    How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrices \b A and \b B if they
+ *                       are not transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrices \b A and \b B.
+ * @param[in] A          Buffer object storing matrix \b A.
+ * @param[in] offA       Offset of the first element of the matrix \b A in the
+ *                       buffer object. Counted in elements.
+ * @param[in] lda        Leading dimension of matrix \b A. For detailed
+ *                       description, see clblasSsyr2k().
+ * @param[in] B          Buffer object storing matrix \b B.
+ * @param[in] offB       Offset of the first element of the matrix \b B in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldb        Leading dimension of matrix \b B. For detailed
+ *                       description, see clblasSsyr2k().
+ * @param[in] beta       The factor of matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offC       Offset of the first element of the matrix \b C in the
+ *                       buffer object. Counted in elements.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects, one per command queue, that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *        point arithmetic with double precision;
+ *   - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds
+ *        the size of the respective buffer object;
+ *   - \b clblasInvalidValue if \b transAB is set to \ref clblasConjTrans;
+ *   - the same error codes as the clblasSsyr2k() function otherwise.
+ *
+ * @ingroup SYR2K
+ */
+clblasStatus
+clblasZsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transAB,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+/**
+ * @defgroup SYMM SYMM  - Symmetric matrix-matrix multiply
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-matrix product of symmetric rectangular matrices with float
+ * elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side on which the symmetric matrix \b A appears
+ *                      in the product.
+ * @param[in] uplo      The triangle of the matrix being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clblasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clblasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[in,out] C     Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when
+ *                      it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events              Event objects, one per command queue, that
+ *                                identify a particular kernel execution
+ *                                instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - any of the leading dimensions is invalid, or
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if A, B, or C object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clblasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image is available;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup SYMM
+ */
+clblasStatus
+clblasSsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_ssymm.c
+ * This is an example of how to use the @ref clblasSsymm function.
+ */
+
+
+/**
+ * @brief Matrix-matrix product of symmetric rectangular matrices with double
+ * elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side on which the symmetric matrix \b A appears
+ *                      in the product.
+ * @param[in] uplo      The triangle of the matrix being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clblasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clblasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[in,out] C     Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when
+ *                      it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events              Event objects, one per command queue, that
+ *                                identify a particular kernel execution
+ *                                instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasSsymm() function otherwise.
+ *
+ * @ingroup SYMM
+ */
+clblasStatus
+clblasDsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+
+/**
+ * @brief Matrix-matrix product of symmetric rectangular matrices with
+ * float-complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side on which the symmetric matrix \b A appears
+ *                      in the product.
+ * @param[in] uplo      The triangle of the matrix being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clblasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clblasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[in,out] C     Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when
+ *                      it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events              Event objects, one per command queue, that
+ *                                identify a particular kernel execution
+ *                                instance.
+ *
+ * @return The same result as the clblasSsymm() function.
+ *
+ * @ingroup SYMM
+ */
+clblasStatus
+clblasCsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+
+/**
+ * @brief Matrix-matrix product of symmetric rectangular matrices with
+ * double-complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side on which the symmetric matrix \b A appears
+ *                      in the product.
+ * @param[in] uplo      The triangle of the matrix being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clblasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clblasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[in,out] C     Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when
+ *                      it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events              Event objects, one per command queue, that
+ *                                identify a particular kernel execution
+ *                                instance.
+ *
+ * @return The same result as the clblasDsymm() function.
+ *
+ * @ingroup SYMM
+ */
+clblasStatus
+clblasZsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+/**
+ * @defgroup HEMM HEMM  - Hermitian matrix-matrix multiplication
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Matrix-matrix product of Hermitian rectangular matrices with
+ * float-complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side on which the Hermitian matrix \b A appears
+ *                      in the product.
+ * @param[in] uplo      The triangle of the matrix being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clblasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clblasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[in,out] C     Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when
+ *                      it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects, one per command queue, that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - \b M or \b N is zero, or
+ *     - any of the leading dimensions is invalid, or
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if A, B, or C object is invalid,
+ *     or an image object rather than the buffer one;
+ *   - \b clblasOutOfResources if you use image-based function implementation
+ *     and no suitable scratch image is available;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context a passed command queue belongs
+ *     to was released;
+ *   - \b clblasInvalidOperation if kernel compilation relating to a previous
+ *     call has not completed for any of the target devices;
+ *   - \b clblasCompilerNotAvailable if a compiler is not available;
+ *   - \b clblasBuildProgramFailure if there is a failure to build a program
+ *     executable.
+ *
+ * @ingroup HEMM
+ */
+clblasStatus
+clblasChemm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_chemm.cpp
+ * This is an example of how to use the @ref clblasChemm function.
+ */
+
+
+/**
+ * @brief Matrix-matrix product of Hermitian rectangular matrices with
+ * double-complex elements.
+ *
+ * Matrix-matrix products:
+ *   - \f$ C \leftarrow \alpha A B + \beta C \f$
+ *   - \f$ C \leftarrow \alpha B A + \beta C \f$
+ *
+ * @param[in] order     Row/column order.
+ * @param[in] side      The side on which the Hermitian matrix \b A appears
+ *                      in the product.
+ * @param[in] uplo      The triangle of the matrix being referenced.
+ * @param[in] M         Number of rows in matrices \b B and \b C.
+ * @param[in] N         Number of columns in matrices \b B and \b C.
+ * @param[in] alpha     The factor of matrix \b A.
+ * @param[in] A         Buffer object storing matrix \b A.
+ * @param[in] offa      Offset of the first element of the matrix \b A in the
+ *                      buffer object. Counted in elements.
+ * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
+ *                      than \b M when the \b side parameter is set to
+ *                      \b clblasLeft,\n or less than \b N when the
+ *                      parameter is set to \b clblasRight.
+ * @param[in] B         Buffer object storing matrix \b B.
+ * @param[in] offb      Offset of the first element of the matrix \b B in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M
+ *                      when it is set to \b clblasColumnMajor.
+ * @param[in] beta      The factor of matrix \b C.
+ * @param[in,out] C     Buffer object storing matrix \b C.
+ * @param[in] offc      Offset of the first element of the matrix \b C in the
+ *                      buffer object. Counted in elements.
+ * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
+ *                      than \b N when the \b order parameter is set to
+ *                      \b clblasRowMajor,\n or less than \b M when
+ *                      it is set to \b clblasColumnMajor.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects, one per command queue, that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasChemm() function otherwise.
+ *
+ * @ingroup HEMM
+ */
+clblasStatus
+clblasZhemm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+/**
+ * @defgroup HERK HERK  - Hermitian rank-k update to a matrix
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Rank-k update of a Hermitian matrix with float-complex elements.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^H + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^H A + \beta C \f$
+ *
+ * where \b C is a Hermitian matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offa       Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[in,out] C      Buffer object storing matrix \b C.
+ * @param[in] offc       Offset in number of elements for the first element in matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[in] events     Event objects, one per command queue, that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b K is zero, or
+ *     - any of the leading dimensions is invalid, or
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b A or \b C object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if the context a passed command queue belongs
+ *     to was released.
+ *
+ * @ingroup HERK
+ */
+clblasStatus
+clblasCherk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_cherk.cpp
+ * This is an example of how to use the @ref clblasCherk function.
+ */
+
+
+/**
+ * @brief Rank-k update of a hermitian matrix with double-complex elements.
+ *
+ * Rank-k updates:
+ *   - \f$ C \leftarrow \alpha A A^H + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^H A + \beta C \f$
+ *
+ * where \b C is a hermitian matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] transA     How matrix \b A is to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offa       Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offc       Offset in number of elements for the first element in matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasCherk() function otherwise.
+ *
+ * @ingroup HERK
+ */
+clblasStatus
+clblasZherk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+/**
+ * @defgroup HER2K HER2K  - Hermitian rank-2k update to a matrix
+ * @ingroup BLAS3
+ */
+/*@{*/
+
+/**
+ * @brief Rank-2k update of a hermitian matrix with float-complex elements.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^H B + conj( \alpha ) B^H A + \beta C \f$
+ *
+ * where \b C is a hermitian matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] trans      How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offa       Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise. Vice-versa for transpose case.
+ * @param[in] B          Buffer object storing the matrix \b B.
+ * @param[in] offb       Offset in number of elements for the first element in matrix \b B.
+ * @param[in] ldb        Leading dimension of matrix \b B. It cannot be
+ *                       less than \b K if \b B is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise. Vice-versa for transpose case.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offc       Offset in number of elements for the first element in matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasNotInitialized if clblasSetup() was not called;
+ *   - \b clblasInvalidValue if invalid parameters are passed:
+ *     - either \b N or \b K is zero,
+ *     - any of the leading dimensions is invalid, or
+ *     - the matrix sizes lead to accessing outside of any of the buffers;
+ *   - \b clblasInvalidMemObject if either \b A , \b B or \b C object is
+ *     invalid, or an image object rather than the buffer one;
+ *   - \b clblasOutOfHostMemory if the library can't allocate memory for
+ *     internal structures;
+ *   - \b clblasInvalidCommandQueue if the passed command queue is invalid;
+ *   - \b clblasInvalidContext if a context a passed command queue belongs to
+ *     was released.
+ *
+ * @ingroup HER2K
+ */
+clblasStatus
+clblasCher2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/**
+ * @example example_cher2k.c
+ * This is an example of how to use the @ref clblasCher2k function.
+ */
+
+
+/**
+ * @brief Rank-2k update of a hermitian matrix with double-complex elements.
+ *
+ * Rank-2k updates:
+ *   - \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$
+ *   - \f$ C \leftarrow \alpha A^H B + conj( \alpha ) B^H A + \beta C \f$
+ *
+ * where \b C is a hermitian matrix.
+ *
+ * @param[in] order      Row/column order.
+ * @param[in] uplo       The triangle in matrix \b C being referenced.
+ * @param[in] trans      How matrices \b A and \b B are to be transposed.
+ * @param[in] N          Number of rows and columns in matrix \b C.
+ * @param[in] K          Number of columns of the matrix \b A if it is not
+ *                       transposed, and number of rows otherwise.
+ * @param[in] alpha      The factor of matrix \b A.
+ * @param[in] A          Buffer object storing the matrix \b A.
+ * @param[in] offa       Offset in number of elements for the first element in matrix \b A.
+ * @param[in] lda        Leading dimension of matrix \b A. It cannot be
+ *                       less than \b K if \b A is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise. Vice-versa for transpose case.
+ * @param[in] B          Buffer object storing the matrix \b B.
+ * @param[in] offb       Offset in number of elements for the first element in matrix \b B.
+ * @param[in] ldb        Leading dimension of matrix \b B. It cannot be
+ *                       less than \b K if \b B is
+ *                       in the row-major format, and less than \b N
+ *                       otherwise. Vice-versa for transpose case.
+ * @param[in] beta       The factor of the matrix \b C.
+ * @param[out] C         Buffer object storing matrix \b C.
+ * @param[in] offc       Offset in number of elements for the first element in matrix \b C.
+ * @param[in] ldc        Leading dimension of matrix \b C. It cannot be less
+ *                       than \b N.
+ * @param[in] numCommandQueues    Number of OpenCL command queues in which the
+ *                                task is to be performed.
+ * @param[in] commandQueues       OpenCL command queues.
+ * @param[in] numEventsInWaitList Number of events in the event wait list.
+ * @param[in] eventWaitList       Event wait list.
+ * @param[out] events    Event objects per each command queue that identify
+ *                       a particular kernel execution instance.
+ *
+ * @return
+ *   - \b clblasSuccess on success;
+ *   - \b clblasInvalidDevice if a target device does not support floating
+ *     point arithmetic with double precision;
+ *   - the same error codes as the clblasCher2k() function otherwise.
+ *
+ * @ingroup HER2K
+ */
+clblasStatus
+clblasZher2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events);
+/*@}*/
+
+
+
+
+#ifdef __cplusplus
+}      /* extern "C" { */
+#endif
+
+#endif /* CLBLAS_H_ */
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
new file mode 100644
index 0000000..5154a31
--- /dev/null
+++ b/src/client/CMakeLists.txt
@@ -0,0 +1,68 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+# Translation units compiled into the `client` executable.
+set(CLIENT_SRC client.cpp stdafx.cpp statisticalTimer.cpp)
+# Headers passed to add_executable() together with CLIENT_SRC (see the
+# `client` target below).
+set(CLIENT_HEADER
+    stdafx.h
+    targetver.h
+    statisticalTimer.h
+    clfunc_common.hpp
+    clfunc_xgemm.hpp
+    clfunc_xgemv.hpp
+    clfunc_xsymv.hpp
+    clfunc_xtrmm.hpp
+    clfunc_xtrsm.hpp
+    clfunc_xsyrk.hpp
+    clfunc_xsyr2k.hpp)
+
+# Single source file of the `testPerfWrapper` executable.
+set(WRAPPER_SRC testPerfWrapper.cpp)
+
+# Applies to every target defined in this directory.
+# NOTE(review): presumably silences MSVC CRT deprecation warnings - confirm.
+add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+
+# Having problems on build server, compiling gtest headers with -pedantic; disabling detection of long long
+# http://code.google.com/p/googletest/issues/detail?id=334
+if( CMAKE_COMPILER_IS_GNUCXX )
+	add_definitions( -Wno-long-long )
+endif( )
+
+# Header search path: Boost, OpenCL, the clBLAS source tree (root, include/
+# and tests/include/) and this directory.
+include_directories(
+    ${Boost_INCLUDE_DIRS}
+    ${OPENCL_INCLUDE_DIRS}
+    ${clBLAS_SOURCE_DIR}
+    ${clBLAS_SOURCE_DIR}/include
+    ${clBLAS_SOURCE_DIR}/tests/include
+    .)
+
+# `client` links against Boost, OpenCL and the clBLAS library target.
+add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER})
+target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
+
+# `testPerfWrapper` only links against Boost.
+add_executable(testPerfWrapper ${WRAPPER_SRC})
+target_link_libraries(testPerfWrapper ${Boost_LIBRARIES})
+
+# Choose the install sub-directories from TARGET_PLATFORM: 64 selects
+# bin64/lib64, any other value selects bin32/lib32.
+if( TARGET_PLATFORM EQUAL 64 )
+    set( BIN_DIR bin64 )
+    set( LIB_DIR lib64 )
+else()
+    set( BIN_DIR bin32 )
+    set( LIB_DIR lib32 )
+endif()
+
+# CPack configuration; include the executable into the package
+install( TARGETS client testPerfWrapper
+		RUNTIME DESTINATION ${BIN_DIR}
+		LIBRARY DESTINATION ${LIB_DIR}
+		ARCHIVE DESTINATION ${LIB_DIR}/import
+		)
diff --git a/src/client/clGemm.h b/src/client/clGemm.h
new file mode 100644
index 0000000..8f201b7
--- /dev/null
+++ b/src/client/clGemm.h
@@ -0,0 +1,627 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <iostream>
+#include <clBLAS.h>
+#include <string>
+#include <map>
+
+// Shared status slot for this header: it receives the error code written by
+// clCreateContext, clCreateCommandQueue, clCreateBuffer, clEnqueueWriteBuffer,
+// clblasSetup and clGetEventInfo before that code is checked or printed.
+// NOTE(review): this is a variable definition inside a header, so including
+// the header from more than one translation unit yields duplicate symbols -
+// confirm it is only ever included once.
+cl_int gemm_err;
+
+std::string prettyPrintClStatus( const cl_int& status )
+{
+	switch( status )
+	{
+		case CL_INVALID_GLOBAL_WORK_SIZE:
+			return "CL_INVALID_GLOBAL_WORK_SIZE";
+		case CL_INVALID_MIP_LEVEL:
+			return "CL_INVALID_MIP_LEVEL";
+		case CL_INVALID_BUFFER_SIZE:
+			return "CL_INVALID_BUFFER_SIZE";
+		case CL_INVALID_GL_OBJECT:
+			return "CL_INVALID_GL_OBJECT";
+		case CL_INVALID_OPERATION:
+			return "CL_INVALID_OPERATION";
+		case CL_INVALID_EVENT:
+			return "CL_INVALID_EVENT";
+		case CL_INVALID_EVENT_WAIT_LIST:
+			return "CL_INVALID_EVENT_WAIT_LIST";
+		case CL_INVALID_GLOBAL_OFFSET:
+			return "CL_INVALID_GLOBAL_OFFSET";
+		case CL_INVALID_WORK_ITEM_SIZE:
+			return "CL_INVALID_WORK_ITEM_SIZE";
+		case CL_INVALID_WORK_GROUP_SIZE:
+			return "CL_INVALID_WORK_GROUP_SIZE";
+		case CL_INVALID_WORK_DIMENSION:
+			return "CL_INVALID_WORK_DIMENSION";
+		case CL_INVALID_KERNEL_ARGS:
+			return "CL_INVALID_KERNEL_ARGS";
+		case CL_INVALID_ARG_SIZE:
+			return "CL_INVALID_ARG_SIZE";
+		case CL_INVALID_ARG_VALUE:
+			return "CL_INVALID_ARG_VALUE";
+		case CL_INVALID_ARG_INDEX:
+			return "CL_INVALID_ARG_INDEX";
+		case CL_INVALID_KERNEL:
+			return "CL_INVALID_KERNEL";
+		case CL_INVALID_KERNEL_DEFINITION:
+			return "CL_INVALID_KERNEL_DEFINITION";
+		case CL_INVALID_KERNEL_NAME:
+			return "CL_INVALID_KERNEL_NAME";
+		case CL_INVALID_PROGRAM_EXECUTABLE:
+			return "CL_INVALID_PROGRAM_EXECUTABLE";
+		case CL_INVALID_PROGRAM:
+			return "CL_INVALID_PROGRAM";
+		case CL_INVALID_BUILD_OPTIONS:
+			return "CL_INVALID_BUILD_OPTIONS";
+		case CL_INVALID_BINARY:
+			return "CL_INVALID_BINARY";
+		case CL_INVALID_SAMPLER:
+			return "CL_INVALID_SAMPLER";
+		case CL_INVALID_IMAGE_SIZE:
+			return "CL_INVALID_IMAGE_SIZE";
+		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+			return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+		case CL_INVALID_MEM_OBJECT:
+			return "CL_INVALID_MEM_OBJECT";
+		case CL_INVALID_HOST_PTR:
+			return "CL_INVALID_HOST_PTR";
+		case CL_INVALID_COMMAND_QUEUE:
+			return "CL_INVALID_COMMAND_QUEUE";
+		case CL_INVALID_QUEUE_PROPERTIES:
+			return "CL_INVALID_QUEUE_PROPERTIES";
+		case CL_INVALID_CONTEXT:
+			return "CL_INVALID_CONTEXT";
+		case CL_INVALID_DEVICE:
+			return "CL_INVALID_DEVICE";
+		case CL_INVALID_PLATFORM:
+			return "CL_INVALID_PLATFORM";
+		case CL_INVALID_DEVICE_TYPE:
+			return "CL_INVALID_DEVICE_TYPE";
+		case CL_INVALID_VALUE:
+			return "CL_INVALID_VALUE";
+		case CL_MAP_FAILURE:
+			return "CL_MAP_FAILURE";
+		case CL_BUILD_PROGRAM_FAILURE:
+			return "CL_BUILD_PROGRAM_FAILURE";
+		case CL_IMAGE_FORMAT_NOT_SUPPORTED:
+			return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+		case CL_IMAGE_FORMAT_MISMATCH:
+			return "CL_IMAGE_FORMAT_MISMATCH";
+		case CL_MEM_COPY_OVERLAP:
+			return "CL_MEM_COPY_OVERLAP";
+		case CL_PROFILING_INFO_NOT_AVAILABLE:
+			return "CL_PROFILING_INFO_NOT_AVAILABLE";
+		case CL_OUT_OF_HOST_MEMORY:
+			return "CL_OUT_OF_HOST_MEMORY";
+		case CL_OUT_OF_RESOURCES:
+			return "CL_OUT_OF_RESOURCES";
+		case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+			return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+		case CL_COMPILER_NOT_AVAILABLE:
+			return "CL_COMPILER_NOT_AVAILABLE";
+		case CL_DEVICE_NOT_AVAILABLE:
+			return "CL_DEVICE_NOT_AVAILABLE";
+		case CL_DEVICE_NOT_FOUND:
+			return "CL_DEVICE_NOT_FOUND";
+		case CL_SUCCESS:
+			return "CL_SUCCESS";
+		default:
+			return "Error code not defined";
+		break;
+	}
+}
+
+//	Checks an OpenCL status code, whether it comes from a wrapped OpenCL call or from a variable.
+//	CL_SUCCESS is handed straight back to the caller. Any other code is formatted together with
+//	its symbolic name, the reporting line number and msg, echoed to std::cout and then thrown.
+//	Note: std::runtime_error does not take unicode strings as input, so only strings supported
+inline cl_int OpenCL_V_Throw( cl_int res, const std::string& msg, size_t lineno )
+{
+	if( res == CL_SUCCESS ) /**< No error */
+	{
+		return res;
+	}
+
+	std::stringstream report;
+	report << "OPENCL_V_THROWERROR< " << prettyPrintClStatus(res) << " > (" << lineno << "): " << msg;
+
+	const std::string text( report.str() );
+	std::cout << text << std::endl;
+	throw std::runtime_error( text );
+}
+#define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw(_status, _message, __LINE__)
+
+// Tags real versus complex element data. The numeric value of the enumerator
+// is used directly as a multiplier on element counts and byte sizes in the
+// buffers class (1 for real, 2 for complex), so the values must not change.
+enum complexity_t { not_complex = 1, yes_complex = 2 };
+
+//can be cl_float, cl_double
+//TODO should be cl_float2 and cl_double2 instead of using float/double * complexity?
+//
+// Host arrays and OpenCL buffers for the three GEMM operands A, B and C.
+// Every operand is sized as rows * leading dimension * complexity elements
+// (A: M x lda, B: K x ldb, C: M x ldc), and the same size is used for the host
+// allocation, the device allocation and every transfer between the two.
+// The object owns the host arrays and the cl_mem handles; it is not copyable.
+template< class T >
+class buffers
+{
+public:
+    size_t M, N, K;
+    size_t lda, ldb, ldc;
+    complexity_t complexity;
+    T* A;
+    T* B;
+    T* C;
+    cl_mem bufA, bufB, bufC;
+    cl_command_queue queue;
+    // Lookup tables keyed by operand name ("A", "B", "C").
+    std::map<std::string, T*> buffer_map;
+    std::map<std::string, size_t> rows_map;
+    std::map<std::string, size_t> ldx_map;
+
+    // Allocates host and device storage for the operands and fills both with
+    // the pattern produced by initializeLocalMatrix().
+    // Throws std::runtime_error (via OPENCL_V_THROW) if an OpenCL call fails.
+    buffers( cl_context ctx, cl_command_queue _queue,
+             size_t _M, size_t _N, size_t _K,
+             size_t _lda, size_t _ldb, size_t _ldc,
+             complexity_t _complexity )
+    : M(_M)
+    , N(_N)
+    , K(_K)
+    , lda(_lda)
+    , ldb(_ldb)
+    , ldc(_ldc)
+    , complexity(_complexity)
+    // operator new[] takes an element count, not a byte count, so sizeof(T)
+    // does not belong here.
+    , A(new T[M*lda*complexity])
+    , B(new T[K*ldb*complexity])
+    , C(new T[M*ldc*complexity])
+    , bufA(NULL)
+    , bufB(NULL)
+    , bufC(NULL)
+    , queue(_queue)
+    {
+        // request and initialize openCL memory
+        bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, byteCount(M, lda),
+                              NULL, &gemm_err);
+        OPENCL_V_THROW( gemm_err, "creating buffer A" );
+        bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, byteCount(K, ldb),
+                              NULL, &gemm_err);
+        OPENCL_V_THROW( gemm_err, "creating buffer B" );
+        bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, byteCount(M, ldc),
+                              NULL, &gemm_err);
+        OPENCL_V_THROW( gemm_err, "creating buffer C" );
+
+        buffer_map.insert(std::pair<std::string,T*>("A",A));
+        buffer_map.insert(std::pair<std::string,T*>("B",B));
+        buffer_map.insert(std::pair<std::string,T*>("C",C));
+        rows_map.insert(std::pair<std::string,size_t>("A",M));
+        rows_map.insert(std::pair<std::string,size_t>("B",K));
+        rows_map.insert(std::pair<std::string,size_t>("C",M));
+        ldx_map.insert(std::pair<std::string,size_t>("A",lda));
+        ldx_map.insert(std::pair<std::string,size_t>("B",ldb));
+        ldx_map.insert(std::pair<std::string,size_t>("C",ldc));
+
+        initialize_data();
+    }
+
+    // Frees the host arrays, then releases the device buffers. The host
+    // memory goes first so it is not leaked if a release reports an error.
+    ~buffers()
+    {
+        delete[] A;
+        delete[] B;
+        delete[] C;
+        OPENCL_V_THROW( clReleaseMemObject(bufA), "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(bufB), "releasing buffer B");
+        OPENCL_V_THROW( clReleaseMemObject(bufC), "releasing buffer C");
+    }
+
+    // Refills the host arrays and uploads each one to its own device buffer
+    // with a blocking write.
+    void initialize_data()
+    {
+        initializeLocalMatrix("A");
+        initializeLocalMatrix("B");
+        initializeLocalMatrix("C");
+
+        gemm_err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+            byteCount(M, lda), A, 0, NULL, NULL);
+        OPENCL_V_THROW( gemm_err, "writing to buffer A" );
+        gemm_err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
+            byteCount(K, ldb), B, 0, NULL, NULL);
+        OPENCL_V_THROW( gemm_err, "writing to buffer B" );
+        gemm_err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
+            byteCount(M, ldc), C, 0, NULL, NULL);
+        OPENCL_V_THROW( gemm_err, "writing to buffer C" );
+    }
+
+    // Copies device buffer C back into the host array C with a blocking read.
+    void read_back_result()
+    {
+        OPENCL_V_THROW( clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, byteCount(M, ldc), C, 0, NULL, NULL),
+                        "reading from buffer C" );
+    }
+
+    // Fills the named host array ("A", "B" or "C") so that the element at
+    // (i, j) holds (i+1)*10 + (j+1). An unknown name has zero rows and is
+    // therefore left untouched.
+    void initializeLocalMatrix(std::string matrix)
+    {
+        // Resolve the three lookups once instead of on every element.
+        const size_t rows = rows_map[matrix] * complexity;
+        const size_t ld = ldx_map[matrix];
+        T* const data = buffer_map[matrix];
+
+        for (size_t i = 0; i < rows; i++) {
+            for (size_t j = 0; j < ld; j++) {
+                data[i * ld + j] = static_cast<T>((i+1)*10 + (j+1));
+            }
+        }
+    }
+
+    // Prints the named host array to std::cout, one row per line, with every
+    // element truncated to int.
+    void printLocalMatrix(std::string matrix)
+    {
+        const size_t rows = rows_map[matrix] * complexity;
+        const size_t ld = ldx_map[matrix];
+        const T* const data = buffer_map[matrix];
+
+        for (size_t i = 0; i < rows; i++) {
+            for (size_t j = 0; j < ld; j++) {
+                std::cout << (int)data[i * ld + j] << " ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+    }
+
+private:
+    // Size in bytes of an operand with the given row count and leading dimension.
+    size_t byteCount(size_t rows, size_t ld) const
+    {
+        return rows * ld * sizeof(T) * complexity;
+    }
+
+    // Not copyable: a copy would delete[] the host arrays and release the
+    // cl_mem handles a second time. Declared but intentionally not defined.
+    buffers(const buffers&);
+    buffers& operator=(const buffers&);
+};
+
+// State and OpenCL plumbing shared by the typed GEMM cases below.
+// The constructor takes the first platform and the first device of the
+// requested type, creates a context and a command queue on it and calls
+// clblasSetup(); the destructor undoes that work. Subclasses provide the
+// operand buffers and the actual clblas*gemm call.
+class clGemm
+{
+public:
+    size_t M;
+    size_t N;
+    size_t K;
+    size_t lda;
+    size_t ldb;
+    size_t ldc;
+    clblasOrder order;
+    clblasTranspose transA;
+    clblasTranspose transB;
+    cl_context_properties props[3];
+    cl_platform_id platform;
+    cl_device_id device;
+    cl_context ctx;
+    cl_device_type deviceType;
+    cl_command_queue queue;
+    cl_event event;
+    cl_uint commandQueueFlags;
+    bool useimages;
+    cl_ulong imgA;
+    cl_ulong imgB;
+    StatisticalTimer& timer;
+    StatisticalTimer::sTimerID gemm_timer_id;
+
+    // Sets up the OpenCL context and queue, initialises clBLAS, optionally
+    // registers two scratch images and obtains a timer id named "clGemm".
+    // Throws std::runtime_error (via OPENCL_V_THROW) when an OpenCL call
+    // fails; terminates the process with exit(1) when clblasSetup() fails.
+    clGemm( size_t _M, size_t _N, size_t _K,
+            size_t _lda, size_t _ldb, size_t _ldc,
+            bool _useimages,
+            clblasOrder _order,
+            clblasTranspose _transA, clblasTranspose _transB,
+            cl_device_type _deviceType, cl_uint _commandQueueFlags,
+            StatisticalTimer& _timer )
+    : M(_M)
+    , N(_N)
+    , K(_K)
+    , lda(_lda)
+    , ldb(_ldb)
+    , ldc(_ldc)
+    , order(_order)
+    , transA(_transA)
+    , transB(_transB)
+    , platform(NULL)
+    , device(NULL)
+    , ctx(NULL)
+    , deviceType(_deviceType)
+    , queue(NULL)
+    , event(NULL)
+    , commandQueueFlags(_commandQueueFlags)
+    , useimages(_useimages)
+    , imgA(0)
+    , imgB(0)
+    , timer(_timer)
+    {
+        OPENCL_V_THROW( clGetPlatformIDs(1, &platform, NULL), "getting platform IDs" );
+        OPENCL_V_THROW( clGetDeviceIDs(platform, deviceType, 1, &device, NULL), "getting device IDs" );
+
+        // Zero-terminated property list that binds the context to the platform.
+        props[0] = CL_CONTEXT_PLATFORM;
+        props[1] = (cl_context_properties)platform;
+        props[2] = 0;
+
+        ctx = clCreateContext(props, 1, &device, NULL, NULL, &gemm_err);
+        OPENCL_V_THROW( gemm_err, "creating context" );
+        queue = clCreateCommandQueue(ctx, device, commandQueueFlags, &gemm_err);
+        OPENCL_V_THROW( gemm_err, "creating command queue" );
+
+        gemm_err = clblasSetup();
+        if (gemm_err != CL_SUCCESS) {
+            std::cout << "clblasSetup() failed with " << gemm_err << std::endl;
+            clReleaseCommandQueue(queue);
+            clReleaseContext(ctx);
+            exit(1);
+        }
+
+        if (useimages) {
+            imgA = clblasAddScratchImage(ctx, 16, 64, NULL);
+            imgB = clblasAddScratchImage(ctx, 16, 64, NULL);
+        }
+
+        gemm_timer_id = timer.getUniqueID( "clGemm", 0 );
+    }
+
+    // Virtual because this class is a polymorphic base: destroying a derived
+    // object through a clGemm pointer must also run the derived destructor
+    // (which releases the operand buffers).
+    virtual ~clGemm()
+    {
+        if (useimages) {
+            clblasRemoveScratchImage(imgA);
+            clblasRemoveScratchImage(imgB);
+        }
+
+        clblasTeardown();
+        OPENCL_V_THROW( clReleaseCommandQueue(queue), "releasing command queue" );
+        OPENCL_V_THROW( clReleaseContext(ctx), "releasing context" );
+    }
+
+    // Blocks until `event` completes. On any failure the status is printed
+    // and the process exits with code 1.
+    void wait_and_check()
+    {
+        const cl_int wait_status = clWaitForEvents(1, &event);
+        if( wait_status == CL_SUCCESS )
+        {
+            return;
+        }
+
+        if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
+        {
+            // The wait itself worked but the command failed; report its own status.
+            clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &gemm_err, NULL );
+            std::cout << "blas function execution status error: " << gemm_err << std::endl;
+        }
+        else
+        {
+            std::cout << "blas function wait status error: " << wait_status << std::endl;
+        }
+        exit(1);
+    }
+
+    // Average time recorded under gemm_timer_id, scaled by 1e9.
+    // Reads the timer that issued gemm_timer_id (the one injected through the
+    // constructor) rather than a separately fetched instance.
+    double time_in_ns()
+    {
+        return timer.getAverageTime( gemm_timer_id ) * 1e9;
+    }
+
+    virtual void call_gemm() = 0;
+    virtual void clear_buffers() = 0;
+    virtual double gflops() = 0;
+    virtual std::string gflops_formula() = 0;
+};
+
+// GEMM case for cl_float operands: holds the scalars and operand buffers and
+// times clblasSgemm.
+class clSgemm : public clGemm
+{
+public:
+    cl_float alpha;
+    cl_float beta;
+    buffers<cl_float> mybuffers;
+
+    // Forwards the problem description to clGemm, then allocates real-valued
+    // operand buffers on the context and queue the base class created.
+    clSgemm( size_t _M, size_t _N, size_t _K,
+            size_t _lda, size_t _ldb, size_t _ldc,
+            bool _useimages,
+            clblasOrder _order,
+            clblasTranspose _transA, clblasTranspose _transB,
+            cl_float _alpha, cl_float _beta,
+            cl_device_type _deviceType, cl_uint _commandQueueFlags,
+            StatisticalTimer& _timer)
+    : clGemm( _M, _N, _K,
+              _lda, _ldb, _ldc,
+              _useimages, _order, _transA, _transB,
+              _deviceType, _commandQueueFlags, _timer )
+    , alpha(_alpha)
+    , beta(_beta)
+    , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, not_complex )
+    {}
+
+    // Enqueues one clblasSgemm call and waits for it; the timer brackets both
+    // the enqueue and the wait.
+    void call_gemm()
+    {
+        timer.Start(gemm_timer_id);
+        OPENCL_V_THROW( clblasSgemm(order, transA, transB,
+                                       M, N, K,
+                                       alpha,
+                                       mybuffers.bufA, lda,
+                                       mybuffers.bufB, ldb,
+                                       beta,
+                                       mybuffers.bufC, ldc,
+                                       1, &queue, 0, NULL, &event),
+                        "clblasSgemm" );
+        wait_and_check();
+        timer.Stop(gemm_timer_id);
+        //mybuffers.read_back_result();
+        //mybuffers.printLocalMatrix("C");
+    }
+
+    // Restores the operands to their initial contents on host and device.
+    void clear_buffers()
+    {
+        mybuffers.initialize_data();
+    }
+
+    // Operation count divided by the average time in nanoseconds.
+    double gflops()
+    {
+        // Multiply in double: the size_t product 2*M*N*K wraps around on
+        // targets where size_t is 32 bits wide.
+        return (2.0*M*N*K)/time_in_ns();
+    }
+
+    std::string gflops_formula()
+    {
+        return "(2*M*N*K)/time_in_ns";
+    }
+};
+
+// GEMM case for cl_double operands: holds the scalars and operand buffers and
+// times clblasDgemm.
+class clDgemm : public clGemm
+{
+public:
+    cl_double alpha;
+    cl_double beta;
+    buffers<cl_double> mybuffers;
+
+    // Forwards the problem description to clGemm, then allocates real-valued
+    // operand buffers on the context and queue the base class created.
+    clDgemm( size_t _M, size_t _N, size_t _K,
+            size_t _lda, size_t _ldb, size_t _ldc,
+            bool _useimages,
+            clblasOrder _order,
+            clblasTranspose _transA, clblasTranspose _transB,
+            cl_double _alpha, cl_double _beta,
+            cl_device_type _deviceType, cl_uint _commandQueueFlags,
+            StatisticalTimer& _timer)
+    : clGemm( _M, _N, _K,
+              _lda, _ldb, _ldc,
+              _useimages, _order, _transA, _transB,
+              _deviceType, _commandQueueFlags, _timer )
+    , alpha(_alpha)
+    , beta(_beta)
+    , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, not_complex )
+    {}
+
+    // Enqueues one clblasDgemm call and waits for it; the timer brackets both
+    // the enqueue and the wait.
+    void call_gemm()
+    {
+        timer.Start(gemm_timer_id);
+        OPENCL_V_THROW( clblasDgemm(order, transA, transB,
+                                       M, N, K,
+                                       alpha,
+                                       mybuffers.bufA, lda,
+                                       mybuffers.bufB, ldb,
+                                       beta,
+                                       mybuffers.bufC, ldc,
+                                       1, &queue, 0, NULL, &event),
+                        "clblasDgemm" );
+        wait_and_check();
+        timer.Stop(gemm_timer_id);
+        //mybuffers.read_back_result();
+        //mybuffers.printLocalMatrix("C");
+    }
+
+    // Restores the operands to their initial contents on host and device.
+    void clear_buffers()
+    {
+        mybuffers.initialize_data();
+    }
+
+    // Operation count divided by the average time in nanoseconds.
+    double gflops()
+    {
+        // Multiply in double: the size_t product 2*M*N*K wraps around on
+        // targets where size_t is 32 bits wide.
+        return (2.0*M*N*K)/time_in_ns();
+    }
+
+    std::string gflops_formula()
+    {
+        return "(2*M*N*K)/time_in_ns";
+    }
+};
+
+// GEMM case for float-complex operands: the buffers hold cl_float data with
+// the complex multiplier applied, and the call being timed is clblasCgemm.
+class clCgemm : public clGemm
+{
+public:
+    cl_float2 alpha;
+    cl_float2 beta;
+    buffers<cl_float> mybuffers;
+
+    // Forwards the problem description to clGemm and allocates complex-sized
+    // operand buffers. The real and the imaginary component of alpha (and of
+    // beta) are both set to the single value passed in.
+    clCgemm( size_t _M, size_t _N, size_t _K,
+            size_t _lda, size_t _ldb, size_t _ldc,
+            bool _useimages,
+            clblasOrder _order,
+            clblasTranspose _transA, clblasTranspose _transB,
+            cl_float _alpha, cl_float _beta,
+            cl_device_type _deviceType, cl_uint _commandQueueFlags,
+            StatisticalTimer& _timer)
+    : clGemm( _M, _N, _K,
+              _lda, _ldb, _ldc,
+              _useimages, _order, _transA, _transB,
+              _deviceType, _commandQueueFlags, _timer )
+    , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, yes_complex )
+    {
+        alpha.s[0] = _alpha;
+        alpha.s[1] = _alpha;
+        beta.s[0] = _beta;
+        beta.s[1] = _beta;
+    }
+
+    // Enqueues one clblasCgemm call and waits for it; the timer brackets both
+    // the enqueue and the wait.
+    void call_gemm()
+    {
+        timer.Start(gemm_timer_id);
+        OPENCL_V_THROW( clblasCgemm(order, transA, transB,
+                                       M, N, K,
+                                       alpha,
+                                       mybuffers.bufA, lda,
+                                       mybuffers.bufB, ldb,
+                                       beta,
+                                       mybuffers.bufC, ldc,
+                                       1, &queue, 0, NULL, &event),
+                        "clblasCgemm" );
+        wait_and_check();
+        timer.Stop(gemm_timer_id);
+        //mybuffers.read_back_result();
+        //mybuffers.printLocalMatrix("C");
+    }
+
+    // Restores the operands to their initial contents on host and device.
+    void clear_buffers()
+    {
+        mybuffers.initialize_data();
+    }
+
+    // Operation count divided by the average time in nanoseconds.
+    double gflops()
+    {
+        // Multiply in double: the size_t product 8*M*N*K wraps around on
+        // targets where size_t is 32 bits wide.
+        return (8.0*M*N*K)/time_in_ns();
+    }
+
+    std::string gflops_formula()
+    {
+        return "(8*M*N*K)/time_in_ns";
+    }
+};
+
+// GEMM case for double-complex operands: the buffers hold cl_double data with
+// the complex multiplier applied, and the call being timed is clblasZgemm.
+class clZgemm : public clGemm
+{
+public:
+    cl_double2 alpha;
+    cl_double2 beta;
+    buffers<cl_double> mybuffers;
+
+    // Forwards the problem description to clGemm and allocates complex-sized
+    // operand buffers. The real and the imaginary component of alpha (and of
+    // beta) are both set to the single value passed in.
+    clZgemm( size_t _M, size_t _N, size_t _K,
+            size_t _lda, size_t _ldb, size_t _ldc,
+            bool _useimages,
+            clblasOrder _order,
+            clblasTranspose _transA, clblasTranspose _transB,
+            cl_double _alpha, cl_double _beta,
+            cl_device_type _deviceType, cl_uint _commandQueueFlags,
+            StatisticalTimer& _timer)
+    : clGemm( _M, _N, _K,
+              _lda, _ldb, _ldc,
+              _useimages, _order, _transA, _transB,
+              _deviceType, _commandQueueFlags, _timer )
+    , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, yes_complex )
+    {
+        alpha.s[0] = _alpha;
+        alpha.s[1] = _alpha;
+        beta.s[0] = _beta;
+        beta.s[1] = _beta;
+    }
+
+    // Enqueues one clblasZgemm call and waits for it; the timer brackets both
+    // the enqueue and the wait.
+    void call_gemm()
+    {
+        timer.Start(gemm_timer_id);
+        OPENCL_V_THROW( clblasZgemm(order, transA, transB,
+                                       M, N, K,
+                                       alpha,
+                                       mybuffers.bufA, lda,
+                                       mybuffers.bufB, ldb,
+                                       beta,
+                                       mybuffers.bufC, ldc,
+                                       1, &queue, 0, NULL, &event),
+                        "clblasZgemm" );
+        wait_and_check();
+        timer.Stop(gemm_timer_id);
+        //mybuffers.read_back_result();
+        //mybuffers.printLocalMatrix("C");
+    }
+
+    // Restores the operands to their initial contents on host and device.
+    void clear_buffers()
+    {
+        mybuffers.initialize_data();
+    }
+
+    // Operation count divided by the average time in nanoseconds.
+    double gflops()
+    {
+        // Multiply in double: the size_t product 8*M*N*K wraps around on
+        // targets where size_t is 32 bits wide.
+        return (8.0*M*N*K)/time_in_ns();
+    }
+
+    std::string gflops_formula()
+    {
+        return "(8*M*N*K)/time_in_ns";
+    }
+};
diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
new file mode 100644
index 0000000..4876daf
--- /dev/null
+++ b/src/client/clfunc_common.hpp
@@ -0,0 +1,335 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef CLBLAS_BENCHMARK_COMMON_HXX__
+#define CLBLAS_BENCHMARK_COMMON_HXX__
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <cstdlib>
+
+#include "blas-math.h"
+#include "test-limits.h"
+#include "dis_warning.h"
+
+#include "clBLAS.h"
+
+// Generic case: converts a double to the scalar type T with a static_cast.
+template<typename T>
+static T
+makeScalar(double val)
+{
+    const T converted = static_cast<T>(val);
+    return converted;
+}
+
+// Single-precision complex case: val becomes component 0 (narrowed to
+// float) and component 1 is set to zero.
+template<>
+__template_static FloatComplex
+makeScalar(double val)
+{
+    FloatComplex z;
+    z.s[1] = 0;
+    z.s[0] = static_cast<float>(val);
+    return z;
+}
+
+// Double-precision complex case: val becomes component 0 and component 1
+// is set to zero.
+template<>
+__template_static DoubleComplex
+makeScalar(double val)
+{
+    DoubleComplex z;
+    z.s[1] = 0;
+    z.s[0] = val;
+    return z;
+}
+
+// Draws a random value of type T bounded by UPPER_BOUND<T>() for use as a
+// divisor. When module() of the draw is zero, ONE<T>() is added so the
+// result handed back is not that zero draw.
+template<typename T>
+static T
+randomScale()
+{
+    T scale = random<T>(UPPER_BOUND<T>());
+
+    if (module(scale) != 0) {
+        return scale;
+    }
+
+    scale = scale + ONE<T>();
+    return scale;
+}
+
+std::string
+prettyPrintClStatus( const cl_int& status )
+{
+    switch( status )
+    {
+    case CL_INVALID_GLOBAL_WORK_SIZE:
+        return "CL_INVALID_GLOBAL_WORK_SIZE";
+    case CL_INVALID_MIP_LEVEL:
+        return "CL_INVALID_MIP_LEVEL";
+    case CL_INVALID_BUFFER_SIZE:
+        return "CL_INVALID_BUFFER_SIZE";
+    case CL_INVALID_GL_OBJECT:
+        return "CL_INVALID_GL_OBJECT";
+    case CL_INVALID_OPERATION:
+        return "CL_INVALID_OPERATION";
+    case CL_INVALID_EVENT:
+        return "CL_INVALID_EVENT";
+    case CL_INVALID_EVENT_WAIT_LIST:
+        return "CL_INVALID_EVENT_WAIT_LIST";
+    case CL_INVALID_GLOBAL_OFFSET:
+        return "CL_INVALID_GLOBAL_OFFSET";
+    case CL_INVALID_WORK_ITEM_SIZE:
+        return "CL_INVALID_WORK_ITEM_SIZE";
+    case CL_INVALID_WORK_GROUP_SIZE:
+        return "CL_INVALID_WORK_GROUP_SIZE";
+    case CL_INVALID_WORK_DIMENSION:
+        return "CL_INVALID_WORK_DIMENSION";
+    case CL_INVALID_KERNEL_ARGS:
+        return "CL_INVALID_KERNEL_ARGS";
+    case CL_INVALID_ARG_SIZE:
+        return "CL_INVALID_ARG_SIZE";
+    case CL_INVALID_ARG_VALUE:
+        return "CL_INVALID_ARG_VALUE";
+    case CL_INVALID_ARG_INDEX:
+        return "CL_INVALID_ARG_INDEX";
+    case CL_INVALID_KERNEL:
+        return "CL_INVALID_KERNEL";
+    case CL_INVALID_KERNEL_DEFINITION:
+        return "CL_INVALID_KERNEL_DEFINITION";
+    case CL_INVALID_KERNEL_NAME:
+        return "CL_INVALID_KERNEL_NAME";
+    case CL_INVALID_PROGRAM_EXECUTABLE:
+        return "CL_INVALID_PROGRAM_EXECUTABLE";
+    case CL_INVALID_PROGRAM:
+        return "CL_INVALID_PROGRAM";
+    case CL_INVALID_BUILD_OPTIONS:
+        return "CL_INVALID_BUILD_OPTIONS";
+    case CL_INVALID_BINARY:
+        return "CL_INVALID_BINARY";
+    case CL_INVALID_SAMPLER:
+        return "CL_INVALID_SAMPLER";
+    case CL_INVALID_IMAGE_SIZE:
+        return "CL_INVALID_IMAGE_SIZE";
+    case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+        return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+    case CL_INVALID_MEM_OBJECT:
+        return "CL_INVALID_MEM_OBJECT";
+    case CL_INVALID_HOST_PTR:
+        return "CL_INVALID_HOST_PTR";
+    case CL_INVALID_COMMAND_QUEUE:
+        return "CL_INVALID_COMMAND_QUEUE";
+    case CL_INVALID_QUEUE_PROPERTIES:
+        return "CL_INVALID_QUEUE_PROPERTIES";
+    case CL_INVALID_CONTEXT:
+        return "CL_INVALID_CONTEXT";
+    case CL_INVALID_DEVICE:
+        return "CL_INVALID_DEVICE";
+    case CL_INVALID_PLATFORM:
+        return "CL_INVALID_PLATFORM";
+    case CL_INVALID_DEVICE_TYPE:
+        return "CL_INVALID_DEVICE_TYPE";
+    case CL_INVALID_VALUE:
+        return "CL_INVALID_VALUE";
+    case CL_MAP_FAILURE:
+        return "CL_MAP_FAILURE";
+    case CL_BUILD_PROGRAM_FAILURE:
+        return "CL_BUILD_PROGRAM_FAILURE";
+    case CL_IMAGE_FORMAT_NOT_SUPPORTED:
+        return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+    case CL_IMAGE_FORMAT_MISMATCH:
+        return "CL_IMAGE_FORMAT_MISMATCH";
+    case CL_MEM_COPY_OVERLAP:
+        return "CL_MEM_COPY_OVERLAP";
+    case CL_PROFILING_INFO_NOT_AVAILABLE:
+        return "CL_PROFILING_INFO_NOT_AVAILABLE";
+    case CL_OUT_OF_HOST_MEMORY:
+        return "CL_OUT_OF_HOST_MEMORY";
+    case CL_OUT_OF_RESOURCES:
+        return "CL_OUT_OF_RESOURCES";
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+        return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+    case CL_COMPILER_NOT_AVAILABLE:
+        return "CL_COMPILER_NOT_AVAILABLE";
+    case CL_DEVICE_NOT_AVAILABLE:
+        return "CL_DEVICE_NOT_AVAILABLE";
+    case CL_DEVICE_NOT_FOUND:
+        return "CL_DEVICE_NOT_FOUND";
+    case CL_SUCCESS:
+        return "CL_SUCCESS";
+    default:
+        return "Error code not defined";
+        break;
+    }
+}
+
+// This is used to either wrap an OpenCL function call, or to
+// explicitly check a variable for an OpenCL error condition.
+// If an error occurs, we throw.
+// Note: std::runtime_error does not accept wide (unicode) strings, so the
+// message must be a narrow std::string.
+// Returns res unchanged when it equals CL_SUCCESS. For any other value the
+// function builds a diagnostic from the status name, lineno and msg, echoes
+// it to std::cout and throws std::runtime_error carrying the same text.
+inline cl_int
+OpenCL_V_Throw( cl_int res, const std::string& msg, size_t lineno )
+{
+    if( res == CL_SUCCESS )
+    {
+        return res;
+    }
+
+    std::ostringstream report;
+    report << "OPENCL_V_THROWERROR< " << prettyPrintClStatus(res)
+           << " > (" << lineno << "): " << msg;
+
+    const std::string text = report.str();
+    std::cout << text << std::endl;
+    throw std::runtime_error( text );
+}
+
+// Convenience wrapper around OpenCL_V_Throw that supplies __LINE__ of the
+// call site as the line number. Evaluates to the value OpenCL_V_Throw
+// returns, i.e. the status itself when no exception is thrown.
+#define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw(_status, _message, \
+                                                        __LINE__)
+
+// Queries CL_DEVICE_MAX_MEM_ALLOC_SIZE for device_.
+// Returns the value reported by clGetDeviceInfo, or 0 when the query fails
+// (the previous code stored the status in a variable it never read).
+inline cl_ulong
+queryMemAllocSize( cl_device_id device_ )
+{
+    cl_ulong rc = 0;
+
+    if( clGetDeviceInfo(device_, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                        sizeof(rc), &rc, NULL) != CL_SUCCESS )
+    {
+        return 0;
+    }
+
+    return rc;
+}
+
+// Abstract base for the clBLAS benchmark functions.
+//
+// The constructor picks the first platform and the first device of the
+// requested type, creates a context and a command queue on it, and calls
+// clblasSetup(). The destructor calls clblasTeardown() and releases the
+// queue and the context. Derived classes implement the buffer handling
+// and the actual BLAS call through the pure virtual interface below.
+class clblasFunc
+{
+public:
+    // Throws std::runtime_error (via OPENCL_V_THROW) when any step of the
+    // OpenCL or clBLAS initialization fails. Handles created before the
+    // failing step are released first, because the destructor does not run
+    // for an object whose constructor threw.
+    clblasFunc(StatisticalTimer& _timer, cl_device_type devType)
+          : timer(_timer)
+    {
+        cl_int err;
+
+        // No event exists until a derived class enqueues work.
+        event_ = NULL;
+
+        /* Setup OpenCL environment. */
+        OPENCL_V_THROW(clGetPlatformIDs(1, &platform_, NULL),
+                       "getting platform IDs");
+        OPENCL_V_THROW(clGetDeviceIDs(platform_, devType, 1,
+                                      &device_, NULL), "getting device IDs");
+        props_[0] = CL_CONTEXT_PLATFORM;
+        props_[1] = (cl_context_properties)platform_;
+        props_[2] = 0;
+        ctx_ = clCreateContext(props_, 1, &device_, NULL, NULL, &err);
+        OPENCL_V_THROW(err, "creating context");
+
+        queue_ = clCreateCommandQueue(ctx_, device_, 0, &err);
+        if (err != CL_SUCCESS) {
+            clReleaseContext(ctx_);
+            OPENCL_V_THROW(err, "creating command queue");
+        }
+
+        timer_id = timer.getUniqueID( "clfunc", 0 );
+
+        maxMemAllocSize = queryMemAllocSize( device_ );
+
+        /* Setup clblas. */
+        err = clblasSetup();
+        if (err != CL_SUCCESS) {
+            std::cerr << "clblasSetup() failed with " << err << "\n";
+            clReleaseCommandQueue(queue_);
+            clReleaseContext(ctx_);
+            // Continuing would leave released handles behind for the
+            // destructor to release a second time.
+            OPENCL_V_THROW(err, "setting up clBLAS");
+        }
+    }
+
+    virtual ~clblasFunc()
+    {
+        clblasTeardown();
+        OPENCL_V_THROW( clReleaseCommandQueue(queue_),
+                        "releasing command queue" );
+        OPENCL_V_THROW( clReleaseContext(ctx_), "releasing context" );
+    }
+
+    // Blocks until event_ completes. On any wait failure a message is
+    // printed to std::cout and the process exits with status 1.
+    void wait_and_check()
+    {
+        cl_int wait_status = clWaitForEvents(1, &event_);
+
+        if( wait_status == CL_SUCCESS )
+        {
+            return;
+        }
+
+        if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
+        {
+            // Initialized so a defined value is printed even when the
+            // query itself fails.
+            cl_int err = 0;
+            clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                            sizeof(cl_int), &err, NULL );
+            std::cout << "blas function execution status error: " << err << std::endl;
+        }
+        else
+        {
+            std::cout << "blas function wait status error: " << wait_status << std::endl;
+        }
+        exit(1);
+    }
+
+    // Average time recorded under timer_id, scaled by 1e9.
+    // NOTE(review): reads StatisticalTimer::getInstance() rather than the
+    // `timer` member; presumably both refer to the same object - confirm.
+    double time_in_ns()
+    {
+        StatisticalTimer& instance = StatisticalTimer::getInstance( );
+        return instance.getAverageTime( timer_id ) * 1e9;
+    }
+
+    virtual void call_func() = 0;
+    virtual double gflops() = 0;
+    virtual std::string gflops_formula() = 0;
+    virtual void setup_buffer(int order_option, int side_option,
+                              int uplo_option, int diag_option, int
+                              transA_option, int transB_option,
+                              size_t M, size_t N, size_t K, size_t lda,
+                              size_t ldb, size_t ldc, size_t offA, size_t offBX,
+                              size_t offCY, double alpha, double beta) = 0;
+    virtual void initialize_cpu_buffer() = 0;
+    virtual void initialize_gpu_buffer() = 0;
+    virtual void reset_gpu_write_buffer() = 0;
+    virtual void read_gpu_buffer() = 0;
+    virtual void roundtrip_func() = 0;
+    virtual void roundtrip_setup_buffer(int order_option, int side_option,
+                              int uplo_option, int diag_option, int
+                              transA_option, int transB_option,
+                              size_t M, size_t N, size_t K, size_t lda,
+                              size_t ldb, size_t ldc, size_t offA, size_t offBX,
+                              size_t offCY, double alpha, double beta) = 0;
+
+    StatisticalTimer& timer;
+    StatisticalTimer::sTimerID timer_id;
+
+protected:
+    virtual void initialize_scalars(double alpha, double beta) = 0;
+
+protected:
+    cl_platform_id platform_;
+    cl_device_id device_;
+    cl_context_properties props_[3];
+    cl_context ctx_;
+    cl_command_queue queue_;
+    clblasOrder order_;
+    cl_event event_;
+    size_t maxMemAllocSize;
+}; // class clblasFunc
+
+#endif // ifndef CLBLAS_BENCHMARK_COMMON_HXX__
+
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
new file mode 100644
index 0000000..17223a6
--- /dev/null
+++ b/src/client/clfunc_xgemm.hpp
@@ -0,0 +1,995 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XGEMM_HXX__
+#define CLBLAS_BENCHMARK_XGEMM_HXX__
+
+#include "clfunc_common.hpp"
+
+// Plain aggregate holding everything xGemm<T> needs for one GEMM problem:
+// dimensions, storage layout, host arrays, device buffers and scalars.
+// The struct has no constructor, so every field starts out uninitialized.
+template <typename T>
+struct xGemmBuffer
+{
+    // Not assigned by the xGemm code in this file, which stores the order
+    // in clblasFunc::order_ instead.
+    clblasOrder order_;
+    // Problem dimensions M, N and K as passed to the setup functions.
+    size_t m_;
+    size_t n_;
+    size_t k_;
+    // Leading dimensions of A, B and C, in elements. The setup functions
+    // substitute the minimum valid value when the caller passes 0.
+    size_t lda_;
+    size_t ldb_;
+    size_t ldc_;
+    // Element offsets of A, B and C inside their device buffers.
+    size_t offA_;
+    size_t offB_;
+    size_t offC_;
+    // Number of leading-dimension-sized vectors stored per matrix; each
+    // host array holds ld * num_vectors elements.
+    size_t a_num_vectors_;
+    size_t b_num_vectors_;
+    size_t c_num_vectors_;
+    // Transposition flags for A and B.
+    clblasTranspose trans_a_;
+    clblasTranspose trans_b_;
+    // Host-side arrays, allocated with new T[] by the setup functions.
+    T* a_;
+    T* b_;
+    T* c_;
+    // Device-side buffers created with clCreateBuffer.
+    cl_mem buf_a_;
+    cl_mem buf_b_;
+    cl_mem buf_c_;
+    // Scalars produced by makeScalar<T> from the double-valued inputs.
+    T alpha_;
+    T beta_;
+}; // struct buffer
+
+// GEMM benchmark driver for element type T.
+//
+// setup_buffer() / roundtrip_setup_buffer() translate the integer command
+// line options into a storage layout and allocate the host arrays (the
+// former also creates the device buffers). call_func() and roundtrip_func()
+// are placeholders here; the per-type work is supplied by explicit
+// specializations elsewhere in this file.
+//
+// Changes relative to the previous revision:
+//  * host arrays allocated with new[] are now freed with delete[];
+//  * pointers and cl_mem handles start out NULL, so the destructor only
+//    releases what was actually created;
+//  * a too-small ldc now terminates like a too-small lda/ldb instead of
+//    leaving ldc_ uninitialized;
+//  * OpenCL status codes are checked instead of being discarded;
+//  * calling a setup function again no longer leaks the earlier buffers.
+template <typename T>
+class xGemm : public clblasFunc
+{
+public:
+    xGemm(StatisticalTimer& timer, cl_device_type devType) :
+        clblasFunc(timer, devType)
+    {
+        timer.getUniqueID("clGemm", 0);
+
+        // Nothing is allocated until one of the setup functions runs.
+        buffer_.a_ = NULL;
+        buffer_.b_ = NULL;
+        buffer_.c_ = NULL;
+        buffer_.buf_a_ = NULL;
+        buffer_.buf_b_ = NULL;
+        buffer_.buf_c_ = NULL;
+    }
+
+    ~xGemm()
+    {
+        release_host_buffers();
+        release_device_buffers();
+    }
+
+    // Placeholder; replaced per element type by explicit specializations.
+    void call_func()
+    {
+        std::cout << "xGemm::call_func\n";
+    }
+
+    // 2*M*N*K divided by the measured time in nanoseconds.
+    double gflops()
+    {
+        return (2.0*buffer_.m_*buffer_.n_*buffer_.k_)/time_in_ns();
+    }
+
+    std::string gflops_formula()
+    {
+        return "2.0*M*N*K/time";
+    }
+
+    // Records the problem description, allocates the host arrays and
+    // creates the three device buffers (each sized ld * num_vectors plus
+    // the matrix offset, in elements of T).
+    //
+    // order_option:   0 selects row-major, anything else column-major.
+    // transX_option:  0 = no transpose, 1 = transpose, 2 = conjugate
+    //                 transpose; other values are laid out like a transpose
+    //                 but leave the stored transpose flag untouched.
+    // lda/ldb/ldc:    0 selects the minimum valid leading dimension; a
+    //                 non-zero value below the minimum prints an error and
+    //                 exits with status 1.
+    // side_option, uplo_option and diag_option are unused.
+    // Throws std::runtime_error when a device buffer cannot be created.
+    void setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_3(side_option, uplo_option, diag_option);
+
+        initialize_scalars(alpha, beta);
+        configure_layout(order_option, transA_option, transB_option,
+                         M, N, K, lda, ldb, ldc, offA, offBX, offCY);
+        allocate_host_buffers();
+
+        release_device_buffers();
+        buffer_.buf_a_ = create_device_buffer(
+            CL_MEM_READ_ONLY,
+            buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_,
+            "creating buffer A");
+        buffer_.buf_b_ = create_device_buffer(
+            CL_MEM_READ_ONLY,
+            buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_,
+            "creating buffer B");
+        buffer_.buf_c_ = create_device_buffer(
+            CL_MEM_READ_WRITE,
+            buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_,
+            "creating buffer C");
+    }
+
+    // Fills the host arrays A, B and C (in that order) with random values,
+    // after reseeding with srand(10).
+    void initialize_cpu_buffer()
+    {
+        srand(10);
+        fill_random(buffer_.a_, buffer_.a_num_vectors_, buffer_.lda_);
+        fill_random(buffer_.b_, buffer_.b_num_vectors_, buffer_.ldb_);
+        fill_random(buffer_.c_, buffer_.c_num_vectors_, buffer_.ldc_);
+    }
+
+    // Blocking upload of the host arrays A, B and C into their device
+    // buffers at the configured offsets. Throws on any OpenCL error.
+    void initialize_gpu_buffer()
+    {
+        write_device_buffer(buffer_.buf_a_, buffer_.offA_,
+                            buffer_.lda_ * buffer_.a_num_vectors_,
+                            buffer_.a_, "writing buffer A");
+        write_device_buffer(buffer_.buf_b_, buffer_.offB_,
+                            buffer_.ldb_ * buffer_.b_num_vectors_,
+                            buffer_.b_, "writing buffer B");
+        write_device_buffer(buffer_.buf_c_, buffer_.offC_,
+                            buffer_.ldc_ * buffer_.c_num_vectors_,
+                            buffer_.c_, "writing buffer C");
+    }
+
+    // Blocking re-upload of host C into device C.
+    void reset_gpu_write_buffer()
+    {
+        write_device_buffer(buffer_.buf_c_, buffer_.offC_,
+                            buffer_.ldc_ * buffer_.c_num_vectors_,
+                            buffer_.c_, "resetting buffer C");
+    }
+
+    // Blocking download of device C into host C. Throws on any OpenCL error.
+    void read_gpu_buffer()
+    {
+        OPENCL_V_THROW(
+            clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                buffer_.offC_ * sizeof(T),
+                                buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                    sizeof(T),
+                                buffer_.c_, 0, NULL, NULL),
+            "reading buffer C");
+    }
+
+    // Placeholder; replaced per element type by explicit specializations.
+    void roundtrip_func()
+    {
+        std::cout << "xGemm::roundtrip_func\n";
+    }
+
+    // Same as setup_buffer() except that no device buffers are created
+    // here.
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_3(side_option, uplo_option, diag_option);
+
+        initialize_scalars(alpha, beta);
+        configure_layout(order_option, transA_option, transB_option,
+                         M, N, K, lda, ldb, ldc, offA, offBX, offCY);
+        allocate_host_buffers();
+    }
+
+protected:
+    void initialize_scalars(double alpha, double beta)
+    {
+        buffer_.alpha_ = makeScalar<T>(alpha);
+        buffer_.beta_ = makeScalar<T>(beta);
+    }
+
+private:
+    // Picks the leading dimension: `required` when the caller passed 0,
+    // otherwise `requested`. A non-zero request below `required` prints
+    // "<name>:wrong size" to std::cerr and exits with status 1.
+    static size_t resolve_leading_dim(const char* name, size_t requested,
+                                      size_t required)
+    {
+        if (requested == 0)
+        {
+            return required;
+        }
+        if (requested < required)
+        {
+            std::cerr << name << ":wrong size\n";
+            exit(1);
+        }
+        return requested;
+    }
+
+    // Maps option 0/1/2 onto NoTrans/Trans/ConjTrans. Any other value
+    // leaves `flag` as it was.
+    static void apply_transpose(int option, clblasTranspose& flag)
+    {
+        if (option == 0)
+        {
+            flag = clblasNoTrans;
+        }
+        else if (option == 1)
+        {
+            flag = clblasTrans;
+        }
+        else if (option == 2)
+        {
+            flag = clblasConjTrans;
+        }
+    }
+
+    // Stores dimensions, offsets, order and transpose flags, and derives
+    // the vector counts and leading dimensions of A, B and C.
+    //
+    // A is stored as M x K (K x M when transposed), B as K x N (N x K when
+    // transposed) and C as M x N. In row-major order a matrix has one
+    // vector per stored row and needs ld >= stored columns; in column-major
+    // order the two roles swap.
+    void configure_layout(int order_option, int transA_option,
+                          int transB_option, size_t M, size_t N, size_t K,
+                          size_t lda, size_t ldb, size_t ldc,
+                          size_t offA, size_t offBX, size_t offCY)
+    {
+        buffer_.m_ = M;
+        buffer_.n_ = N;
+        buffer_.k_ = K;
+        buffer_.offA_ = offA;
+        buffer_.offB_ = offBX;
+        buffer_.offC_ = offCY;
+
+        const bool rowMajor = (order_option == 0);
+        order_ = rowMajor ? clblasRowMajor : clblasColumnMajor;
+
+        apply_transpose(transA_option, buffer_.trans_a_);
+        apply_transpose(transB_option, buffer_.trans_b_);
+
+        const bool aPlain = (transA_option == 0);
+        const bool bPlain = (transB_option == 0);
+        const size_t aRows = aPlain ? M : K;
+        const size_t aCols = aPlain ? K : M;
+        const size_t bRows = bPlain ? K : N;
+        const size_t bCols = bPlain ? N : K;
+
+        buffer_.a_num_vectors_ = rowMajor ? aRows : aCols;
+        buffer_.lda_ = resolve_leading_dim("lda", lda,
+                                           rowMajor ? aCols : aRows);
+
+        buffer_.b_num_vectors_ = rowMajor ? bRows : bCols;
+        buffer_.ldb_ = resolve_leading_dim("ldb", ldb,
+                                           rowMajor ? bCols : bRows);
+
+        buffer_.c_num_vectors_ = rowMajor ? M : N;
+        buffer_.ldc_ = resolve_leading_dim("ldc", ldc, rowMajor ? N : M);
+    }
+
+    // (Re)allocates the host arrays for the current layout.
+    void allocate_host_buffers()
+    {
+        release_host_buffers();
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+    }
+
+    // Frees the host arrays; delete[] matches the new T[] allocations.
+    void release_host_buffers()
+    {
+        delete[] buffer_.a_;
+        delete[] buffer_.b_;
+        delete[] buffer_.c_;
+        buffer_.a_ = NULL;
+        buffer_.b_ = NULL;
+        buffer_.c_ = NULL;
+    }
+
+    // Creates a device buffer of `elements` elements of T and throws when
+    // clCreateBuffer reports an error.
+    cl_mem create_device_buffer(cl_mem_flags flags, size_t elements,
+                                const char* what)
+    {
+        cl_int err;
+        cl_mem mem = clCreateBuffer(ctx_, flags, elements * sizeof(T),
+                                    NULL, &err);
+        OPENCL_V_THROW(err, what);
+        return mem;
+    }
+
+    // Releases one device buffer if it exists. The handle is cleared before
+    // the status check so it is never released twice.
+    static void release_mem(cl_mem& mem, const char* what)
+    {
+        if (mem == NULL)
+        {
+            return;
+        }
+        cl_mem doomed = mem;
+        mem = NULL;
+        OPENCL_V_THROW( clReleaseMemObject(doomed), what );
+    }
+
+    void release_device_buffers()
+    {
+        release_mem(buffer_.buf_a_, "releasing buffer A");
+        release_mem(buffer_.buf_b_, "releasing buffer B");
+        release_mem(buffer_.buf_c_, "releasing buffer C");
+    }
+
+    // Blocking write of `elements` elements from `host` into `mem`,
+    // starting `offset` elements into the device buffer.
+    void write_device_buffer(cl_mem mem, size_t offset, size_t elements,
+                             const T* host, const char* what)
+    {
+        OPENCL_V_THROW(
+            clEnqueueWriteBuffer(queue_, mem, CL_TRUE,
+                                 offset * sizeof(T),
+                                 elements * sizeof(T),
+                                 host, 0, NULL, NULL),
+            what);
+    }
+
+    // Fills numVectors vectors of ld elements each, visiting them in
+    // storage order so the sequence of random draws is unchanged.
+    static void fill_random(T* data, size_t numVectors, size_t ld)
+    {
+        for (size_t i = 0; i < numVectors; ++i)
+        {
+            for (size_t j = 0; j < ld; ++j)
+            {
+                data[i*ld+j] = random<T>(UPPER_BOUND<T>()) /
+                               randomScale<T>();
+            }
+        }
+    }
+
+    xGemmBuffer<T> buffer_;
+
+}; // class xgemm
+
+
+
+// Single-precision GEMM: times one clblasSgemm call, from enqueue until
+// its event completes.
+//
+// The statuses of clblasSgemm and clWaitForEvents used to be discarded. A
+// failure now raises std::runtime_error (via OPENCL_V_THROW) once the timer
+// has been stopped. The wait is skipped when the enqueue itself failed,
+// since event_ would not refer to this call.
+template<>
+void
+xGemm<cl_float>::
+call_func()
+{
+    timer.Start(timer_id);
+
+    cl_int status = clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+
+    if (status == CL_SUCCESS)
+    {
+        status = clWaitForEvents(1, &event_);
+    }
+    timer.Stop(timer_id);
+
+    OPENCL_V_THROW(status, "clblasSgemm");
+}
+
+// Single-precision round trip: times buffer creation, the upload of A, B
+// and C, one clblasSgemm call and the download of C as a single interval.
+//
+// Every OpenCL/clBLAS status is now checked; a failure raises
+// std::runtime_error via OPENCL_V_THROW (the timer is then left running).
+//
+// NOTE(review): the buffer handles are overwritten on every call without
+// releasing the previous ones, so repeated calls appear to leak device
+// memory. Fixing that needs the handles to start out NULL in the class,
+// which is outside this function.
+template<>
+void
+xGemm<cl_float>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_float),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_float),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                        buffer_.offC_) * sizeof(cl_float),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer C");
+
+    OPENCL_V_THROW(
+        clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                             buffer_.offA_ * sizeof(cl_float),
+                             buffer_.lda_ * buffer_.a_num_vectors_ *
+                                 sizeof(cl_float),
+                             buffer_.a_, 0, NULL, NULL),
+        "writing buffer A");
+
+    OPENCL_V_THROW(
+        clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                             buffer_.offB_ * sizeof(cl_float),
+                             buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                 sizeof(cl_float),
+                             buffer_.b_, 0, NULL, NULL),
+        "writing buffer B");
+
+    OPENCL_V_THROW(
+        clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                             buffer_.offC_ * sizeof(cl_float),
+                             buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                 sizeof(cl_float),
+                             buffer_.c_, 0, NULL, NULL),
+        "writing buffer C");
+
+    OPENCL_V_THROW(
+        clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+                    buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                    buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                    buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                    buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                    buffer_.ldc_, 1, &queue_, 0, NULL, NULL),
+        "clblasSgemm");
+
+    // The read is blocking; its event is still handed to event_ and waited
+    // on, as before.
+    OPENCL_V_THROW(
+        clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                            buffer_.offC_ * sizeof(cl_float),
+                            buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                sizeof(cl_float),
+                            buffer_.c_, 0, NULL, &event_),
+        "reading buffer C");
+
+    OPENCL_V_THROW(clWaitForEvents(1, &event_), "waiting for buffer C read");
+    timer.Stop(timer_id);
+}
+
+
+template<>
+void
+xGemm<cl_double>::
+call_func()
+{ // Times one clblasDgemm call on the device buffers already held in buffer_.
+    timer.Start(timer_id);
+    // One command queue (queue_), empty wait list; any status returned by clblasDgemm is discarded.
+	clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+    // Wait on event_ before the timer is stopped.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xGemm<cl_double>::
+roundtrip_func()
+	{ // Timed round trip: create the device buffers, upload A, B and C, run DGEMM, read C back.
+    timer.Start(timer_id);
+	cl_int err; // NOTE(review): assigned by the OpenCL calls below but never checked
+	// Create the device buffers; each one is sized as ld*num_vectors plus its element offset.
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(cl_double),
+                                       NULL, &err);
+        // NOTE(review): buffers are created on every call and the old handles are overwritten - confirm they are released elsewhere.
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(cl_double),
+                                        NULL, &err);
+        // C is the only buffer created read-write.
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_double),
+                                        NULL, &err);
+		// Upload the host arrays with blocking writes (CL_TRUE) at their byte offsets.
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(cl_double),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(cl_double),
+                                   buffer_.a_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(cl_double),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(cl_double),
+                                   buffer_.b_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(cl_double),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                   sizeof(cl_double),
+                                   buffer_.c_, 0, NULL, NULL);
+		// Same GEMM call as call_func(), but without requesting an event.
+		clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+		// Blocking read of C back into the host array; event_ is attached to this read.
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(cl_double), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_double),
+								  buffer_.c_, 0, NULL, &event_);
+	clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+
+template<>
+void
+xGemm<cl_float2>::
+call_func()
+{ // Times one clblasCgemm call on the device buffers already held in buffer_.
+    timer.Start(timer_id);
+    // One command queue (queue_), empty wait list; any status returned by clblasCgemm is discarded.
+    clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+    // Wait on event_ before the timer is stopped.
+	clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xGemm<cl_float2>::
+roundtrip_func()
+	{ // Timed round trip: create the device buffers, upload A, B and C, run CGEMM, read C back.
+    timer.Start(timer_id);
+	cl_int err; // NOTE(review): assigned by the OpenCL calls below but never checked
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(cl_float2),
+                                       NULL, &err);
+        // NOTE(review): buffers are created on every call and the old handles are overwritten - confirm they are released elsewhere.
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(cl_float2),
+                                        NULL, &err);
+        // C is the only buffer created read-write.
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_float2),
+                                        NULL, &err);
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(cl_float2),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(cl_float2),
+                                   buffer_.a_, 0, NULL, NULL);
+        // Blocking writes (CL_TRUE) place each host array at its byte offset inside the device buffer.
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(cl_float2),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(cl_float2),
+                                   buffer_.b_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(cl_float2),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                   sizeof(cl_float2),
+                                   buffer_.c_, 0, NULL, NULL);
+		clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL); // no event requested for the GEMM itself
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(cl_float2), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_float2),
+								  buffer_.c_, 0, NULL, &event_);
+	clWaitForEvents(1, &event_); // event_ belongs to the blocking read of C issued above
+	timer.Stop(timer_id);
+	}
+
+template<>
+void
+xGemm<cl_double2>::
+call_func()
+{ // Times one clblasZgemm call on the device buffers already held in buffer_.
+    timer.Start(timer_id);
+    // One command queue (queue_), empty wait list; any status returned by clblasZgemm is discarded.
+    clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+    // Wait on event_ before the timer is stopped.
+	clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xGemm<cl_double2>::
+roundtrip_func()
+	{ // Timed round trip: create the device buffers, upload A, B and C, run ZGEMM, read C back.
+    timer.Start(timer_id);
+	cl_int err; // NOTE(review): assigned by the OpenCL calls below but never checked
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(cl_double2),
+                                       NULL, &err);
+        // NOTE(review): buffers are created on every call and the old handles are overwritten - confirm they are released elsewhere.
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(cl_double2),
+                                        NULL, &err);
+        // C is the only buffer created read-write.
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_double2),
+                                        NULL, &err);
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(cl_double2),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(cl_double2),
+                                   buffer_.a_, 0, NULL, NULL);
+        // Blocking writes (CL_TRUE) place each host array at its byte offset inside the device buffer.
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(cl_double2),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(cl_double2),
+                                   buffer_.b_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(cl_double2),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                   sizeof(cl_double2),
+                                   buffer_.c_, 0, NULL, NULL);
+		clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL); // no event requested for the GEMM itself
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(cl_double2), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_double2),
+								  buffer_.c_, 0, NULL, &event_);
+	clWaitForEvents(1, &event_); // event_ belongs to the blocking read of C issued above
+	timer.Stop(timer_id);
+	}
+
+
+template<>
+double
+xGemm<cl_float2>::
+gflops()
+{ // Complex single precision: 8*M*N*K divided by time_in_ns(); matches gflops_formula().
+    return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/time_in_ns();
+}
+
+template<>
+double
+xGemm<cl_double2>::
+gflops()
+{ // Complex double precision: 8*M*N*K divided by time_in_ns(); matches gflops_formula().
+    return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/time_in_ns();
+}
+
+template<>
+std::string
+xGemm<cl_float2>::
+gflops_formula()
+{ // Text form of the expression evaluated by gflops() for this element type.
+    return "8.0*M*N*K/time";
+}
+
+template<>
+std::string
+xGemm<cl_double2>::
+gflops_formula()
+{ // Text form of the expression evaluated by gflops() for this element type.
+    return "8.0*M*N*K/time";
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XGEMM_HXX__
diff --git a/src/client/clfunc_xgemv.hpp b/src/client/clfunc_xgemv.hpp
new file mode 100644
index 0000000..2d1d5b0
--- /dev/null
+++ b/src/client/clfunc_xgemv.hpp
@@ -0,0 +1,367 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XGEMV_HXX__
+#define CLBLAS_BENCHMARK_XGEMV_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xGemvBuffer
+{ // State shared by the xGemv methods: problem description, host arrays and device buffers.
+    clblasOrder order_;     // not referenced by xGemv in this file, which assigns a separate order_
+    size_t m_;              // M as passed to setup_buffer
+    size_t n_;              // N as passed to setup_buffer
+    size_t lda_;            // leading dimension of A, in elements
+    size_t offA_;           // element offset of A inside buf_a_
+    size_t a_num_vectors_;  // number of lda_-long vectors stored for A
+    size_t b_num_vectors_;  // not referenced by xGemv in this file
+    size_t c_num_vectors_;  // not referenced by xGemv in this file
+    clblasTranspose trans_a_;
+    T* a_;                  // host arrays allocated with new T[] in setup_buffer
+    T* x_;
+    T* y_;
+    cl_mem buf_a_;          // device buffers created in setup_buffer
+    cl_mem buf_x_;
+    cl_mem buf_y_;
+    T alpha_;               // scalars produced by makeScalar<T>
+    T beta_;
+}; // struct buffer
+// Benchmark driver for clblas?gemv: owns the host arrays and device buffers for A, x and y.
+template <typename T>
+class xGemv : public clblasFunc
+{
+public:
+    xGemv(StatisticalTimer& timer, cl_device_type devType) :
+        clblasFunc(timer, devType)
+    {
+        timer.getUniqueID("clGemv", 0);
+    }
+
+    ~xGemv()
+    {   // The host arrays come from new T[] in setup_buffer(), hence delete[].
+        delete[] buffer_.a_;
+        delete[] buffer_.x_;
+        delete[] buffer_.y_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_),
+                        "releasing buffer X");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_y_),
+                        "releasing buffer Y");
+    }
+    // Generic body is empty; the element-type specializations after the class do the work.
+    void call_func()
+    {
+    }
+    // 2*M*N operations divided by time_in_ns().
+    double gflops()
+    {
+        return (2.0*buffer_.m_*buffer_.n_)/time_in_ns();
+    }
+
+    std::string gflops_formula()
+    {
+        return "2.0*M*N/time";
+        // NOTE i removed a \n from the end of this. it needs to be absent
+        // from all functions
+    }
+    // Stores the problem description, allocates the host arrays and creates the device buffers.
+    void setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_4(side_option, uplo_option, diag_option,
+                           transB_option);
+        DUMMY_ARGS_USAGE_3(K, ldb, ldc);
+        DUMMY_ARGS_USAGE_2(offBX, offCY);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.m_ = M;
+        buffer_.n_ = N;
+        buffer_.offA_ = offA;
+        // transA_option 0 means no transpose (x has N elements, y has M); any other value selects clblasTrans.
+        if (transA_option == 0)
+        {
+            buffer_.trans_a_ = clblasNoTrans;
+            buffer_.x_ = new T[buffer_.n_];
+            buffer_.y_ = new T[buffer_.m_];
+        }
+        else
+        {
+            buffer_.trans_a_ = clblasTrans;
+            buffer_.x_ = new T[buffer_.m_];
+            buffer_.y_ = new T[buffer_.n_];
+        }
+        // order_option 0 means row-major (M vectors, lda >= N); otherwise column-major (N vectors, lda >= M).
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            buffer_.a_num_vectors_ = M;
+            if (lda == 0)
+            {
+                buffer_.lda_ = N;
+            }
+            else if (lda < N)
+            {
+                std::cerr << "lda:wrong size\n";
+                exit(1);
+            }
+            else
+            {
+                buffer_.lda_ = lda;
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            buffer_.a_num_vectors_ = N;
+            if (lda == 0)
+            {
+                buffer_.lda_ = M;
+            }
+            else if (lda < M)
+            {
+                std::cerr << "lda:wrong size\n";
+                exit(1);
+            }
+            else
+            {
+                buffer_.lda_ = lda;
+            }
+
+        }
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+
+        cl_int err;
+        size_t size = (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T);
+        if( size >= maxMemAllocSize )
+            throw std::runtime_error( "Tried to create a buffer larger than allowable on this device" );
+        // NOTE(review): err is set by each clCreateBuffer call below but never checked.
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        size,
+                                        NULL, &err);
+
+        if (transA_option == 0)
+        {
+            buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                            buffer_.n_*sizeof(T),
+                                            NULL, &err);
+
+            buffer_.buf_y_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                            buffer_.m_*sizeof(T),
+                                            NULL, &err);
+        }
+        else
+        {
+            buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                            buffer_.m_*sizeof(T),
+                                            NULL, &err);
+
+            buffer_.buf_y_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                            buffer_.n_*sizeof(T),
+                                            NULL, &err);
+        }
+    }
+    // Seeds rand with 10 and fills a_, x_ and y_ with random<T>() values divided by randomScale<T>().
+    void initialize_cpu_buffer()
+    {
+        srand(10);
+        for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+        {
+            for (size_t j = 0; j < buffer_.lda_; ++j)
+            {
+                buffer_.a_[i*buffer_.lda_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+            }
+        }
+
+        if (buffer_.trans_a_ == clblasNoTrans)
+        {
+            for (size_t i = 0; i < buffer_.n_; ++i)
+            {
+                buffer_.x_[i] = random<T>(UPPER_BOUND<T>()) /
+                                randomScale<T>();
+            }
+            for (size_t i = 0; i < buffer_.m_; ++i)
+            {
+                buffer_.y_[i] = random<T>(UPPER_BOUND<T>()) /
+                                randomScale<T>();
+            }
+        }
+        else
+        {
+            for (size_t i = 0; i < buffer_.m_; ++i)
+            {
+                buffer_.x_[i] = random<T>(UPPER_BOUND<T>()) /
+                                randomScale<T>();
+            }
+            for (size_t i = 0; i < buffer_.n_; ++i)
+            {
+                buffer_.y_[i] = random<T>(UPPER_BOUND<T>()) /
+                                randomScale<T>();
+            }
+        }
+    }
+    // Blocking uploads of a_ (at element offset offA_), x_ and y_ to their device buffers.
+    void initialize_gpu_buffer()
+    {
+        cl_int err;
+        // NOTE(review): err is assigned by every write below but never checked.
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+
+        if (buffer_.trans_a_ == clblasNoTrans)
+        {
+            err = clEnqueueWriteBuffer(queue_, buffer_.buf_x_, CL_TRUE, 0,
+                                       buffer_.n_*sizeof(T),
+                                       buffer_.x_, 0, NULL, NULL);
+
+            err = clEnqueueWriteBuffer(queue_, buffer_.buf_y_, CL_TRUE, 0,
+                                       buffer_.m_*sizeof(T),
+                                       buffer_.y_, 0, NULL, NULL);
+        }
+        else
+        {
+            err = clEnqueueWriteBuffer(queue_, buffer_.buf_x_, CL_TRUE, 0,
+                                       buffer_.m_*sizeof(T),
+                                       buffer_.x_, 0, NULL, NULL);
+
+            err = clEnqueueWriteBuffer(queue_, buffer_.buf_y_, CL_TRUE, 0,
+                                       buffer_.n_*sizeof(T),
+                                       buffer_.y_, 0, NULL, NULL);
+        }
+    }
+    // Re-uploads the host copy of y to buf_y_, the only buffer created READ_WRITE.
+    void reset_gpu_write_buffer()
+    {
+        cl_int err;
+
+        if (buffer_.trans_a_ == clblasNoTrans)
+        {
+            err = clEnqueueWriteBuffer(queue_, buffer_.buf_y_, CL_TRUE, 0,
+                                       buffer_.m_*sizeof(T),
+                                       buffer_.y_, 0, NULL, NULL);
+        }
+        else
+        {
+            err = clEnqueueWriteBuffer(queue_, buffer_.buf_y_, CL_TRUE, 0,
+                                       buffer_.n_*sizeof(T),
+                                       buffer_.y_, 0, NULL, NULL);
+        }
+    }
+    // The read-back and round-trip hooks below are placeholders without an implementation.
+    void read_gpu_buffer()
+    {
+        //to-do need to fill up
+    }
+    void roundtrip_func()
+    {//to-do need to fill up
+    }
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+        {}
+
+protected:
+    void initialize_scalars(double alpha, double beta)
+    {   // Convert the double inputs to element type T via makeScalar<T>.
+        buffer_.alpha_ = makeScalar<T>(alpha);
+        buffer_.beta_ = makeScalar<T>(beta);
+    }
+
+private:
+  xGemvBuffer<T> buffer_;
+
+}; // class xgemv
+
+
+template<>
+void
+xGemv<float>::
+call_func()
+{ // Times one clblasSgemv call on the device buffers already held in buffer_.
+    timer.Start(timer_id);
+    // x and y are passed with offset 0 and increment 1; any status returned by clblasSgemv is discarded.
+	clblasSgemv(order_, buffer_.trans_a_, buffer_.m_, buffer_.n_,
+                     buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_,
+                     buffer_.buf_y_, 0, 1, 1, &queue_, 0, NULL, &event_);
+    // Wait on event_ before the timer is stopped.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xGemv<double>::
+call_func()
+{ // Times one clblasDgemv call on the device buffers already held in buffer_.
+    timer.Start(timer_id);
+    // x and y are passed with offset 0 and increment 1; any status returned by clblasDgemv is discarded.
+	clblasDgemv(order_, buffer_.trans_a_, buffer_.m_, buffer_.n_,
+                     buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_,
+                     buffer_.buf_y_, 0, 1, 1, &queue_, 0, NULL, &event_);
+    // Wait on event_ before the timer is stopped.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xGemv<cl_float2>::
+call_func()
+{ // Times one clblasCgemv call on the device buffers already held in buffer_.
+    timer.Start(timer_id);
+    // x and y are passed with offset 0 and increment 1; any status returned by clblasCgemv is discarded.
+	clblasCgemv(order_, buffer_.trans_a_, buffer_.m_, buffer_.n_,
+                     buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_,
+                     buffer_.buf_y_, 0, 1, 1, &queue_, 0, NULL, &event_);
+    // Wait on event_ before the timer is stopped.
+	clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xGemv<cl_double2>::
+call_func()
+{ // Times one clblasZgemv call on the device buffers already held in buffer_.
+    timer.Start(timer_id);
+    // x and y are passed with offset 0 and increment 1; any status returned by clblasZgemv is discarded.
+	clblasZgemv(order_, buffer_.trans_a_, buffer_.m_, buffer_.n_,
+                     buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_,
+                     buffer_.buf_y_, 0, 1, 1, &queue_, 0, NULL, &event_);
+    // Wait on event_ before the timer is stopped.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XGEMV_HXX__
diff --git a/src/client/clfunc_xger.hpp b/src/client/clfunc_xger.hpp
new file mode 100644
index 0000000..05899cd
--- /dev/null
+++ b/src/client/clfunc_xger.hpp
@@ -0,0 +1,419 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XGER_HXX__
+#define CLBLAS_BENCHMARK_XGER_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xGerBuffer
+{ // State shared by the xGer methods: problem description, host arrays and device buffers.
+  clblasOrder order_;     // storage order chosen in setup_buffer
+  size_t m_;              // M as passed to setup_buffer
+  size_t n_;              // N as passed to setup_buffer
+  T alpha;                // scalar set by initialize_scalars
+  T* X;                   // host array for x, allocated with new T[]
+  cl_mem x_;              // device buffer for x
+  size_t offX;            // element offset of x handed to the library in call_func
+  int incx_;              // set to 1 in setup_buffer
+  T* Y;                   // host array for y, allocated with new T[]
+  cl_mem y_;              // device buffer for y
+  size_t offY;            // element offset of y handed to the library in call_func
+  int incy_;              // set to 1 in setup_buffer
+  T* A;                   // host array for A, allocated with new T[]
+  cl_mem a_;              // device buffer for A
+  size_t a_num_vectors_;  // number of lda_-long vectors stored for A
+  size_t offA;            // element offset of A handed to the library in call_func
+  size_t lda_;            // leading dimension of A, in elements
+}; // struct buffer
+
+// Benchmark driver for clblasSger/clblasDger: owns the host arrays and the
+// device buffers for A, X and Y, and times the rank-1 update in call_func().
+template <typename T>
+class xGer : public clblasFunc
+{
+public:
+  xGer(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clGer", 0);
+  }
+
+  ~xGer()
+  {
+    // The host arrays come from new T[] in setup_buffer(), hence delete[].
+    delete[] buffer_.X;
+    delete[] buffer_.Y;
+    delete[] buffer_.A;
+    OPENCL_V_THROW( clReleaseMemObject(buffer_.x_), "releasing buffer X");
+    OPENCL_V_THROW( clReleaseMemObject(buffer_.y_), "releasing buffer Y");
+    OPENCL_V_THROW( clReleaseMemObject(buffer_.a_), "releasing buffer A");
+  }
+
+  // A := alpha*x*y' + A costs one multiply and one add per element of A.
+  double gflops()
+  {
+    return (2.0*buffer_.m_*buffer_.n_)/time_in_ns();
+  }
+
+  std::string gflops_formula()
+  {
+    return "2.0*M*N/time";
+  }
+
+  // Records the problem size, validates lda, and allocates the host arrays
+  // and device buffers. order_option 0 selects row-major storage, any other
+  // value column-major; lda == 0 requests the smallest valid leading dimension.
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+  {
+    initialize_scalars(alpha, beta);
+
+    buffer_.m_ = M;
+    buffer_.n_ = N;
+    buffer_.incx_ = 1;
+    buffer_.incy_ = 1;
+    buffer_.offA = offA;
+    buffer_.offX = offB;
+    buffer_.offY = offC;
+
+    // A is M x N: row-major storage holds M rows of at least N elements,
+    // column-major storage holds N columns of at least M elements.
+    const bool row_major = (order_option == 0);
+    buffer_.order_ = row_major ? clblasRowMajor : clblasColumnMajor;
+    buffer_.a_num_vectors_ = row_major ? M : N;
+    const size_t min_lda = row_major ? N : M;
+
+    if (lda == 0)
+    {
+      buffer_.lda_ = min_lda;
+    }
+    else if (lda < min_lda)
+    {
+      // Abort rather than size the buffers from an unset lda_.
+      std::cerr << "ERROR: lda must be set to 0 or a value >= " << min_lda << std::endl;
+      exit(1);
+    }
+    else
+    {
+      buffer_.lda_ = lda;
+    }
+
+    size_t sizeA = buffer_.lda_*buffer_.a_num_vectors_;
+    buffer_.A = new T[sizeA];
+    buffer_.X = new T[buffer_.m_];
+    buffer_.Y = new T[buffer_.n_];
+
+    // Each device buffer also covers the element offset that call_func()
+    // passes to the library. The kernel writes only A; X and Y are inputs.
+    cl_int err;
+    buffer_.a_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                (sizeA + buffer_.offA)*sizeof(T), NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+    buffer_.x_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.m_ + buffer_.offX)*sizeof(T), NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer X");
+    buffer_.y_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.n_ + buffer_.offY)*sizeof(T), NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer Y");
+  }
+
+  // Fills the host arrays with reproducible pseudo-random data (fixed seed).
+  void initialize_cpu_buffer()
+  {
+    srand(10);
+
+    for (size_t i = 0; i < buffer_.m_; ++i)
+    {
+      buffer_.X[i] = static_cast<T>(rand())/static_cast<T>(RAND_MAX);
+    }
+    for (size_t i = 0; i < buffer_.n_; ++i)
+    {
+      buffer_.Y[i] = static_cast<T>(rand())/static_cast<T>(RAND_MAX);
+    }
+
+    // A is filled one stored vector at a time. Only the element whose
+    // position inside the vector equals the vector index gets a random
+    // value; every other element, lda padding included, is zeroed.
+    for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+    {
+      for (size_t j = 0; j < buffer_.lda_; ++j)
+      {
+        if (i == j)
+        {
+          buffer_.A[i*buffer_.lda_+j] =
+            static_cast<T>(rand())/static_cast<T>(RAND_MAX);
+        }
+        else
+        {
+          buffer_.A[i*buffer_.lda_+j] = static_cast<T>(0.0);
+        }
+      }
+    }
+  }
+
+  // Blocking uploads of the host data, placed at the element offsets that
+  // call_func() passes to the library.
+  void initialize_gpu_buffer()
+  {
+    OPENCL_V_THROW( clEnqueueWriteBuffer(queue_, buffer_.a_, CL_TRUE,
+                        buffer_.offA*sizeof(T),
+                        buffer_.lda_*buffer_.a_num_vectors_*sizeof(T),
+                        buffer_.A, 0, NULL, NULL), "writing buffer A");
+    OPENCL_V_THROW( clEnqueueWriteBuffer(queue_, buffer_.x_, CL_TRUE,
+                        buffer_.offX*sizeof(T), buffer_.m_*sizeof(T),
+                        buffer_.X, 0, NULL, NULL), "writing buffer X");
+    OPENCL_V_THROW( clEnqueueWriteBuffer(queue_, buffer_.y_, CL_TRUE,
+                        buffer_.offY*sizeof(T), buffer_.n_*sizeof(T),
+                        buffer_.Y, 0, NULL, NULL), "writing buffer Y");
+  }
+
+  // GER accumulates into A, so A is the buffer to restore between runs.
+  void reset_gpu_write_buffer()
+  {
+    OPENCL_V_THROW( clEnqueueWriteBuffer(queue_, buffer_.a_, CL_TRUE,
+                        buffer_.offA*sizeof(T),
+                        buffer_.lda_*buffer_.a_num_vectors_*sizeof(T),
+                        buffer_.A, 0, NULL, NULL), "resetting buffer A");
+  }
+
+  // Defined per element type by the explicit specializations after the class.
+  void call_func();
+
+  // The remaining benchmark hooks are not implemented for GER yet.
+  void read_gpu_buffer()
+  {
+  }
+
+  void roundtrip_func()
+  {
+  }
+
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {
+  }
+
+protected:
+  // Converts alpha to the element type; GER has no beta, so it is ignored.
+  void initialize_scalars(double alpha, double beta)
+  {
+    buffer_.alpha = makeScalar<T>(alpha);
+  }
+
+private:
+  xGerBuffer<T> buffer_;
+
+}; // class xger
+
+//template<>
+//void
+//xGer<cl_float2>::
+//initialize_scalars(double alpha, double beta)
+//{
+//  buffer_.alpha = alpha;
+//}
+
+//template<>
+//void
+//xGer<cl_double2>::
+//initialize_scalars(double alpha, double beta)
+//{
+//}
+
+template<>
+void
+xGer<cl_float>::
+call_func()
+{ // Times one clblasSger call on the buffers held in buffer_; any status it returns is discarded.
+    timer.Start(timer_id); // both increments are passed as the literal 1, not buffer_.incx_/incy_
+    clblasSger(buffer_.order_, buffer_.m_, buffer_.n_, buffer_.alpha, buffer_.x_, buffer_.offX, 1, buffer_.y_, buffer_.offY,
+      1, buffer_.a_, buffer_.offA, buffer_.lda_, 1, &queue_, 0, NULL,
+                   &event_);
+    clWaitForEvents(1, &event_); // wait on event_ before the timer is stopped
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xGer<cl_double>::
+call_func()
+{ // Times one clblasDger call on the buffers held in buffer_; any status it returns is discarded.
+    timer.Start(timer_id); // both increments are passed as the literal 1, not buffer_.incx_/incy_
+    clblasDger(buffer_.order_, buffer_.m_, buffer_.n_, buffer_.alpha, buffer_.x_, buffer_.offX, 1, buffer_.y_, buffer_.offY,
+      1, buffer_.a_, buffer_.offA, buffer_.lda_, 1, &queue_, 0, NULL,
+                   &event_);
+    clWaitForEvents(1, &event_); // wait on event_ before the timer is stopped
+    timer.Stop(timer_id);
+}
+
+//template<>
+//void
+//xGer<cl_float2>::
+//call_func()
+//{
+//  timer.Start(timer_id);
+//  clblasCger(order_, buffer_.m_, buffer_.n, buffer_a_, 0,
+//                 buffer_.lda_, buffer_x_, 0, 1, 1, &queue_, 0, NULL,
+//                 &event_);
+//  clWaitForEvents(1, &event_);
+//  timer.Stop(timer_id);
+//}
+//
+//template<>
+//void
+//xGer<cl_double2>::
+//call_func()
+//{
+//  timer.Start(timer_id);
+//  clblasZger(order_, buffer_.uplo_, buffer_.trans_a_,
+//                 buffer_.diag_, buffer_.m_, buffer_a_, 0,
+//                 buffer_.lda_, buffer_x_, 0, 1, 1, &queue_, 0, NULL,
+//                 &event_);
+//  clWaitForEvents(1, &event_);
+//  timer.Stop(timer_id);
+//}
+
+//template<>
+//void
+//xGer<cl_float2>::
+//initialize_cpu_buffer()
+//{
+//  srand(10);
+//  for (size_t i = 0; i < buffer_.m_; ++i)
+//  {
+//    buffer_x_[i].s[0] =
+//      static_cast<cl_float>(rand())/static_cast<cl_float>(RAND_MAX);
+//    buffer_.x_[i].s[1] =
+//      static_cast<cl_float>(rand())/static_cast<cl_float>(RAND_MAX);
+//  }
+//
+//  for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+//  {
+//    for (size_t j = 0; j < buffer_.lda_; ++j)
+//    {
+//      if (i == j)
+//      {
+//        if (buffer_.diag_ == clblasUnit)
+//        {
+//          buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0f;
+//          buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f;
+//        }
+//        else
+//        {
+//          buffer_.a_[i*buffer_.lda_+j].s[0] =
+//            static_cast<cl_float>(rand())/static_cast<cl_float>(RAND_MAX);
+//          buffer_.a_[i*buffer_.lda_+j].s[1] =
+//            static_cast<cl_float>(rand())/static_cast<cl_float>(RAND_MAX);
+//        }
+//      }
+//      else
+//      {
+//        buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0f;
+//        buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f;
+//      }
+//    }
+//  }
+//
+//
+//}
+
+//template<>
+//void
+//xGer<cl_double2>::
+//initialize_cpu_buffer()
+//{
+//  srand(10);
+//  for (size_t i = 0; i < buffer_.m_; ++i)
+//  {
+//    buffer_.x_[i].s[0] =
+//      static_cast<cl_double>(rand())/static_cast<cl_double>(RAND_MAX);
+//    buffer_.x_[i].s[1] =
+//      static_cast<cl_double>(rand())/static_cast<cl_double>(RAND_MAX);
+//  }
+//
+//  for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+//  {
+//    for (size_t j = 0; j < buffer_.lda_; ++j)
+//    {
+//      if (i == j)
+//      {
+//        if (buffer_.diag_ == clblasUnit)
+//        {
+//          buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0;
+//          buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0;
+//        }
+//        else
+//        {
+//          buffer_.a_[i*buffer_.lda_+j].s[0] =
+//            static_cast<cl_double>(rand())/static_cast<cl_double>(RAND_MAX);
+//          buffer_.a_[i*buffer_.lda_+j].s[1] =
+//            static_cast<cl_double>(rand())/static_cast<cl_double>(RAND_MAX);
+//        }
+//      }
+//      else
+//      {
+//        buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0;
+//        buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0;
+//      }
+//    }
+//  }
+//}
+
+//template<>
+//double
+//xGer<cl_float2>::
+//gflops()
+//{
+//  return 2.0*buffer_.m_*(buffer_.m_+1)/time_in_ns();
+//}
+//
+//template<>
+//double
+//xGer<cl_double2>::
+//gflops()
+//{
+//  return 2.0*buffer_.m_*(buffer_.m_+1)/time_in_ns();
+//}
+//
+//template<>
+//std::string
+//xGer<cl_float2>::
+//gflops_formula()
+//{
+//  return "2.0*M*(M+1)/time";
+//}
+//
+//template<>
+//std::string
+//xGer<cl_double2>::
+//gflops_formula()
+//{
+//  return "2.0*M*(M+1)/time";
+//}
+
+
+#endif // ifndef CLBLAS_BENCHMARK_XGER_HXX__
diff --git a/src/client/clfunc_xgerc.hpp b/src/client/clfunc_xgerc.hpp
new file mode 100644
index 0000000..829d938
--- /dev/null
+++ b/src/client/clfunc_xgerc.hpp
@@ -0,0 +1,301 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XGERC_HXX__
+#define CLBLAS_BENCHMARK_XGERC_HXX__
+
+#include "clfunc_common.hpp"
+
+// Parameters and storage for one clblasCgerc / clblasZgerc benchmark run.
+// xGerc<T>::setup_buffer() fills in every field.
+template <typename T>
+struct xGercBuffer
+{
+	clblasOrder order; // clblasRowMajor when order_option == 0, else clblasColumnMajor
+  size_t M;  // element count of cpuX
+  size_t N;  // element count of cpuY
+  T alpha;   // scalar handed to the BLAS call
+  T* cpuX;   // host data for X, allocated with new T[M]
+  cl_mem X;  // OpenCL buffer for X
+  size_t offx; // offset of X handed to the BLAS call
+  int incx;    // increment of X handed to the BLAS call; setup_buffer() sets 1
+  T* cpuY;   // host data for Y, allocated with new T[N]
+  cl_mem Y;  // OpenCL buffer for Y
+  size_t offy; // offset of Y handed to the BLAS call
+  int incy;    // increment of Y handed to the BLAS call; setup_buffer() sets 1
+  T* cpuA;   // host data for A, allocated with new T[N * lda]
+  cl_mem A;  // OpenCL buffer for A
+  size_t offa; // offset of A handed to the BLAS call
+  size_t lda;  // leading dimension of A; defaults to M when 0 is requested
+}; // struct buffer
+
+// Benchmark wrapper that times clblasCgerc / clblasZgerc.  call_func() is
+// specialized for cl_float2 and cl_double2 further down in this header.
+template <typename T>
+class xGerc : public clblasFunc
+{
+public:
+  xGerc(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clGerc", 0);
+  }
+
+  // Releases the host arrays and OpenCL buffers created by setup_buffer().
+  ~xGerc()
+  {
+    // The host arrays come from new T[...], so they must be freed with
+    // delete[]; plain delete on them is undefined behaviour.
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuX;
+    delete[] buffer.cpuY;
+    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer X");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.Y), "releasing buffer Y");
+  }
+
+  // NOTE(review): the value is computed from N while gflops_formula() reports
+  // M; the two only agree for square problems -- confirm which is intended.
+  double gflops()
+  {
+    return (buffer.N*(buffer.N+1))/time_in_ns();
+  }
+
+  // Human-readable form of the expression reported by gflops().
+  std::string gflops_formula()
+  {
+    return "M*(M+1)/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+  void initialize_cpu_buffer();
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+
+  // Not implemented yet for this routine.
+  void read_gpu_buffer()
+  {
+    //cl_int err;
+    //to-do need to fill up
+  }
+
+  // Not implemented yet for this routine.
+  void roundtrip_func()
+  {//to-do need to fill up
+  }
+
+  // Not implemented yet for this routine; every argument is ignored.
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {}
+
+  void call_func();
+
+protected:
+  // Converts alpha to T.  beta is ignored: xGercBuffer has no beta field.
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer.alpha = makeScalar<T>(alpha);
+      //buffer.beta = makeScalar<T>(beta);
+  }
+
+private:
+  xGercBuffer<T> buffer;
+};
+
+// Records the problem description and allocates the host arrays and OpenCL
+// buffers for one run.  Only order_option, M, N, lda, offA, offB (stored as
+// offx), offC (stored as offy) and alpha influence the result; the remaining
+// arguments are accepted but not used.  Terminates the process when a
+// non-zero lda is smaller than M.  Throws via OPENCL_V_THROW when an OpenCL
+// buffer cannot be created.
+template <typename T>
+void xGerc<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offx = offB;
+  buffer.incx = 1;//If this changes, remember to adjust size of X in rest of the file
+  buffer.offy = offC;
+  buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file
+  buffer.M = M;
+  buffer.N = N;
+  if (order_option == 0)
+  {
+    buffer.order = clblasRowMajor;
+  }
+  else
+  {
+    buffer.order = clblasColumnMajor;
+  }
+  // lda == 0 requests the default leading dimension, which is M.
+  if (lda == 0)
+  {
+    buffer.lda = buffer.M;
+  }
+  else if (lda < buffer.M)
+  {
+    std::cerr << "lda:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.lda = lda;
+  }
+  buffer.cpuX = new T[buffer.M];
+  buffer.cpuY = new T[buffer.N];
+  buffer.cpuA = new T[buffer.N * buffer.lda];
+
+  // Every device buffer is sized as offset + data so that the offsets handed
+  // to the BLAS call (and used when uploading A) stay inside the allocation.
+  cl_int err;
+  // A is updated by the routine, so kernels must be allowed to write to it.
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N * buffer.lda + buffer.offa)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.M + buffer.offx)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer X");
+
+  buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N + buffer.offy)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer Y");
+}
+
+// Fills cpuA (N * lda elements), cpuX (M elements) and cpuY (N elements), in
+// that order, with values drawn after seeding rand() with a fixed seed so
+// that every run works on the same data.
+template <typename T>
+void xGerc<T>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  // The matrix is stored contiguously, so one linear pass visits the same
+  // elements in the same order as a row-by-row double loop.
+  T* const endA = buffer.cpuA + buffer.N * buffer.lda;
+  for (T* a = buffer.cpuA; a != endA; ++a)
+  {
+    *a = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  T* const endX = buffer.cpuX + buffer.M;
+  for (T* x = buffer.cpuX; x != endX; ++x)
+  {
+    *x = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  T* const endY = buffer.cpuY + buffer.N;
+  for (T* y = buffer.cpuY; y != endY; ++y)
+  {
+    *y = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+}
+
+//template <>
+//void xGerc<cl_float2>::initialize_cpu_buffer()
+//{
+//  srand(10);
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      for (size_t j = 0; j < buffer.lda; ++j)
+//      {
+//          buffer.cpuA[i*buffer.lda+j].s[0] = static_cast<cl_float>(rand())/
+//            static_cast<cl_float>(RAND_MAX);
+//          buffer.cpuA[i*buffer.lda+j].s[1] = static_cast<cl_float>(rand())/
+//            static_cast<cl_float>(RAND_MAX);
+//      }
+//  }
+//
+//  for (size_t i = 0; i < buffer.M; ++i)
+//  {
+//      buffer.cpuX[i].s[0] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//      buffer.cpuX[i].s[1] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//  }
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      buffer.cpuY[i].s[0] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//      buffer.cpuY[i].s[1] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//  }
+//}
+//template <>
+//void xGerc<cl_double2>::initialize_cpu_buffer()
+//{
+//  srand(10);
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      for (size_t j = 0; j < buffer.lda; ++j)
+//      {
+//          buffer.cpuA[i*buffer.lda+j].s[0] = static_cast<cl_double>(rand())/
+//            static_cast<cl_double>(RAND_MAX);
+//          buffer.cpuA[i*buffer.lda+j].s[1] = static_cast<cl_double>(rand())/
+//            static_cast<cl_double>(RAND_MAX);
+//      }
+//  }
+//
+//  for (size_t i = 0; i < buffer.M; ++i)
+//  {
+//      buffer.cpuX[i].s[0] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//      buffer.cpuX[i].s[1] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//  }
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      buffer.cpuY[i].s[0] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//      buffer.cpuY[i].s[1] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//  }
+//}
+//
+
+// Uploads the host arrays into the OpenCL buffers with blocking writes.
+// Each array is written at the same element offset that call_func() hands to
+// the BLAS routine, so the kernel reads the data that was uploaded.
+// Throws via OPENCL_V_THROW when a write fails.
+template <typename T>
+void xGerc<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.X, CL_TRUE,
+                              buffer.offx * sizeof(T),
+                              buffer.M*sizeof(T),
+                              buffer.cpuX, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer X");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.Y, CL_TRUE,
+                              buffer.offy * sizeof(T),
+                              buffer.N*sizeof(T),
+                              buffer.cpuY, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer Y");
+}
+
+// Restores A, the buffer the routine writes to, from its host copy using a
+// blocking write.  Throws via OPENCL_V_THROW when the write fails.
+template <typename T>
+void xGerc<T>::reset_gpu_write_buffer()
+{
+  cl_int err;
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer A");
+}
+
+// Times one clblasCgerc call; the timed span covers the enqueue and the wait
+// on the completion event.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xGerc<cl_float2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasCgerc(buffer.order, buffer.M, buffer.N, buffer.alpha, buffer.X, buffer.offx,
+    buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, 1, &queue_, 0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one clblasZgerc call; the timed span covers the enqueue and the wait
+// on the completion event.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xGerc<cl_double2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasZgerc(buffer.order, buffer.M, buffer.N, buffer.alpha, buffer.X, buffer.offx,
+    buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, 1, &queue_, 0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XGERC_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xgeru.hpp b/src/client/clfunc_xgeru.hpp
new file mode 100644
index 0000000..8c7d02c
--- /dev/null
+++ b/src/client/clfunc_xgeru.hpp
@@ -0,0 +1,236 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XGERU_HXX__
+#define CLBLAS_BENCHMARK_XGERU_HXX__
+
+#include "clfunc_common.hpp"
+
+// Parameters and storage for one clblasCgeru / clblasZgeru benchmark run.
+// xGeru<T>::setup_buffer() fills in every field.
+template <typename T>
+struct xGeruBuffer
+{
+	clblasOrder order; // clblasRowMajor when order_option == 0, else clblasColumnMajor
+  size_t M;  // element count of cpuX
+  size_t N;  // element count of cpuY
+  T alpha;   // scalar handed to the BLAS call
+  T* cpuX;   // host data for X, allocated with new T[M]
+  cl_mem X;  // OpenCL buffer for X
+  size_t offx; // offset of X handed to the BLAS call
+  int incx;    // increment of X handed to the BLAS call; setup_buffer() sets 1
+  T* cpuY;   // host data for Y, allocated with new T[N]
+  cl_mem Y;  // OpenCL buffer for Y
+  size_t offy; // offset of Y handed to the BLAS call
+  int incy;    // increment of Y handed to the BLAS call; setup_buffer() sets 1
+  T* cpuA;   // host data for A, allocated with new T[N * lda]
+  cl_mem A;  // OpenCL buffer for A
+  size_t offa; // offset of A handed to the BLAS call
+  size_t lda;  // leading dimension of A; defaults to M when 0 is requested
+}; // struct buffer
+
+// Benchmark wrapper that times clblasCgeru / clblasZgeru.  call_func() is
+// specialized for cl_float2 and cl_double2 further down in this header.
+template <typename T>
+class xGeru : public clblasFunc
+{
+public:
+  xGeru(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clGeru", 0);
+  }
+
+  // Releases everything setup_buffer() allocates: three host arrays and
+  // three OpenCL buffers.
+  ~xGeru()
+  {
+    // The host arrays come from new T[...], so they must be freed with
+    // delete[]; plain delete on them is undefined behaviour.
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuX;
+    delete[] buffer.cpuY;
+    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer X");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.Y), "releasing buffer Y");
+  }
+
+  // NOTE(review): the value is computed from N while gflops_formula() reports
+  // M; the two only agree for square problems -- confirm which is intended.
+  double gflops()
+  {
+    return (buffer.N*(buffer.N+1))/time_in_ns();
+  }
+
+  // Human-readable form of the expression reported by gflops().
+  std::string gflops_formula()
+  {
+    return "M*(M+1)/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+  void initialize_cpu_buffer();
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+  void call_func();
+
+  // Not implemented yet for this routine.
+  void read_gpu_buffer()
+  {
+    //cl_int err;
+    //to-do need to fill up
+  }
+
+  // Not implemented yet for this routine.
+  void roundtrip_func()
+  {//to-do need to fill up
+  }
+
+  // Not implemented yet for this routine; every argument is ignored.
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {}
+
+protected:
+  // Converts alpha to T.  beta is ignored: xGeruBuffer has no beta field.
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer.alpha = makeScalar<T>(alpha);
+      //buffer.beta = makeScalar<T>(beta);
+  }
+
+private:
+  xGeruBuffer<T> buffer;
+};
+
+// Records the problem description and allocates the host arrays and OpenCL
+// buffers for one run.  Only order_option, M, N, lda, offA, offB (stored as
+// offx), offC (stored as offy) and alpha influence the result; the remaining
+// arguments are accepted but not used.  Terminates the process when a
+// non-zero lda is smaller than M.  Throws via OPENCL_V_THROW when an OpenCL
+// buffer cannot be created.
+template <typename T>
+void xGeru<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offx = offB;
+  buffer.incx = 1;//If this changes, remember to adjust size of X in rest of the file
+  buffer.offy = offC;
+  buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file
+  buffer.M = M;
+  buffer.N = N;
+  if (order_option == 0)
+  {
+    buffer.order = clblasRowMajor;
+  }
+  else
+  {
+    buffer.order = clblasColumnMajor;
+  }
+  // lda == 0 requests the default leading dimension, which is M.
+  if (lda == 0)
+  {
+    buffer.lda = buffer.M;
+  }
+  else if (lda < buffer.M)
+  {
+    std::cerr << "lda:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.lda = lda;
+  }
+  buffer.cpuX = new T[buffer.M];
+  buffer.cpuY = new T[buffer.N];
+  buffer.cpuA = new T[buffer.N * buffer.lda];
+
+  // Every device buffer is sized as offset + data so that the offsets handed
+  // to the BLAS call (and used when uploading A) stay inside the allocation.
+  cl_int err;
+  // A is updated by the routine, so kernels must be allowed to write to it.
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N * buffer.lda + buffer.offa)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.M + buffer.offx)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer X");
+
+  buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N + buffer.offy)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer Y");
+}
+
+// Fills cpuA (N * lda elements), cpuX (M elements) and cpuY (N elements), in
+// that order, with values drawn after seeding rand() with a fixed seed so
+// that every run works on the same data.
+template <typename T>
+void xGeru<T>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  // The matrix is stored contiguously, so one linear pass visits the same
+  // elements in the same order as a row-by-row double loop.
+  T* const endA = buffer.cpuA + buffer.N * buffer.lda;
+  for (T* a = buffer.cpuA; a != endA; ++a)
+  {
+    *a = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  T* const endX = buffer.cpuX + buffer.M;
+  for (T* x = buffer.cpuX; x != endX; ++x)
+  {
+    *x = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  T* const endY = buffer.cpuY + buffer.N;
+  for (T* y = buffer.cpuY; y != endY; ++y)
+  {
+    *y = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+}
+
+// Uploads the host arrays into the OpenCL buffers with blocking writes.
+// Each array is written at the same element offset that call_func() hands to
+// the BLAS routine, so the kernel reads the data that was uploaded.
+// Throws via OPENCL_V_THROW when a write fails.
+template <typename T>
+void xGeru<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.X, CL_TRUE,
+                              buffer.offx * sizeof(T),
+                              buffer.M*sizeof(T),
+                              buffer.cpuX, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer X");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.Y, CL_TRUE,
+                              buffer.offy * sizeof(T),
+                              buffer.N*sizeof(T),
+                              buffer.cpuY, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer Y");
+}
+
+// Restores A, the buffer the routine writes to, from its host copy using a
+// blocking write.  Throws via OPENCL_V_THROW when the write fails.
+template <typename T>
+void xGeru<T>::reset_gpu_write_buffer()
+{
+  cl_int err;
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer A");
+}
+
+// Times one clblasCgeru call; the timed span covers the enqueue and the wait
+// on the completion event.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xGeru<cl_float2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasCgeru(buffer.order, buffer.M, buffer.N, buffer.alpha, buffer.X, buffer.offx,
+    buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, 1, &queue_, 0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one clblasZgeru call; the timed span covers the enqueue and the wait
+// on the completion event.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xGeru<cl_double2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasZgeru(buffer.order, buffer.M, buffer.N, buffer.alpha, buffer.X, buffer.offx,
+    buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, 1, &queue_, 0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XGERU_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xhemm.hpp b/src/client/clfunc_xhemm.hpp
new file mode 100644
index 0000000..8e46d1e
--- /dev/null
+++ b/src/client/clfunc_xhemm.hpp
@@ -0,0 +1,394 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHEMM_HXX__
+#define CLBLAS_BENCHMARK_XHEMM_HXX__
+
+#include "clfunc_common.hpp"
+//clblasChemm(
+//    clblasOrder order,
+//    clblasSide side,
+//    clblasUplo uplo,
+//    size_t M,
+//    size_t N,
+//    cl_float2 alpha,
+//    const cl_mem A,
+//    size_t offa,
+//    size_t lda,
+//    const cl_mem B,
+//    size_t offb,
+//    size_t ldb,
+//    cl_float2 beta,
+//    cl_mem C,
+//    size_t offc,
+//    size_t ldc,
+//    cl_uint numCommandQueues,
+//    cl_command_queue *commandQueues,
+//    cl_uint numEventsInWaitList,
+//    const cl_event *eventWaitList);
+// Parameters and storage for one clblasChemm / clblasZhemm benchmark run.
+// xHemm<T>::setup_buffer() fills in every field.
+template <typename T>
+struct xHemmBuffer
+{
+	clblasOrder order; // clblasRowMajor when order_option == 0, else clblasColumnMajor
+  clblasSide side;   // clblasLeft when side_option == 0, else clblasRight
+  clblasUplo uplo;   // clblasUpper when uplo_option == 0, else clblasLower
+  size_t M;
+  size_t N;
+  T alpha;           // scalar handed to the BLAS call
+  T* cpuA;           // host data for A, allocated with new T[a_num_vectors * lda]
+  size_t a_num_vectors; // M when side is left, N when side is right
+  cl_mem A;          // OpenCL buffer for A
+  size_t offa;       // offset of A handed to the BLAS call
+  size_t lda;        // leading dimension of A; defaults to a_num_vectors when 0 is requested
+  T* cpuB;           // host data for B, allocated with new T[N * ldb]
+  cl_mem B;          // OpenCL buffer for B
+  size_t offb;       // offset of B handed to the BLAS call
+  size_t ldb;        // leading dimension of B; defaults to M when 0 is requested
+  T beta;            // scalar handed to the BLAS call
+  T* cpuC;           // host data for C, allocated with new T[N * ldc]
+  cl_mem C;          // OpenCL buffer for C
+  size_t offc;       // offset of C handed to the BLAS call
+  size_t ldc;        // leading dimension of C; defaults to M when 0 is requested
+}; // struct buffer
+
+// Benchmark wrapper that times clblasChemm / clblasZhemm.  call_func() and
+// initialize_cpu_buffer() are specialized for cl_float2 and cl_double2
+// further down in this header.
+template <typename T>
+class xHemm : public clblasFunc
+{
+public:
+  xHemm(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clHemm", 0);
+  }
+
+  // Releases the host arrays and OpenCL buffers created by setup_buffer().
+  ~xHemm()
+  {
+    // The host arrays come from new T[...], so they must be freed with
+    // delete[]; plain delete on them is undefined behaviour.
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuB;
+    delete[] buffer.cpuC;
+    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
+  }
+
+  // NOTE(review): the value is computed from N while gflops_formula() reports
+  // M; the two only agree for square problems -- confirm which is intended.
+  double gflops()
+  {
+    return (buffer.N*(buffer.N+1))/time_in_ns();
+  }
+
+  // Human-readable form of the expression reported by gflops().
+  std::string gflops_formula()
+  {
+    return "M*(M+1)/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+
+  // The generic version does nothing; the cl_float2 and cl_double2
+  // specializations below fill the host arrays.
+  void initialize_cpu_buffer(){}
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+  void call_func();
+
+  // Not implemented yet for this routine.
+  void read_gpu_buffer()
+  {
+    //cl_int err;
+    //to-do need to fill up
+  }
+
+  // Not implemented yet for this routine.
+  void roundtrip_func()
+  {//to-do need to fill up
+  }
+
+  // Not implemented yet for this routine; every argument is ignored.
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {}
+
+protected:
+  // Converts alpha and beta to T and stores them in the buffer.
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer.alpha = makeScalar<T>(alpha);
+      buffer.beta = makeScalar<T>(beta);
+  }
+
+private:
+  xHemmBuffer<T> buffer;
+};
+
+// Records the problem description and allocates the host arrays and OpenCL
+// buffers for one run.  diag_option, transA_option, transB_option and K are
+// accepted but not used.  A has M vectors when side is left and N vectors
+// when side is right.  A leading dimension of 0 requests the default; a
+// non-zero value below the minimum terminates the process.  Throws via
+// OPENCL_V_THROW when an OpenCL buffer cannot be created.
+// NOTE(review): ldb/ldc are validated against M and B/C are sized N * ld for
+// either storage order -- confirm this is intended for row-major runs.
+template <typename T>
+void xHemm<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offb = offB;
+  buffer.offc = offC;
+  buffer.M = M;
+  buffer.N = N;
+  if (order_option == 0)
+  {
+    buffer.order = clblasRowMajor;
+  }
+  else
+  {
+    buffer.order = clblasColumnMajor;
+  }
+  if (uplo_option == 0)
+  {
+    buffer.uplo = clblasUpper;
+  }
+  else
+  {
+    buffer.uplo = clblasLower;
+  }
+  if (side_option == 0)
+  {
+      buffer.side = clblasLeft;
+      buffer.a_num_vectors = M;
+      if (lda == 0)
+      {
+        buffer.lda = buffer.M;
+      }
+      else if (lda < buffer.M)
+      {
+        std::cerr << "lda:wrong size\n";
+        exit(1);
+      }
+      else
+      {
+        buffer.lda = lda;
+      }
+  }
+  else
+  {
+      buffer.side = clblasRight;
+      buffer.a_num_vectors = N;
+      if (lda == 0)
+      {
+        buffer.lda = buffer.N;
+      }
+      else if (lda < buffer.N)
+      {
+        std::cerr << "lda:wrong size\n";
+        exit(1);
+      }
+      else
+      {
+        buffer.lda = lda;
+      }
+  }
+  if (ldb == 0)
+  {
+    buffer.ldb = buffer.M;
+  }
+  else if (ldb < buffer.M)
+  {
+    std::cerr << "ldb:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.ldb = ldb;
+  }
+  if (ldc == 0)
+  {
+    buffer.ldc = buffer.M;
+  }
+  else if (ldc < buffer.M)
+  {
+    std::cerr << "ldc:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.ldc = ldc;
+  }
+  buffer.cpuB = new T[buffer.N * buffer.ldb];
+  buffer.cpuC = new T[buffer.N * buffer.ldc];
+  buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
+
+  // Every device buffer is sized as offset + data so that the offsets handed
+  // to the BLAS call (and used when uploading A) stay inside the allocation.
+  cl_int err;
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                            (buffer.a_num_vectors * buffer.lda + buffer.offa)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N*buffer.ldb + buffer.offb)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer B");
+
+  buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N*buffer.ldc + buffer.offc)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer C");
+}
+
+// Fills cpuA, cpuB and cpuC (in that order) with rand()/RAND_MAX values for
+// both components of every element, using a fixed seed so that every run
+// works on the same data.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xHemm<cl_float2>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  // Each matrix is stored contiguously, so one linear pass per matrix visits
+  // the same elements in the same order as a vector-by-vector double loop.
+  const size_t countA = buffer.a_num_vectors * buffer.lda;
+  for (size_t k = 0; k < countA; ++k)
+  {
+      buffer.cpuA[k].s[0] = static_cast<cl_float>(rand())/
+        static_cast<cl_float>(RAND_MAX);
+      buffer.cpuA[k].s[1] = static_cast<cl_float>(rand())/
+        static_cast<cl_float>(RAND_MAX);
+  }
+
+  const size_t countB = buffer.N * buffer.ldb;
+  for (size_t k = 0; k < countB; ++k)
+  {
+      buffer.cpuB[k].s[0] = static_cast<cl_float>(rand())/
+        static_cast<cl_float>(RAND_MAX);
+      buffer.cpuB[k].s[1] = static_cast<cl_float>(rand())/
+        static_cast<cl_float>(RAND_MAX);
+  }
+
+  const size_t countC = buffer.N * buffer.ldc;
+  for (size_t k = 0; k < countC; ++k)
+  {
+      buffer.cpuC[k].s[0] = static_cast<cl_float>(rand())/
+        static_cast<cl_float>(RAND_MAX);
+      buffer.cpuC[k].s[1] = static_cast<cl_float>(rand())/
+        static_cast<cl_float>(RAND_MAX);
+  }
+}
+// Fills cpuA, cpuB and cpuC (in that order) with rand()/RAND_MAX values for
+// both components of every element, using a fixed seed so that every run
+// works on the same data.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xHemm<cl_double2>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  // Each matrix is stored contiguously, so one linear pass per matrix visits
+  // the same elements in the same order as a vector-by-vector double loop.
+  const size_t countA = buffer.a_num_vectors * buffer.lda;
+  for (size_t k = 0; k < countA; ++k)
+  {
+      buffer.cpuA[k].s[0] = static_cast<cl_double>(rand())/
+        static_cast<cl_double>(RAND_MAX);
+      buffer.cpuA[k].s[1] = static_cast<cl_double>(rand())/
+        static_cast<cl_double>(RAND_MAX);
+  }
+
+  const size_t countB = buffer.N * buffer.ldb;
+  for (size_t k = 0; k < countB; ++k)
+  {
+      buffer.cpuB[k].s[0] = static_cast<cl_double>(rand())/
+        static_cast<cl_double>(RAND_MAX);
+      buffer.cpuB[k].s[1] = static_cast<cl_double>(rand())/
+        static_cast<cl_double>(RAND_MAX);
+  }
+
+  const size_t countC = buffer.N * buffer.ldc;
+  for (size_t k = 0; k < countC; ++k)
+  {
+      buffer.cpuC[k].s[0] = static_cast<cl_double>(rand())/
+        static_cast<cl_double>(RAND_MAX);
+      buffer.cpuC[k].s[1] = static_cast<cl_double>(rand())/
+        static_cast<cl_double>(RAND_MAX);
+  }
+}
+
+
+// Uploads the host arrays into the OpenCL buffers with blocking writes.
+// Each array is written at the same element offset that call_func() hands to
+// the BLAS routine, so the kernel reads the data that was uploaded.
+// Throws via OPENCL_V_THROW when a write fails.
+template <typename T>
+void xHemm<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.a_num_vectors * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+                              buffer.offb * sizeof(T),
+                              buffer.ldb*buffer.N*sizeof(T),
+                              buffer.cpuB, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer B");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+                              buffer.offc * sizeof(T),
+                              buffer.ldc*buffer.N*sizeof(T),
+                              buffer.cpuC, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer C");
+}
+
+// Restores C, the buffer the routine writes to, from its host copy using a
+// blocking write at the offset that call_func() hands to the BLAS routine.
+// Throws via OPENCL_V_THROW when the write fails.
+template <typename T>
+void xHemm<T>::reset_gpu_write_buffer()
+{
+  cl_int err;
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+                              buffer.offc * sizeof(T),
+                              buffer.ldc*buffer.N*sizeof(T),
+                              buffer.cpuC, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer C");
+}
+
+// Times one clblasChemm call; the timed span covers the enqueue and the wait
+// on the completion event.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xHemm<cl_float2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasChemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one clblasZhemm call; the timed span covers the enqueue and the wait
+// on the completion event.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xHemm<cl_double2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasZhemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XHEMM_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xhemv.hpp b/src/client/clfunc_xhemv.hpp
new file mode 100644
index 0000000..570c3fc
--- /dev/null
+++ b/src/client/clfunc_xhemv.hpp
@@ -0,0 +1,275 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHEMV_HXX__
+#define CLBLAS_BENCHMARK_XHEMV_HXX__
+
+#include "clfunc_common.hpp"
+
+// Parameters and storage for one clblasChemv / clblasZhemv benchmark run.
+// xHemv<T>::setup_buffer() fills in every field.
+template <typename T>
+struct xHemvBuffer
+{
+	clblasOrder order; // clblasRowMajor when order_option == 0, else clblasColumnMajor
+  clblasUplo uplo;   // clblasUpper when uplo_option == 0, else clblasLower
+  size_t N;    // problem size; setup_buffer() takes it from its M argument
+  T alpha;     // scalar handed to the BLAS call
+  T* cpuX;     // host data for X, allocated with new T[N]
+  cl_mem X;    // OpenCL buffer for X
+  size_t offx; // offset of X handed to the BLAS call
+  int incx;    // increment of X handed to the BLAS call; setup_buffer() sets 1
+  T beta;      // scalar handed to the BLAS call
+  T* cpuY;     // host data for Y, allocated with new T[N]
+  cl_mem Y;    // OpenCL buffer for Y
+  size_t offy; // offset of Y handed to the BLAS call
+  int incy;    // increment of Y handed to the BLAS call; setup_buffer() sets 1
+  T* cpuA;     // host data for A, allocated with new T[N * lda]
+  cl_mem A;    // OpenCL buffer for A
+  size_t offa; // offset of A handed to the BLAS call
+  size_t lda;  // leading dimension of A; defaults to N when 0 is requested
+}; // struct buffer
+
+// Benchmark wrapper that times clblasChemv / clblasZhemv.  call_func(),
+// gflops() and gflops_formula() are specialized for cl_float2 and cl_double2
+// further down in this header.
+template <typename T>
+class xHemv : public clblasFunc
+{
+public:
+  xHemv(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clHemv", 0);
+  }
+
+  // Releases everything setup_buffer() allocates: three host arrays and
+  // three OpenCL buffers.
+  ~xHemv()
+  {
+    // The host arrays come from new T[...], so they must be freed with
+    // delete[]; plain delete on them is undefined behaviour.
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuX;
+    delete[] buffer.cpuY;
+    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer X");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.Y), "releasing buffer Y");
+  }
+
+  // Generic flop rate: 2*N*N divided by the measured time.
+  double gflops()
+  {
+    return static_cast<double>((2 * buffer.N * buffer.N)/time_in_ns());
+  }
+
+  // Human-readable form of the expression reported by gflops().
+  std::string gflops_formula()
+  {
+    return "2*N*N/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+  void initialize_cpu_buffer();
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+  void call_func();
+
+  // Not implemented yet for this routine.
+  void read_gpu_buffer()
+  {
+    //cl_int err;
+    //to-do need to fill up
+  }
+
+  // Not implemented yet for this routine.
+  void roundtrip_func()
+  {//to-do need to fill up
+  }
+
+  // Not implemented yet for this routine; every argument is ignored.
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {}
+
+protected:
+  // Converts alpha and beta to T and stores them in the buffer.
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer.alpha = makeScalar<T>(alpha);
+      buffer.beta = makeScalar<T>(beta);
+  }
+
+private:
+  xHemvBuffer<T> buffer;
+};
+
+// Records the problem description and allocates the host arrays and OpenCL
+// buffers for one run.  Only order_option, uplo_option, M, lda, offA, offB
+// (stored as offx), offC (stored as offy), alpha and beta influence the
+// result; the remaining arguments are accepted but not used.  Terminates the
+// process when a non-zero lda is smaller than the problem size.  Throws via
+// OPENCL_V_THROW when an OpenCL buffer cannot be created.
+template <typename T>
+void xHemv<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offx = offB;
+  buffer.incx = 1;//If this changes, remember to adjust size of X in rest of the file
+  buffer.offy = offC;
+  buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file
+  // The problem size is taken from the M argument; the N argument is unused.
+  buffer.N = M;
+  if (order_option == 0)
+  {
+    buffer.order = clblasRowMajor;
+  }
+  else
+  {
+    buffer.order = clblasColumnMajor;
+  }
+  if (uplo_option == 0)
+  {
+    buffer.uplo = clblasUpper;
+  }
+  else
+  {
+    buffer.uplo = clblasLower;
+  }
+  // lda == 0 requests the default leading dimension, which is the problem size.
+  if (lda == 0)
+  {
+    buffer.lda = buffer.N;
+  }
+  else if (lda < buffer.N)
+  {
+    std::cerr << "lda:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.lda = lda;
+  }
+  buffer.cpuX = new T[buffer.N];
+  buffer.cpuY = new T[buffer.N];
+  buffer.cpuA = new T[buffer.N * buffer.lda];
+
+  // Every device buffer is sized as offset + data so that the offsets handed
+  // to the BLAS call (and used when uploading A) stay inside the allocation.
+  cl_int err;
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                            (buffer.N * buffer.lda + buffer.offa)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N + buffer.offx)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer X");
+
+  buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N + buffer.offy)*sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer Y");
+}
+
+// Fills cpuA (N * lda elements) and then cpuX / cpuY (N elements each,
+// interleaved element by element) with values drawn after seeding rand()
+// with a fixed seed so that every run works on the same data.
+template <typename T>
+void xHemv<T>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  // The matrix is stored contiguously, so one linear pass visits the same
+  // elements in the same order as a row-by-row double loop.
+  T* const endA = buffer.cpuA + buffer.N * buffer.lda;
+  for (T* a = buffer.cpuA; a != endA; ++a)
+  {
+    *a = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  // X[k] is drawn before Y[k] for every k, keeping the draw order unchanged.
+  T* x = buffer.cpuX;
+  T* y = buffer.cpuY;
+  for (size_t left = buffer.N; left != 0; --left, ++x, ++y)
+  {
+    *x = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+    *y = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+}
+
+// Uploads the host arrays into the OpenCL buffers with blocking writes.
+// Each array is written at the same element offset that call_func() hands to
+// the BLAS routine, so the kernel reads the data that was uploaded.
+// Throws via OPENCL_V_THROW when a write fails.
+template <typename T>
+void xHemv<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.X, CL_TRUE,
+                              buffer.offx * sizeof(T),
+                              buffer.N*sizeof(T),
+                              buffer.cpuX, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer X");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.Y, CL_TRUE,
+                              buffer.offy * sizeof(T),
+                              buffer.N*sizeof(T),
+                              buffer.cpuY, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer Y");
+}
+
+// Restores Y from its host copy using a blocking write.  Y is the only
+// non-const buffer argument of the hemv call, i.e. the one it overwrites;
+// A is an input and does not need to be re-uploaded between runs.
+// Throws via OPENCL_V_THROW when the write fails.
+template <typename T>
+void xHemv<T>::reset_gpu_write_buffer()
+{
+  cl_int err;
+  err = clEnqueueWriteBuffer(queue_, buffer.Y, CL_TRUE,
+                              buffer.offy * sizeof(T),
+                              buffer.N*sizeof(T),
+                              buffer.cpuY, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer Y");
+}
+
+// Times one clblasChemv call; the timed span covers the enqueue and the wait
+// on the completion event.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xHemv<cl_float2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasChemv(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.A,
+                 buffer.offa, buffer.lda, buffer.X, buffer.offx, buffer.incx,
+                 buffer.beta, buffer.Y, buffer.offy, buffer.incy, 1, &queue_, 0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one clblasZhemv call; the timed span covers the enqueue and the wait
+// on the completion event.  Declared inline because an explicit
+// specialization defined in a header is an ordinary function and would
+// otherwise be defined once per including translation unit.
+template <>
+inline void xHemv<cl_double2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasZhemv(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.A,
+                 buffer.offa, buffer.lda, buffer.X, buffer.offx, buffer.incx,
+                 buffer.beta, buffer.Y, buffer.offy, buffer.incy, 1, &queue_, 0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// cl_float2 flop rate: 8*N*N divided by the measured time.  Declared inline
+// because an explicit specialization defined in a header would otherwise be
+// defined once per including translation unit.
+template<>
+inline double
+xHemv<cl_float2>::
+gflops()
+{
+  return static_cast<double>((8 * buffer.N * buffer.N)/time_in_ns());
+}
+
+// cl_double2 flop rate: 8*N*N divided by the measured time.  Declared inline
+// because an explicit specialization defined in a header would otherwise be
+// defined once per including translation unit.
+template<>
+inline double
+xHemv<cl_double2>::
+gflops()
+{
+  return static_cast<double>((8 * buffer.N * buffer.N)/time_in_ns());
+}
+
+// Human-readable form of the cl_float2 gflops() expression.  Declared inline
+// because an explicit specialization defined in a header would otherwise be
+// defined once per including translation unit.
+template<>
+inline std::string
+xHemv<cl_float2>::
+gflops_formula()
+{
+  return "8*N*N/time";
+}
+
+// Human-readable form of the cl_double2 gflops() expression.  Declared inline
+// because an explicit specialization defined in a header would otherwise be
+// defined once per including translation unit.
+template<>
+inline std::string
+xHemv<cl_double2>::
+gflops_formula()
+{
+  return "8*N*N/time";
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XHEMV_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xher.hpp b/src/client/clfunc_xher.hpp
new file mode 100644
index 0000000..e624b55
--- /dev/null
+++ b/src/client/clfunc_xher.hpp
@@ -0,0 +1,305 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHER_HXX__
+#define CLBLAS_BENCHMARK_XHER_HXX__
+
+#include "clfunc_common.hpp"
+
+// Parameters and storage for one HER benchmark run.  All fields are filled
+// in by xHer<T>::setup_buffer().
+template <typename T>
+struct xHerBuffer
+{
+	clblasOrder order;  // clblasRowMajor when order_option == 0, else clblasColumnMajor
+  clblasUplo uplo;    // clblasUpper when uplo_option == 0, else clblasLower
+  size_t N;           // problem size; taken from the M argument of setup_buffer()
+  T alpha;            // scalar; call_func() passes only alpha.s[0] to the library
+  T* cpuX;            // host copy of X, N elements (new T[N])
+  cl_mem X;           // device buffer for X
+  size_t offx;        // element offset of X, taken from offB
+  int incx;           // stride of X; setup_buffer() fixes it to 1
+  T* cpuA;            // host copy of A, N * lda elements
+  cl_mem A;           // device buffer for A
+  size_t offa;        // element offset of A, taken from offA
+  size_t lda;         // leading dimension of A; N when the caller passes 0
+}; // struct buffer
+
+/**
+ * Benchmark driver for the Hermitian rank-1 update routines; call_func() is
+ * specialised further down for cl_float2 (clblasCher) and cl_double2
+ * (clblasZher).
+ *
+ * Typical use: setup_buffer(), initialize_cpu_buffer(),
+ * initialize_gpu_buffer(), then call_func() one or more times with
+ * reset_gpu_write_buffer() in between.  The round-trip entry points are
+ * placeholders and do nothing.
+ */
+template <typename T>
+class xHer : public clblasFunc
+{
+public:
+  xHer(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clHer", 0);
+
+    // Start from a known-empty state so the destructor is well defined even
+    // when setup_buffer() was never called.
+    buffer.cpuA = NULL;
+    buffer.cpuX = NULL;
+    buffer.A = NULL;
+    buffer.X = NULL;
+  }
+
+  ~xHer()
+  {
+    // The host arrays are allocated with new T[...] in setup_buffer(), so
+    // they must be released with delete[] (plain delete is undefined).
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuX;
+    if (buffer.A != NULL)
+    {
+      OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    }
+    if (buffer.X != NULL)
+    {
+      OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer X");
+    }
+  }
+
+  // Generic throughput figure; the cl_float2 / cl_double2 specialisations
+  // below replace it with the 4*N*N variant.
+  double gflops()
+  {
+    return static_cast<double>((buffer.N * buffer.N)/time_in_ns());
+  }
+
+  // Text form of the expression used by gflops().
+  std::string gflops_formula()
+  {
+    return "N*N/time";
+  }
+
+  // Translates the generic benchmark options into HER parameters and
+  // allocates host and device storage.  Defined after the class.
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+  void initialize_cpu_buffer();
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+  void call_func();
+
+  void read_gpu_buffer()
+  {
+    // Not implemented for this routine yet.
+  }
+
+  void roundtrip_func()
+  {
+    // Not implemented for this routine yet.
+  }
+
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {
+    // Not implemented for this routine yet.
+  }
+
+protected:
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer.alpha = makeScalar<T>(alpha);
+      // xHerBuffer has no beta member, so beta is ignored here.
+  }
+
+private:
+  xHerBuffer<T> buffer;
+};
+
+/**
+ * Translate the generic benchmark options into HER parameters and allocate
+ * the host arrays and device buffers.
+ *
+ * Only order_option, uplo_option, M, lda, offA, offB and alpha are used:
+ *   - M becomes the problem size N,
+ *   - offA / offB become the element offsets of A and X,
+ *   - lda == 0 selects lda = N; a non-zero lda smaller than N prints an
+ *     error and terminates the process with exit(1).
+ *
+ * The device buffers include the element offsets, because the library call
+ * addresses A and X starting at offa / offx and initialize_gpu_buffer()
+ * writes A at byte offset offa * sizeof(T).  A is created read-write since
+ * HER updates A in place.
+ *
+ * Raises through OPENCL_V_THROW if a device buffer cannot be created.
+ */
+template <typename T>
+void xHer<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offx = offB;
+  buffer.incx = 1; // if this changes, the size of X below must change too
+  buffer.N = M;
+  buffer.order = (order_option == 0) ? clblasRowMajor : clblasColumnMajor;
+  buffer.uplo = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+  if (lda == 0)
+  {
+    buffer.lda = buffer.N;
+  }
+  else if (lda < buffer.N)
+  {
+    std::cerr << "lda:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.lda = lda;
+  }
+
+  buffer.cpuX = new T[buffer.N];
+  buffer.cpuA = new T[buffer.N * buffer.lda];
+
+  cl_int err;
+  // A is the output of the rank-1 update, so kernels must be able to write it.
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offa + buffer.N * buffer.lda) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offx + buffer.N) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer X");
+}
+
+// Fills the host copies of A (N * lda elements) and X (N elements) with
+// pseudo-random values.  The generator is re-seeded with a fixed value so
+// every run produces the same data; A is filled first, then X.
+template <typename T>
+void xHer<T>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  // Rows are stored back to back with stride lda, so walking the array
+  // linearly visits the elements in the same order as a row/column nest.
+  const size_t a_count = buffer.N * buffer.lda;
+  for (size_t idx = 0; idx < a_count; ++idx)
+  {
+    buffer.cpuA[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  for (size_t k = 0; k < buffer.N; ++k)
+  {
+    buffer.cpuX[k] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+}
+//
+//template <>
+//void xHer<cl_float2>::initialize_cpu_buffer()
+//{
+//  srand(10);
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      for (size_t j = 0; j < buffer.lda; ++j)
+//      {
+//          buffer.cpuA[i*buffer.lda+j].s[0] = static_cast<cl_float>(rand())/
+//            static_cast<cl_float>(RAND_MAX);
+//          buffer.cpuA[i*buffer.lda+j].s[1] = static_cast<cl_float>(rand())/
+//            static_cast<cl_float>(RAND_MAX);
+//      }
+//  }
+//
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      buffer.cpuX[i].s[0] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//      buffer.cpuX[i].s[1] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//  }
+//}
+//template <>
+//void xHer<cl_double2>::initialize_cpu_buffer()
+//{
+//  srand(10);
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      for (size_t j = 0; j < buffer.lda; ++j)
+//      {
+//          buffer.cpuA[i*buffer.lda+j].s[0] = static_cast<cl_double>(rand())/
+//            static_cast<cl_double>(RAND_MAX);
+//          buffer.cpuA[i*buffer.lda+j].s[1] = static_cast<cl_double>(rand())/
+//            static_cast<cl_double>(RAND_MAX);
+//      }
+//  }
+//
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      buffer.cpuX[i].s[0] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//      buffer.cpuX[i].s[1] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//  }
+//}
+
+
+/**
+ * Upload the host copies of A and X to their device buffers with blocking
+ * writes.  A is written at byte offset offa * sizeof(T); X is written from
+ * the start of its buffer.
+ *
+ * Raises through OPENCL_V_THROW if either write is rejected (for example
+ * when the offset plus size does not fit the device buffer).
+ */
+template <typename T>
+void xHer<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.X, CL_TRUE, 0,
+                              buffer.N*sizeof(T),
+                              buffer.cpuX, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer X");
+}
+
+/**
+ * Restore the device copy of A (the matrix the library call updates) from
+ * the host copy, using the same offset and size as initialize_gpu_buffer().
+ *
+ * Raises through OPENCL_V_THROW if the write is rejected.
+ */
+template <typename T>
+void xHer<T>::reset_gpu_write_buffer()
+{
+  cl_int err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                                    buffer.offa * sizeof(T),
+                                    buffer.N * buffer.lda*sizeof(T),
+                                    buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer A");
+}
+
+// Times one single-precision complex HER: the timer brackets the clblasCher
+// enqueue and the wait on its completion event.  Only the first component
+// of alpha (alpha.s[0]) is handed to the library.
+template <>
+void xHer<cl_float2>::call_func()
+{
+  timer.Start(timer_id);
+
+  clblasCher(buffer.order, buffer.uplo, buffer.N,
+             buffer.alpha.s[0],
+             buffer.X, buffer.offx, buffer.incx,
+             buffer.A, buffer.offa, buffer.lda,
+             1, &queue_,   // a single command queue
+             0, NULL,      // no events to wait on
+             &event_);
+
+  // Wait for the enqueued work before stopping the clock.
+  clWaitForEvents(1, &event_);
+
+  timer.Stop(timer_id);
+}
+
+// Times one double-precision complex HER: the timer brackets the clblasZher
+// enqueue and the wait on its completion event.  Only the first component
+// of alpha (alpha.s[0]) is handed to the library.
+template <>
+void xHer<cl_double2>::call_func()
+{
+  timer.Start(timer_id);
+
+  clblasZher(buffer.order, buffer.uplo, buffer.N,
+             buffer.alpha.s[0],
+             buffer.X, buffer.offx, buffer.incx,
+             buffer.A, buffer.offa, buffer.lda,
+             1, &queue_,   // a single command queue
+             0, NULL,      // no events to wait on
+             &event_);
+
+  // Wait for the enqueued work before stopping the clock.
+  clWaitForEvents(1, &event_);
+
+  timer.Stop(timer_id);
+}
+
+// Throughput figure for single-precision complex HER: 4*N*N divided by
+// time_in_ns(), the same expression gflops_formula() reports as text.
+template<>
+double
+xHer<cl_float2>::
+gflops()
+{
+  const size_t op_count = 4 * buffer.N * buffer.N;
+  return static_cast<double>(op_count / time_in_ns());
+}
+
+// Throughput figure for double-precision complex HER: 4*N*N divided by
+// time_in_ns(), the same expression gflops_formula() reports as text.
+template<>
+double
+xHer<cl_double2>::
+gflops()
+{
+  const size_t op_count = 4 * buffer.N * buffer.N;
+  return static_cast<double>(op_count / time_in_ns());
+}
+
+// Human-readable form of the expression evaluated by gflops() for
+// single-precision complex HER.
+template<>
+std::string
+xHer<cl_float2>::
+gflops_formula()
+{
+  const std::string formula("4*N*N/time");
+  return formula;
+}
+
+// Human-readable form of the expression evaluated by gflops() for
+// double-precision complex HER.
+template<>
+std::string
+xHer<cl_double2>::
+gflops_formula()
+{
+  const std::string formula("4*N*N/time");
+  return formula;
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XHER_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xher2.hpp b/src/client/clfunc_xher2.hpp
new file mode 100644
index 0000000..27d95f3
--- /dev/null
+++ b/src/client/clfunc_xher2.hpp
@@ -0,0 +1,329 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHER2_HXX__
+#define CLBLAS_BENCHMARK_XHER2_HXX__
+
+#include "clfunc_common.hpp"
+
+// Parameters and storage for one HER2 benchmark run.  All fields are filled
+// in by xHer2<T>::setup_buffer().
+template <typename T>
+struct xHer2Buffer
+{
+	clblasOrder order;  // clblasRowMajor when order_option == 0, else clblasColumnMajor
+  clblasUplo uplo;    // clblasUpper when uplo_option == 0, else clblasLower
+  size_t N;           // problem size; taken from the M argument of setup_buffer()
+  T alpha;            // scalar passed to the library call
+  T* cpuX;            // host copy of X, N elements
+  cl_mem X;           // device buffer for X
+  size_t offx;        // element offset of X, taken from offB
+  int incx;           // stride of X; setup_buffer() fixes it to 1
+  T* cpuY;            // host copy of Y, N elements
+  cl_mem Y;           // device buffer for Y
+  size_t offy;        // element offset of Y, taken from offC
+  int incy;           // stride of Y; setup_buffer() fixes it to 1
+  T* cpuA;            // host copy of A, N * lda elements
+  cl_mem A;           // device buffer for A
+  size_t offa;        // element offset of A, taken from offA
+  size_t lda;         // leading dimension of A; N when the caller passes 0
+}; // struct buffer
+
+/**
+ * Benchmark driver for the Hermitian rank-2 update routines; call_func() is
+ * specialised further down for cl_float2 (clblasCher2) and cl_double2
+ * (clblasZher2).
+ *
+ * Typical use: setup_buffer(), initialize_cpu_buffer(),
+ * initialize_gpu_buffer(), then call_func() one or more times with
+ * reset_gpu_write_buffer() in between.  The round-trip entry points are
+ * placeholders and do nothing.
+ */
+template <typename T>
+class xHer2 : public clblasFunc
+{
+public:
+  xHer2(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clHer2", 0);
+
+    // Start from a known-empty state so the destructor is well defined even
+    // when setup_buffer() was never called.
+    buffer.cpuA = NULL;
+    buffer.cpuX = NULL;
+    buffer.cpuY = NULL;
+    buffer.A = NULL;
+    buffer.X = NULL;
+    buffer.Y = NULL;
+  }
+
+  ~xHer2()
+  {
+    // The host arrays are allocated with new T[...] in setup_buffer(), so
+    // they must be released with delete[]; Y is owned here as well.
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuX;
+    delete[] buffer.cpuY;
+    if (buffer.A != NULL)
+    {
+      OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    }
+    if (buffer.X != NULL)
+    {
+      OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer X");
+    }
+    if (buffer.Y != NULL)
+    {
+      OPENCL_V_THROW( clReleaseMemObject(buffer.Y), "releasing buffer Y");
+    }
+  }
+
+  // Generic throughput figure; the cl_float2 / cl_double2 specialisations
+  // below replace it with the 8*N*N variant.
+  double gflops()
+  {
+    return static_cast<double>((2 * buffer.N * buffer.N)/time_in_ns());
+  }
+
+  // Text form of the expression used by gflops().
+  std::string gflops_formula()
+  {
+    return "2*N*N/time";
+  }
+
+  // Translates the generic benchmark options into HER2 parameters and
+  // allocates host and device storage.  Defined after the class.
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+  void initialize_cpu_buffer();
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+  void call_func();
+
+  void read_gpu_buffer()
+  {
+    // Not implemented for this routine yet.
+  }
+
+  void roundtrip_func()
+  {
+    // Not implemented for this routine yet.
+  }
+
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {
+    // Not implemented for this routine yet.
+  }
+
+protected:
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer.alpha = makeScalar<T>(alpha);
+      // xHer2Buffer has no beta member, so beta is ignored here.
+  }
+
+private:
+  xHer2Buffer<T> buffer;
+};
+
+/**
+ * Translate the generic benchmark options into HER2 parameters and allocate
+ * the host arrays and device buffers.
+ *
+ * Only order_option, uplo_option, M, lda, offA, offB, offC and alpha are
+ * used:
+ *   - M becomes the problem size N,
+ *   - offA / offB / offC become the element offsets of A, X and Y,
+ *   - lda == 0 selects lda = N; a non-zero lda smaller than N prints an
+ *     error and terminates the process with exit(1).
+ *
+ * The device buffers include the element offsets, because the library call
+ * addresses A, X and Y starting at their offsets and initialize_gpu_buffer()
+ * writes A at byte offset offa * sizeof(T).  A is created read-write since
+ * HER2 updates A in place.
+ *
+ * Raises through OPENCL_V_THROW if a device buffer cannot be created.
+ */
+template <typename T>
+void xHer2<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offx = offB;
+  buffer.incx = 1; // if this changes, the size of X below must change too
+  buffer.offy = offC;
+  buffer.incy = 1; // if this changes, the size of Y below must change too
+  buffer.N = M;
+  buffer.order = (order_option == 0) ? clblasRowMajor : clblasColumnMajor;
+  buffer.uplo = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+  if (lda == 0)
+  {
+    buffer.lda = buffer.N;
+  }
+  else if (lda < buffer.N)
+  {
+    std::cerr << "lda:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.lda = lda;
+  }
+
+  buffer.cpuX = new T[buffer.N];
+  buffer.cpuY = new T[buffer.N];
+  buffer.cpuA = new T[buffer.N * buffer.lda];
+
+  cl_int err;
+  // A is the output of the rank-2 update, so kernels must be able to write it.
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offa + buffer.N * buffer.lda) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offx + buffer.N) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer X");
+
+  buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offy + buffer.N) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer Y");
+}
+
+// Fills the host copies of A (N * lda elements), X and Y (N elements each)
+// with pseudo-random values.  The generator is re-seeded with a fixed value
+// so every run produces the same data; A is filled first, then X and Y are
+// filled element by element, X[i] before Y[i].
+template <typename T>
+void xHer2<T>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  // Rows are stored back to back with stride lda, so walking the array
+  // linearly visits the elements in the same order as a row/column nest.
+  const size_t a_count = buffer.N * buffer.lda;
+  for (size_t idx = 0; idx < a_count; ++idx)
+  {
+    buffer.cpuA[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  for (size_t k = 0; k < buffer.N; ++k)
+  {
+    buffer.cpuX[k] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+    buffer.cpuY[k] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+}
+
+//template <>
+//void xHer2<cl_float2>::initialize_cpu_buffer()
+//{
+//  srand(10);
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      for (size_t j = 0; j < buffer.lda; ++j)
+//      {
+//          buffer.cpuA[i*buffer.lda+j].s[0] = static_cast<cl_float>(rand())/
+//            static_cast<cl_float>(RAND_MAX);
+//          buffer.cpuA[i*buffer.lda+j].s[1] = static_cast<cl_float>(rand())/
+//            static_cast<cl_float>(RAND_MAX);
+//      }
+//  }
+//
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      buffer.cpuX[i].s[0] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//      buffer.cpuX[i].s[1] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//      buffer.cpuY[i].s[0] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//      buffer.cpuY[i].s[1] = static_cast<cl_float>(rand())/
+//        static_cast<cl_float>(RAND_MAX);
+//  }
+//}
+//template <>
+//void xHer2<cl_double2>::initialize_cpu_buffer()
+//{
+//  srand(10);
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      for (size_t j = 0; j < buffer.lda; ++j)
+//      {
+//          buffer.cpuA[i*buffer.lda+j].s[0] = static_cast<cl_double>(rand())/
+//            static_cast<cl_double>(RAND_MAX);
+//          buffer.cpuA[i*buffer.lda+j].s[1] = static_cast<cl_double>(rand())/
+//            static_cast<cl_double>(RAND_MAX);
+//      }
+//  }
+//
+//  for (size_t i = 0; i < buffer.N; ++i)
+//  {
+//      buffer.cpuX[i].s[0] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//      buffer.cpuX[i].s[1] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//      buffer.cpuY[i].s[0] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//      buffer.cpuY[i].s[1] = static_cast<cl_double>(rand())/
+//        static_cast<cl_double>(RAND_MAX);
+//  }
+//}
+//
+
+/**
+ * Upload the host copies of A, X and Y to their device buffers with
+ * blocking writes.  A is written at byte offset offa * sizeof(T); X and Y
+ * are written from the start of their buffers.
+ *
+ * Raises through OPENCL_V_THROW if a write is rejected (for example when
+ * the offset plus size does not fit the device buffer).
+ */
+template <typename T>
+void xHer2<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.X, CL_TRUE, 0,
+                              buffer.N*sizeof(T),
+                              buffer.cpuX, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer X");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.Y, CL_TRUE, 0,
+                              buffer.N*sizeof(T),
+                              buffer.cpuY, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer Y");
+}
+
+/**
+ * Restore the device copy of A (the matrix the library call updates) from
+ * the host copy, using the same offset and size as initialize_gpu_buffer().
+ *
+ * Raises through OPENCL_V_THROW if the write is rejected.
+ */
+template <typename T>
+void xHer2<T>::reset_gpu_write_buffer()
+{
+  cl_int err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                                    buffer.offa * sizeof(T),
+                                    buffer.N * buffer.lda*sizeof(T),
+                                    buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer A");
+}
+
+// Times one single-precision complex HER2: the timer brackets the
+// clblasCher2 enqueue and the wait on its completion event.
+template <>
+void xHer2<cl_float2>::call_func()
+{
+  timer.Start(timer_id);
+
+  clblasCher2(buffer.order, buffer.uplo, buffer.N,
+              buffer.alpha,
+              buffer.X, buffer.offx, buffer.incx,
+              buffer.Y, buffer.offy, buffer.incy,
+              buffer.A, buffer.offa, buffer.lda,
+              1, &queue_,   // a single command queue
+              0, NULL,      // no events to wait on
+              &event_);
+
+  // Wait for the enqueued work before stopping the clock.
+  clWaitForEvents(1, &event_);
+
+  timer.Stop(timer_id);
+}
+
+// Times one double-precision complex HER2: the timer brackets the
+// clblasZher2 enqueue and the wait on its completion event.
+template <>
+void xHer2<cl_double2>::call_func()
+{
+  timer.Start(timer_id);
+
+  clblasZher2(buffer.order, buffer.uplo, buffer.N,
+              buffer.alpha,
+              buffer.X, buffer.offx, buffer.incx,
+              buffer.Y, buffer.offy, buffer.incy,
+              buffer.A, buffer.offa, buffer.lda,
+              1, &queue_,   // a single command queue
+              0, NULL,      // no events to wait on
+              &event_);
+
+  // Wait for the enqueued work before stopping the clock.
+  clWaitForEvents(1, &event_);
+
+  timer.Stop(timer_id);
+}
+
+// Throughput figure for single-precision complex HER2: 8*N*N divided by
+// time_in_ns(), the same expression gflops_formula() reports as text.
+template<>
+double
+xHer2<cl_float2>::
+gflops()
+{
+  const size_t op_count = 8 * buffer.N * buffer.N;
+  return static_cast<double>(op_count / time_in_ns());
+}
+
+// Throughput figure for double-precision complex HER2: 8*N*N divided by
+// time_in_ns(), the same expression gflops_formula() reports as text.
+template<>
+double
+xHer2<cl_double2>::
+gflops()
+{
+  const size_t op_count = 8 * buffer.N * buffer.N;
+  return static_cast<double>(op_count / time_in_ns());
+}
+
+// Human-readable form of the expression evaluated by gflops() for
+// single-precision complex HER2.
+template<>
+std::string
+xHer2<cl_float2>::
+gflops_formula()
+{
+  const std::string formula("8*N*N/time");
+  return formula;
+}
+
+// Human-readable form of the expression evaluated by gflops() for
+// double-precision complex HER2.
+template<>
+std::string
+xHer2<cl_double2>::
+gflops_formula()
+{
+  const std::string formula("8*N*N/time");
+  return formula;
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XHER2_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xsymm.hpp b/src/client/clfunc_xsymm.hpp
new file mode 100644
index 0000000..e9fe981
--- /dev/null
+++ b/src/client/clfunc_xsymm.hpp
@@ -0,0 +1,660 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XSYMM_HXX__
+#define CLBLAS_BENCHMARK_XSYMM_HXX__
+
+#include "clfunc_common.hpp"
+
+// Parameters and storage for one SYMM benchmark run.  Filled in by
+// xSymm<T>::setup_buffer() or xSymm<T>::roundtrip_setup_buffer().
+template <typename T>
+struct xSymmBuffer
+{
+  clblasOrder order;     // clblasRowMajor when order_option == 0, else clblasColumnMajor
+  clblasSide side;       // clblasLeft when side_option == 0, else clblasRight
+  clblasUplo uplo;       // clblasUpper when uplo_option == 0, else clblasLower
+  size_t M;              // M as passed by the caller
+  size_t N;              // N as passed by the caller
+  T alpha;               // scalar alpha
+  T* cpuA;               // host copy of A, a_num_vectors * lda elements
+  size_t a_num_vectors;  // M for the left side, N for the right side
+  cl_mem A;              // device buffer for A
+  size_t offa;           // element offset of A
+  size_t lda;            // leading dimension of A; a_num_vectors when 0 is passed
+  T* cpuB;               // host copy of B, N * ldb elements
+  cl_mem B;              // device buffer for B
+  size_t offb;           // element offset of B
+  size_t ldb;            // leading dimension of B; M when 0 is passed
+  T beta;                // scalar beta
+  T* cpuC;               // host copy of C, N * ldc elements
+  cl_mem C;              // device buffer for C
+  size_t offc;           // element offset of C
+  size_t ldc;            // leading dimension of C; M when 0 is passed
+}; // struct buffer
+
+/**
+ * Benchmark driver for the symmetric matrix-matrix multiply routines.
+ * call_func() and roundtrip_func() are specialised per element type after
+ * the class.
+ *
+ * Two usage modes are visible:
+ *   - kernel timing: setup_buffer(), initialize_cpu_buffer(),
+ *     initialize_gpu_buffer(), call_func(), read_gpu_buffer();
+ *   - round-trip timing: roundtrip_setup_buffer() prepares only the host
+ *     side, the typed roundtrip_func() specialisations create the device
+ *     buffers themselves.
+ */
+template <typename T>
+class xSymm : public clblasFunc
+{
+public:
+  xSymm(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clSymm", 0);
+
+    // Start from a known-empty state so the destructor is well defined even
+    // when no setup function (or no roundtrip_func()) ever ran.
+    buffer.cpuA = NULL;
+    buffer.cpuB = NULL;
+    buffer.cpuC = NULL;
+    buffer.A = NULL;
+    buffer.B = NULL;
+    buffer.C = NULL;
+  }
+
+  ~xSymm()
+  {
+    // The host arrays are allocated with new T[...], so they must be
+    // released with delete[] (plain delete is undefined).
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuB;
+    delete[] buffer.cpuC;
+    if (buffer.A != NULL)
+    {
+      OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    }
+    if (buffer.B != NULL)
+    {
+      OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
+    }
+    if (buffer.C != NULL)
+    {
+      OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
+    }
+  }
+
+  // 2*M*M*N (left side) or 2*N*N*M (right side) divided by time_in_ns().
+  double gflops()
+  {
+    if (buffer.side == clblasLeft)
+      return static_cast<double>((2 * buffer.M * buffer.M * buffer.N)/time_in_ns());
+    else
+      return static_cast<double>((2 * buffer.N * buffer.N * buffer.M)/time_in_ns());
+  }
+
+  // Text form of the expression used by gflops().
+  std::string gflops_formula()
+  {
+    if (buffer.side == clblasLeft)
+      return "2*M*M*N/time";
+    else
+      return "2*N*N*M/time";
+  }
+
+  // Full setup (host arrays and device buffers).  Defined after the class.
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+  void initialize_cpu_buffer();
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+  void call_func();
+
+  // Blocking read of C back into cpuC, starting at byte offset
+  // offc * sizeof(T).  Raises through OPENCL_V_THROW if the read is rejected.
+  void read_gpu_buffer()
+  {
+    cl_int err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+                                     buffer.offc * sizeof(T),
+                                     buffer.ldc * buffer.N * sizeof(T),
+                                     buffer.cpuC, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "reading buffer C");
+  }
+
+  // Fallback for element types without a dedicated specialisation.
+  void roundtrip_func()
+  {
+    std::cout << "xSymm::roundtrip_func\n";
+  }
+
+  // Host-only variant of setup_buffer(): stores the parameters, validates
+  // the leading dimensions (printing a message and calling exit(1) when one
+  // is too small) and allocates the host arrays.  No device buffer is
+  // created here.
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offB, size_t offC,
+                      double alpha, double beta)
+  {
+    initialize_scalars(alpha, beta);
+    buffer.offa = offA;
+    buffer.offb = offB;
+    buffer.offc = offC;
+    buffer.M = M;
+    buffer.N = N;
+    buffer.order = (order_option == 0) ? clblasRowMajor : clblasColumnMajor;
+    buffer.uplo = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+    // A has M vectors when it multiplies from the left, N from the right.
+    const bool left_side = (side_option == 0);
+    const size_t a_dim = left_side ? M : N;
+    buffer.side = left_side ? clblasLeft : clblasRight;
+    buffer.a_num_vectors = a_dim;
+    if (lda == 0)
+    {
+      buffer.lda = a_dim;
+    }
+    else if (lda < a_dim)
+    {
+      std::cerr << "lda:wrong size\n";
+      exit(1);
+    }
+    else
+    {
+      buffer.lda = lda;
+    }
+
+    if (ldb == 0)
+    {
+      buffer.ldb = buffer.M;
+    }
+    else if (ldb < buffer.M)
+    {
+      std::cerr << "ldb:wrong size\n";
+      exit(1);
+    }
+    else
+    {
+      buffer.ldb = ldb;
+    }
+
+    if (ldc == 0)
+    {
+      buffer.ldc = buffer.M;
+    }
+    else if (ldc < buffer.M)
+    {
+      std::cerr << "ldc:wrong size\n";
+      exit(1);
+    }
+    else
+    {
+      buffer.ldc = ldc;
+    }
+
+    buffer.cpuB = new T[buffer.N * buffer.ldb];
+    buffer.cpuC = new T[buffer.N * buffer.ldc];
+    buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
+  }
+
+protected:
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer.alpha = makeScalar<T>(alpha);
+      buffer.beta = makeScalar<T>(beta);
+  }
+
+private:
+  xSymmBuffer<T> buffer;
+};
+
+/**
+ * Store the SYMM parameters, validate the leading dimensions and allocate
+ * the host arrays and device buffers.
+ *
+ *   - side_option == 0 selects the left side (A has M vectors), anything
+ *     else the right side (A has N vectors);
+ *   - a leading dimension of 0 selects the default (a_num_vectors for lda,
+ *     M for ldb and ldc); a non-zero value below that minimum prints an
+ *     error and terminates the process with exit(1).
+ *
+ * The device buffers include the element offsets, because the library call
+ * addresses A, B and C starting at offa / offb / offc, initialize_gpu_buffer()
+ * writes A at byte offset offa * sizeof(T) and read_gpu_buffer() reads C at
+ * byte offset offc * sizeof(T).
+ *
+ * NOTE(review): ldb/ldc are validated against M and B/C are sized N * ld,
+ * which looks like a column-major assumption; confirm the row-major case.
+ *
+ * Raises through OPENCL_V_THROW if a device buffer cannot be created.
+ */
+template <typename T>
+void xSymm<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offb = offB;
+  buffer.offc = offC;
+  buffer.M = M;
+  buffer.N = N;
+  buffer.order = (order_option == 0) ? clblasRowMajor : clblasColumnMajor;
+  buffer.uplo = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+  // A has M vectors when it multiplies from the left, N from the right.
+  const bool left_side = (side_option == 0);
+  const size_t a_dim = left_side ? M : N;
+  buffer.side = left_side ? clblasLeft : clblasRight;
+  buffer.a_num_vectors = a_dim;
+  if (lda == 0)
+  {
+    buffer.lda = a_dim;
+  }
+  else if (lda < a_dim)
+  {
+    std::cerr << "lda:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.lda = lda;
+  }
+
+  if (ldb == 0)
+  {
+    buffer.ldb = buffer.M;
+  }
+  else if (ldb < buffer.M)
+  {
+    std::cerr << "ldb:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.ldb = ldb;
+  }
+
+  if (ldc == 0)
+  {
+    buffer.ldc = buffer.M;
+  }
+  else if (ldc < buffer.M)
+  {
+    std::cerr << "ldc:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.ldc = ldc;
+  }
+
+  buffer.cpuB = new T[buffer.N * buffer.ldb];
+  buffer.cpuC = new T[buffer.N * buffer.ldc];
+  buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
+
+  cl_int err;
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                            (buffer.offa + buffer.a_num_vectors * buffer.lda) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offb + buffer.N * buffer.ldb) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer B");
+
+  buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offc + buffer.N * buffer.ldc) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer C");
+}
+
+// Fills the host copies of A, B and C with pseudo-random values, in that
+// order.  The generator is re-seeded with a fixed value so every run
+// produces the same data.
+template <typename T>
+void xSymm<T>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  // Each matrix is stored as consecutive vectors of length ld, so a linear
+  // walk visits the elements in the same order as a two-level loop nest.
+  const size_t a_count = buffer.a_num_vectors * buffer.lda;
+  for (size_t idx = 0; idx < a_count; ++idx)
+  {
+    buffer.cpuA[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  const size_t b_count = buffer.N * buffer.ldb;
+  for (size_t idx = 0; idx < b_count; ++idx)
+  {
+    buffer.cpuB[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  const size_t c_count = buffer.N * buffer.ldc;
+  for (size_t idx = 0; idx < c_count; ++idx)
+  {
+    buffer.cpuC[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+}
+
+/**
+ * Upload the host copies of A, B and C to their device buffers with
+ * blocking writes.  A is written at byte offset offa * sizeof(T); B and C
+ * are written from the start of their buffers.
+ *
+ * Raises through OPENCL_V_THROW if a write is rejected (for example when
+ * the offset plus size does not fit the device buffer).
+ */
+template <typename T>
+void xSymm<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.a_num_vectors * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, 0,
+                              buffer.ldb*buffer.N*sizeof(T),
+                              buffer.cpuB, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer B");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, 0,
+                              buffer.ldc*buffer.N*sizeof(T),
+                              buffer.cpuC, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer C");
+}
+
+/**
+ * Restore the device copy of C (the matrix the library call writes) from
+ * the host copy, using the same offset and size as initialize_gpu_buffer().
+ *
+ * Raises through OPENCL_V_THROW if the write is rejected.
+ */
+template <typename T>
+void xSymm<T>::reset_gpu_write_buffer()
+{
+  cl_int err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, 0,
+                                    buffer.ldc*buffer.N*sizeof(T),
+                                    buffer.cpuC, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer C");
+}
+
+// Times one single-precision SYMM: the timer brackets the clblasSsymm
+// enqueue and the wait on its completion event.
+template <>
+void xSymm<cl_float>::call_func()
+{
+  timer.Start(timer_id);
+
+  clblasSsymm(buffer.order, buffer.side, buffer.uplo,
+              buffer.M, buffer.N,
+              buffer.alpha,
+              buffer.A, buffer.offa, buffer.lda,
+              buffer.B, buffer.offb, buffer.ldb,
+              buffer.beta,
+              buffer.C, buffer.offc, buffer.ldc,
+              1, &queue_,   // a single command queue
+              0, NULL,      // no events to wait on
+              &event_);
+
+  // Wait for the enqueued work before stopping the clock.
+  clWaitForEvents(1, &event_);
+
+  timer.Stop(timer_id);
+}
+
+/**
+ * Times a full round trip for single-precision SYMM: device buffer
+ * creation, upload of A, B and C, the clblasSsymm call and the blocking
+ * read-back of C into cpuC all happen between timer.Start and timer.Stop.
+ *
+ * The buffers are sized to include the element offsets, because A is
+ * written at byte offset offa, C is read at byte offset offc and the
+ * library call addresses all three matrices starting at their offsets.
+ *
+ * Raises through OPENCL_V_THROW if buffer creation or a transfer fails.
+ *
+ * NOTE(review): every call creates three new device buffers and overwrites
+ * the handles stored in `buffer`; nothing visible here releases the
+ * previous ones.  Confirm whether repeated calls are expected.
+ */
+template <>
+void xSymm<cl_float>::roundtrip_func()
+{
+  timer.Start(timer_id);
+  cl_int err;
+
+  // Element counts of the three matrices as laid out on the host.
+  const size_t a_elems = buffer.a_num_vectors * buffer.lda;
+  const size_t b_elems = buffer.N * buffer.ldb;
+  const size_t c_elems = buffer.N * buffer.ldc;
+
+  //set up buffer
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                            (buffer.offa + a_elems) * sizeof(cl_float),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offb + b_elems) * sizeof(cl_float),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer B");
+  buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offc + c_elems) * sizeof(cl_float),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer C");
+
+  //initialize gpu buffer
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(cl_float),
+                              a_elems * sizeof(cl_float),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, 0,
+                              b_elems * sizeof(cl_float),
+                              buffer.cpuB, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer B");
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, 0,
+                              c_elems * sizeof(cl_float),
+                              buffer.cpuC, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer C");
+
+  //call func
+  clblasSsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,NULL);
+
+  //read gpu buffer
+  err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+                            buffer.offc * sizeof(cl_float),
+                            c_elems * sizeof(cl_float),
+                            buffer.cpuC, 0, NULL, &event_);
+  OPENCL_V_THROW(err, "reading buffer C");
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one double-precision SYMM: the timer brackets the clblasDsymm
+// enqueue and the wait on its completion event.
+template <>
+void xSymm<cl_double>::call_func()
+{
+  timer.Start(timer_id);
+
+  clblasDsymm(buffer.order, buffer.side, buffer.uplo,
+              buffer.M, buffer.N,
+              buffer.alpha,
+              buffer.A, buffer.offa, buffer.lda,
+              buffer.B, buffer.offb, buffer.ldb,
+              buffer.beta,
+              buffer.C, buffer.offc, buffer.ldc,
+              1, &queue_,   // a single command queue
+              0, NULL,      // no events to wait on
+              &event_);
+
+  // Wait for the enqueued work before stopping the clock.
+  clWaitForEvents(1, &event_);
+
+  timer.Stop(timer_id);
+}
+
+/**
+ * Times a full round trip for double-precision SYMM: device buffer
+ * creation, upload of A, B and C, the clblasDsymm call and the blocking
+ * read-back of C into cpuC all happen between timer.Start and timer.Stop.
+ *
+ * The buffers are sized to include the element offsets, because A is
+ * written at byte offset offa, C is read at byte offset offc and the
+ * library call addresses all three matrices starting at their offsets.
+ *
+ * Raises through OPENCL_V_THROW if buffer creation or a transfer fails.
+ *
+ * NOTE(review): every call creates three new device buffers and overwrites
+ * the handles stored in `buffer`; nothing visible here releases the
+ * previous ones.  Confirm whether repeated calls are expected.
+ */
+template <>
+void xSymm<cl_double>::roundtrip_func()
+{
+  timer.Start(timer_id);
+  cl_int err;
+
+  // Element counts of the three matrices as laid out on the host.
+  const size_t a_elems = buffer.a_num_vectors * buffer.lda;
+  const size_t b_elems = buffer.N * buffer.ldb;
+  const size_t c_elems = buffer.N * buffer.ldc;
+
+  //set up buffer
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                            (buffer.offa + a_elems) * sizeof(cl_double),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offb + b_elems) * sizeof(cl_double),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer B");
+  buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.offc + c_elems) * sizeof(cl_double),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer C");
+
+  //initialize gpu buffer
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(cl_double),
+                              a_elems * sizeof(cl_double),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, 0,
+                              b_elems * sizeof(cl_double),
+                              buffer.cpuB, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer B");
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, 0,
+                              c_elems * sizeof(cl_double),
+                              buffer.cpuC, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer C");
+
+  //call func
+  clblasDsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,NULL);
+
+  //read gpu buffer
+  err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+                            buffer.offc * sizeof(cl_double),
+                            c_elems * sizeof(cl_double),
+                            buffer.cpuC, 0, NULL, &event_);
+  OPENCL_V_THROW(err, "reading buffer C");
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one single-precision complex SYMM call: starts the timer, enqueues
+// clblasCsymm on queue_, waits on event_, then stops the timer.
+template <>
+void xSymm<cl_float2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasCsymm(buffer.order, buffer.side, buffer.uplo,
+              buffer.M, buffer.N,
+              buffer.alpha,
+              buffer.A, buffer.offa, buffer.lda,
+              buffer.B, buffer.offb, buffer.ldb,
+              buffer.beta,
+              buffer.C, buffer.offc, buffer.ldc,
+              1, &queue_,
+              0, NULL, &event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times a complete single-precision complex SYMM round trip: device buffer
+// creation, upload of A, B and C, the clblasCsymm call, and the blocking
+// read-back of C into buffer.cpuC.
+//
+// Every device buffer is sized to include its element offset (offa/offb/offc)
+// and B/C are uploaded at the same offsets that clblasCsymm is told to use, so
+// the transfers and the kernel address the same in-bounds region.  OpenCL
+// status codes are checked instead of being overwritten unread.
+//
+// NOTE(review): whatever cl_mem handles buffer.A/B/C held before this call are
+// overwritten without being released here -- confirm who owns them.
+template <>
+void xSymm<cl_float2>::roundtrip_func()
+{
+  timer.Start(timer_id);
+
+  const size_t elem    = sizeof(cl_float2);
+  const size_t a_bytes = buffer.a_num_vectors * buffer.lda * elem;
+  const size_t b_bytes = buffer.N * buffer.ldb * elem;
+  const size_t c_bytes = buffer.N * buffer.ldc * elem;
+  const size_t a_skip  = buffer.offa * elem;
+  const size_t b_skip  = buffer.offb * elem;
+  const size_t c_skip  = buffer.offc * elem;
+  cl_int err;
+
+  //set up buffer
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, a_skip + a_bytes,
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, b_skip + b_bytes,
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer B");
+  buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, c_skip + c_bytes,
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer C");
+
+  //initialize gpu buffer
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE, a_skip, a_bytes,
+                             buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, b_skip, b_bytes,
+                             buffer.cpuB, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer B");
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, c_skip, c_bytes,
+                             buffer.cpuC, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer C");
+
+  //call func
+  clblasCsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,NULL);
+
+  //read gpu buffer
+  err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE, c_skip, c_bytes,
+                            buffer.cpuC, 0, NULL, &event_);
+  OPENCL_V_THROW(err, "reading buffer C");
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one double-precision complex SYMM call: starts the timer, enqueues
+// clblasZsymm on queue_, waits on event_, then stops the timer.
+template <>
+void xSymm<cl_double2>::call_func()
+{
+  timer.Start(timer_id);
+  clblasZsymm(buffer.order, buffer.side, buffer.uplo,
+              buffer.M, buffer.N,
+              buffer.alpha,
+              buffer.A, buffer.offa, buffer.lda,
+              buffer.B, buffer.offb, buffer.ldb,
+              buffer.beta,
+              buffer.C, buffer.offc, buffer.ldc,
+              1, &queue_,
+              0, NULL, &event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times a complete double-precision complex SYMM round trip: device buffer
+// creation, upload of A, B and C, the clblasZsymm call, and the blocking
+// read-back of C into buffer.cpuC.
+//
+// Every device buffer is sized to include its element offset (offa/offb/offc)
+// and B/C are uploaded at the same offsets that clblasZsymm is told to use, so
+// the transfers and the kernel address the same in-bounds region.  OpenCL
+// status codes are checked instead of being overwritten unread.
+//
+// NOTE(review): whatever cl_mem handles buffer.A/B/C held before this call are
+// overwritten without being released here -- confirm who owns them.
+template <>
+void xSymm<cl_double2>::roundtrip_func()
+{
+  timer.Start(timer_id);
+
+  const size_t elem    = sizeof(cl_double2);
+  const size_t a_bytes = buffer.a_num_vectors * buffer.lda * elem;
+  const size_t b_bytes = buffer.N * buffer.ldb * elem;
+  const size_t c_bytes = buffer.N * buffer.ldc * elem;
+  const size_t a_skip  = buffer.offa * elem;
+  const size_t b_skip  = buffer.offb * elem;
+  const size_t c_skip  = buffer.offc * elem;
+  cl_int err;
+
+  //set up buffer
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, a_skip + a_bytes,
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, b_skip + b_bytes,
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer B");
+  buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, c_skip + c_bytes,
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer C");
+
+  //initialize gpu buffer
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE, a_skip, a_bytes,
+                             buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, b_skip, b_bytes,
+                             buffer.cpuB, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer B");
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, c_skip, c_bytes,
+                             buffer.cpuC, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer C");
+
+  //call func
+  clblasZsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,NULL);
+
+  //read gpu buffer
+  err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE, c_skip, c_bytes,
+                            buffer.cpuC, 0, NULL, &event_);
+  OPENCL_V_THROW(err, "reading buffer C");
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Complex single-precision SYMM rate: 8*M*M*N operations when A multiplies
+// from the left, 8*N*N*M otherwise, divided by time_in_ns().
+template<>
+double
+xSymm<cl_float2>::
+gflops()
+{
+  return static_cast<double>(
+      (buffer.side == clblasLeft ? 8 * buffer.M * buffer.M * buffer.N
+                                 : 8 * buffer.N * buffer.N * buffer.M)
+      / time_in_ns());
+}
+
+// Complex double-precision SYMM rate: 8*M*M*N operations when A multiplies
+// from the left, 8*N*N*M otherwise, divided by time_in_ns().
+template<>
+double
+xSymm<cl_double2>::
+gflops()
+{
+  return static_cast<double>(
+      (buffer.side == clblasLeft ? 8 * buffer.M * buffer.M * buffer.N
+                                 : 8 * buffer.N * buffer.N * buffer.M)
+      / time_in_ns());
+}
+
+// Textual form of the formula used by gflops() for the current side.
+template<>
+std::string
+xSymm<cl_float2>::
+gflops_formula()
+{
+  return (buffer.side == clblasLeft) ? "8*M*M*N/time" : "8*N*N*M/time";
+}
+
+// Textual form of the formula used by gflops() for the current side.
+template<>
+std::string
+xSymm<cl_double2>::
+gflops_formula()
+{
+  return (buffer.side == clblasLeft) ? "8*M*M*N/time" : "8*N*N*M/time";
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xsymv.hpp b/src/client/clfunc_xsymv.hpp
new file mode 100644
index 0000000..625c7ec
--- /dev/null
+++ b/src/client/clfunc_xsymv.hpp
@@ -0,0 +1,256 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XSYMV_HXX__
+#define CLBLAS_BENCHMARK_XSYMV_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xSymvBuffer  // host and device state used by the xSymv benchmark class
+{
+    clblasOrder order_;  // NOTE(review): xSymv assigns and passes a bare order_, not this field -- confirm it is unused
+    size_t n_;  // matrix order; set from the N argument of setup_buffer
+    size_t lda_;  // leading dimension of A; N when setup_buffer is given lda == 0
+    size_t offA_;  // element offset of A inside buf_a_
+    size_t a_num_vectors_;  // number of lda_-long vectors stored for A (set to N)
+    clblasUplo uplo_;  // clblasUpper or clblasLower
+    T* a_;  // host copy of A, lda_ * a_num_vectors_ elements (new T[])
+    T* x_;  // host copy of x, n_ elements (new T[])
+    T* y_;  // host copy of y, n_ elements (new T[])
+    cl_mem buf_a_;  // device buffer holding A
+    cl_mem buf_x_;  // device buffer holding x
+    cl_mem buf_y_;  // device buffer holding y
+    T alpha_;  // scalar passed as alpha to clblas?symv
+    T beta_;  // scalar passed as beta to clblas?symv
+}; // struct xSymvBuffer
+
+// Benchmark driver for the symmetric matrix-vector product (clblas?symv).
+// setup_buffer() sizes and allocates host and device storage,
+// initialize_cpu_buffer()/initialize_gpu_buffer() fill and upload it, and the
+// per-type call_func() specializations time the clBLAS call itself.
+template <typename T>
+class xSymv : public clblasFunc
+{
+public:
+    xSymv(StatisticalTimer& _timer, cl_device_type devType) :
+        clblasFunc(_timer, devType)
+    {
+        timer.getUniqueID("clSymv", 0);
+    }
+
+    // Frees the host arrays and releases the device buffers created by
+    // setup_buffer().
+    ~xSymv()
+    {
+        // The host arrays are allocated with new T[], so delete[] is required;
+        // plain delete on them is undefined behaviour.
+        delete[] buffer_.a_;
+        delete[] buffer_.x_;
+        delete[] buffer_.y_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_),
+                        "releasing buffer X");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_y_),
+                        "releasing buffer Y");
+    }
+
+    // Generic version does nothing; float and double are specialized below
+    // the class.
+    void call_func()
+    {
+    }
+
+    // 2*N*N operations divided by the measured time in nanoseconds.
+    double gflops()
+    {
+        return (2.0*buffer_.n_*buffer_.n_)/time_in_ns();
+    }
+
+    std::string gflops_formula()
+    {
+        return "2.0*N*N/time";
+    }
+
+    // Records the problem description and allocates host arrays and device
+    // buffers.  Only order_option, uplo_option, N, lda, offA, alpha and beta
+    // are used; the remaining arguments exist for the common interface.
+    // Exits the process when a non-zero lda smaller than N is requested and
+    // throws (via OPENCL_V_THROW) when a device buffer cannot be created.
+    void setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_3(side_option, diag_option, transB_option);
+        DUMMY_ARGS_USAGE_4(M, K, ldb, ldc);
+        DUMMY_ARGS_USAGE_3(transA_option, offBX, offCY);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.n_ = N;
+        buffer_.a_num_vectors_ = N;
+        buffer_.offA_ = offA;
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+
+        buffer_.x_ = new T[buffer_.n_];
+        buffer_.y_ = new T[buffer_.n_];
+
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+        }
+
+        // lda == 0 means "use the tightest leading dimension".
+        if (lda == 0)
+        {
+            buffer_.lda_ = N;
+        }
+        else if (lda < N)
+        {
+            std::cerr << "lda:wrong size\n";
+            exit(1);
+        }
+        else
+        {
+            buffer_.lda_ = lda;
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+
+        cl_int err;
+        // A's device buffer leaves room for the offA_ leading elements.
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer A");
+
+        buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        buffer_.n_*sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer X");
+
+        buffer_.buf_y_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        buffer_.n_*sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer Y");
+    }
+
+    // Fills the host copies of A, x and y with rand()/RAND_MAX values after
+    // seeding rand() with the fixed value 10.
+    void initialize_cpu_buffer()
+    {
+        srand(10);
+        for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+        {
+            for (size_t j = 0; j < buffer_.lda_; ++j)
+            {
+                buffer_.a_[i*buffer_.lda_+j] = static_cast<T>(rand()) /
+                                               static_cast<T>(RAND_MAX);
+            }
+        }
+
+        for (size_t i = 0; i < buffer_.n_; ++i)
+        {
+            buffer_.x_[i] = static_cast<T>(rand()) /
+                            static_cast<T>(RAND_MAX);
+            buffer_.y_[i] = static_cast<T>(rand()) /
+                            static_cast<T>(RAND_MAX);
+        }
+
+    }
+
+    // Blocking upload of A (at element offset offA_), x and y to the device.
+    // Throws via OPENCL_V_THROW if any transfer fails.
+    void initialize_gpu_buffer()
+    {
+        cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer A");
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_x_, CL_TRUE, 0,
+                                   buffer_.n_*sizeof(T),
+                                   buffer_.x_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer X");
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_y_, CL_TRUE, 0,
+                                   buffer_.n_*sizeof(T),
+                                   buffer_.y_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer Y");
+    }
+
+    // Re-uploads the host copy of y, the only buffer created read-write.
+    void reset_gpu_write_buffer()
+    {
+        cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_y_, CL_TRUE, 0,
+                                   buffer_.n_*sizeof(T),
+                                   buffer_.y_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "resetting buffer Y");
+    }
+
+    // Placeholder: not implemented yet.
+    void read_gpu_buffer()
+    {
+        //cl_int err;
+        //to-do need to fill up
+    }
+
+    // Placeholder: not implemented yet.
+    void roundtrip_func()
+    {//to-do need to fill up
+    }
+
+    // Placeholder: not implemented yet; all arguments are ignored.
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {}
+
+protected:
+    // Converts the command-line scalars to the element type T.
+    void initialize_scalars(double alpha, double beta)
+    {
+        buffer_.alpha_ = static_cast<T>(alpha);
+        buffer_.beta_ = static_cast<T>(beta);
+    }
+
+private:
+    xSymvBuffer<T> buffer_;
+
+}; // class xsymv
+
+// float specialization: times a single clblasSsymv call, waiting on event_
+// before the timer is stopped.
+template<>
+void xSymv<float>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasSsymv(order_, buffer_.uplo_, buffer_.n_,
+                buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_x_, 0, 1,
+                buffer_.beta_,
+                buffer_.buf_y_, 0, 1,
+                1, &queue_,
+                0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// double specialization: times a single clblasDsymv call, waiting on event_
+// before the timer is stopped.
+template<>
+void xSymv<double>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasDsymv(order_, buffer_.uplo_, buffer_.n_,
+                buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_x_, 0, 1,
+                buffer_.beta_,
+                buffer_.buf_y_, 0, 1,
+                1, &queue_,
+                0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XSYMV_HXX__
diff --git a/src/client/clfunc_xsyr.hpp b/src/client/clfunc_xsyr.hpp
new file mode 100644
index 0000000..172032c
--- /dev/null
+++ b/src/client/clfunc_xsyr.hpp
@@ -0,0 +1,224 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XSYR_HXX__
+#define CLBLAS_BENCHMARK_XSYR_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xSyrBuffer  // host and device state used by the xSyr benchmark class
+{
+	clblasOrder order;  // clblasRowMajor or clblasColumnMajor
+  clblasUplo uplo;  // clblasUpper or clblasLower
+  size_t N;  // matrix order; xSyr::setup_buffer fills it from its M argument
+  T alpha;  // scalar passed as alpha to clblas?syr
+  T* cpuX;  // host copy of x, N elements (new T[])
+  cl_mem X;  // device buffer holding x
+  size_t offx;  // element offset of x passed to clblas?syr
+  int incx;  // increment of x; setup_buffer sets it to 1
+  T* cpuA;  // host copy of A, N * lda elements (new T[])
+  cl_mem A;  // device buffer holding A
+  size_t offa;  // element offset of A inside the device buffer
+  size_t lda;  // leading dimension of A; N when setup_buffer is given lda == 0
+}; // struct xSyrBuffer
+
+// Benchmark driver for the symmetric rank-1 update (clblas?syr).  The buffer
+// setup/upload members are defined as templates after the class; call_func()
+// is specialized per element type.
+template <typename T>
+class xSyr : public clblasFunc
+{
+public:
+  xSyr(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clSyr", 0);
+  }
+
+  // Frees the host arrays and releases the device buffers.
+  ~xSyr()
+  {
+    // Both arrays come from new T[] in setup_buffer(), so delete[] is
+    // required; plain delete on them is undefined behaviour.
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuX;
+    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer X");
+  }
+
+  // N*N operations divided by the measured time in nanoseconds.
+  double gflops()
+  {
+    return static_cast<double>((buffer.N * buffer.N)/time_in_ns());
+  }
+
+  std::string gflops_formula()
+  {
+    return "N*N/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+  void initialize_cpu_buffer();
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+  void call_func();
+
+  // Placeholder: not implemented yet.
+  void read_gpu_buffer()
+  {
+    //cl_int err;
+    //to-do need to fill up
+  }
+
+  // Placeholder: not implemented yet.
+  void roundtrip_func()
+  {//to-do need to fill up
+  }
+
+  // Placeholder: not implemented yet; all arguments are ignored.
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {}
+
+protected:
+  // SYR has no beta; only alpha is stored.
+  void initialize_scalars(double alpha, double beta)
+  {
+    buffer.alpha = alpha;
+  }
+
+private:
+  xSyrBuffer<T> buffer;
+};
+
+// Records the problem description and allocates host arrays and device
+// buffers for SYR.  Only order_option, uplo_option, M, lda, offA, offB and
+// alpha are consulted: the matrix order is taken from M (not N) and offB is
+// the offset of x.  Exits the process when a non-zero lda smaller than the
+// matrix order is requested; throws via OPENCL_V_THROW when a device buffer
+// cannot be created.
+template <typename T>
+void xSyr<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offx = offB;
+  buffer.incx = 1;
+  buffer.N = M;
+  buffer.order = (order_option == 0) ? clblasRowMajor : clblasColumnMajor;
+  buffer.uplo = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+  // lda == 0 means "use the tightest leading dimension".
+  if (lda == 0)
+  {
+    buffer.lda = buffer.N;
+  }
+  else if (lda < buffer.N)
+  {
+    std::cerr << "lda:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.lda = lda;
+  }
+
+  buffer.cpuX = new T[buffer.N];
+  buffer.cpuA = new T[buffer.N * buffer.lda];
+
+  cl_int err;
+  // SYR updates A in place, so the kernel must be allowed to write it; the
+  // buffer also leaves room for the offa leading elements that both the
+  // upload and the clBLAS call skip.
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N * buffer.lda + buffer.offa) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  // Room for offx leading elements, since the clBLAS call reads x from offx.
+  buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N + buffer.offx) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer X");
+}
+
+// Seeds rand() with the fixed value 10, then fills the host copies of A
+// (N * lda elements, in storage order) and x (N elements) with
+// random<T>(UPPER_BOUND<T>()) / randomScale<T>() values.
+template <typename T>
+void xSyr<T>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  const size_t a_count = buffer.N * buffer.lda;
+  for (size_t k = 0; k < a_count; ++k)
+  {
+    buffer.cpuA[k] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  T* const x_end = buffer.cpuX + buffer.N;
+  for (T* x = buffer.cpuX; x != x_end; ++x)
+  {
+    *x = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+}
+
+// Blocking upload of A (at element offset offa) and x (at offset 0) to the
+// device.  Throws via OPENCL_V_THROW if a transfer fails instead of silently
+// discarding the status.
+//
+// NOTE(review): x is uploaded at offset 0 while call_func() passes
+// buffer.offx to clBLAS -- confirm offx is always 0 for this benchmark.
+template <typename T>
+void xSyr<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.X, CL_TRUE, 0,
+                              buffer.N*sizeof(T),
+                              buffer.cpuX, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer X");
+}
+
+// Re-uploads the host copy of A (the operand SYR overwrites) so the next
+// call starts from the original matrix.  Throws via OPENCL_V_THROW if the
+// transfer fails.
+template <typename T>
+void xSyr<T>::reset_gpu_write_buffer()
+{
+  const cl_int err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                                          buffer.offa * sizeof(T),
+                                          buffer.N * buffer.lda * sizeof(T),
+                                          buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer A");
+}
+
+// Times one single-precision SYR call: starts the timer, enqueues
+// clblasSsyr on queue_, waits on event_, then stops the timer.
+template <>
+void xSyr<cl_float>::call_func()
+{
+  timer.Start(timer_id);
+  clblasSsyr(buffer.order, buffer.uplo, buffer.N,
+             buffer.alpha,
+             buffer.X, buffer.offx, buffer.incx,
+             buffer.A, buffer.offa, buffer.lda,
+             1, &queue_,
+             0, NULL, &event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one double-precision SYR call: starts the timer, enqueues
+// clblasDsyr on queue_, waits on event_, then stops the timer.
+//
+// The double specialization must use the D routine: the S routine would
+// narrow alpha to float and treat the cl_double buffers as float data.
+template <>
+void xSyr<cl_double>::call_func()
+{
+  timer.Start(timer_id);
+  clblasDsyr(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.X, buffer.offx,
+              buffer.incx, buffer.A, buffer.offa, buffer.lda, 1, &queue_, 0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xsyr2.hpp b/src/client/clfunc_xsyr2.hpp
new file mode 100644
index 0000000..761c616
--- /dev/null
+++ b/src/client/clfunc_xsyr2.hpp
@@ -0,0 +1,239 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XSYR2_HXX__
+#define CLBLAS_BENCHMARK_XSYR2_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xSyr2Buffer  // host and device state used by the xSyr2 benchmark class
+{
+	clblasOrder order;  // clblasRowMajor or clblasColumnMajor
+  clblasUplo uplo;  // clblasUpper or clblasLower
+  size_t N;  // matrix order; xSyr2::setup_buffer fills it from its M argument
+  T alpha;  // scalar passed as alpha to clblas?syr2
+  T* cpuX;  // host copy of x, N elements (new T[])
+  cl_mem X;  // device buffer holding x
+  size_t offx;  // element offset of x passed to clblas?syr2
+  int incx;  // increment of x; setup_buffer sets it to 1
+  T* cpuY;  // host copy of y, N elements (new T[])
+  cl_mem Y;  // device buffer holding y
+  size_t offy;  // element offset of y passed to clblas?syr2
+  int incy;  // increment of y; setup_buffer sets it to 1
+  T* cpuA;  // host copy of A, N * lda elements (new T[])
+  cl_mem A;  // device buffer holding A
+  size_t offa;  // element offset of A inside the device buffer
+  size_t lda;  // leading dimension of A; N when setup_buffer is given lda == 0
+}; // struct xSyr2Buffer
+
+// Benchmark driver for the symmetric rank-2 update (clblas?syr2).  The buffer
+// setup/upload members are defined as templates after the class; call_func()
+// is specialized per element type.
+template <typename T>
+class xSyr2 : public clblasFunc
+{
+public:
+  xSyr2(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clSyr2", 0);
+  }
+
+  // Frees all three host arrays and releases all three device buffers.
+  ~xSyr2()
+  {
+    // The arrays come from new T[] in setup_buffer(), so delete[] is
+    // required.  cpuY and Y are allocated there as well and must be freed
+    // here too, otherwise they leak.
+    delete[] buffer.cpuA;
+    delete[] buffer.cpuX;
+    delete[] buffer.cpuY;
+    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer X");
+    OPENCL_V_THROW( clReleaseMemObject(buffer.Y), "releasing buffer Y");
+  }
+
+  // 2*N*N operations divided by the measured time in nanoseconds.
+  double gflops()
+  {
+    return static_cast<double>((2 * buffer.N * buffer.N)/time_in_ns());
+  }
+
+  std::string gflops_formula()
+  {
+    return "2*N*N/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta);
+  void initialize_cpu_buffer();
+  void initialize_gpu_buffer();
+  void reset_gpu_write_buffer();
+  void call_func();
+
+  // Placeholder: not implemented yet.
+  void read_gpu_buffer()
+  {
+    //cl_int err;
+    //to-do need to fill up
+  }
+
+  // Placeholder: not implemented yet.
+  void roundtrip_func()
+  {//to-do need to fill up
+  }
+
+  // Placeholder: not implemented yet; all arguments are ignored.
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {}
+
+protected:
+  // SYR2 has no beta; only alpha is stored.
+  void initialize_scalars(double alpha, double beta)
+  {
+    buffer.alpha = alpha;
+  }
+
+private:
+  xSyr2Buffer<T> buffer;
+};
+
+// Records the problem description and allocates host arrays and device
+// buffers for SYR2.  Only order_option, uplo_option, M, lda, offA, offB, offC
+// and alpha are consulted: the matrix order is taken from M (not N), offB is
+// the offset of x and offC the offset of y.  Exits the process when a
+// non-zero lda smaller than the matrix order is requested; throws via
+// OPENCL_V_THROW when a device buffer cannot be created.
+template <typename T>
+void xSyr2<T>::setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+{
+  initialize_scalars(alpha, beta);
+  buffer.offa = offA;
+  buffer.offx = offB;
+  buffer.incx = 1;//If this changes, remember to adjust size of X in rest of the file
+  buffer.offy = offC;
+  buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file
+  buffer.N = M;
+  buffer.order = (order_option == 0) ? clblasRowMajor : clblasColumnMajor;
+  buffer.uplo = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+  // lda == 0 means "use the tightest leading dimension".
+  if (lda == 0)
+  {
+    buffer.lda = buffer.N;
+  }
+  else if (lda < buffer.N)
+  {
+    std::cerr << "lda:wrong size\n";
+    exit(1);
+  }
+  else
+  {
+    buffer.lda = lda;
+  }
+
+  buffer.cpuX = new T[buffer.N];
+  buffer.cpuY = new T[buffer.N];
+  buffer.cpuA = new T[buffer.N * buffer.lda];
+
+  cl_int err;
+  // SYR2 updates A in place, so the kernel must be allowed to write it; the
+  // buffer also leaves room for the offa leading elements that both the
+  // upload and the clBLAS call skip.
+  buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N * buffer.lda + buffer.offa) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer A");
+
+  // Room for the leading offx / offy elements the clBLAS call skips.
+  buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N + buffer.offx) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer X");
+  buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                            (buffer.N + buffer.offy) * sizeof(T),
+                            NULL, &err);
+  OPENCL_V_THROW(err, "creating buffer Y");
+}
+
+// Seeds rand() with the fixed value 10, then fills the host copies of A
+// (N * lda elements, in storage order) and of x and y (N elements each,
+// generated alternately) with random<T>(UPPER_BOUND<T>()) / randomScale<T>().
+template <typename T>
+void xSyr2<T>::initialize_cpu_buffer()
+{
+  srand(10);
+
+  const size_t a_count = buffer.N * buffer.lda;
+  for (size_t k = 0; k < a_count; ++k)
+  {
+    buffer.cpuA[k] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+  }
+
+  size_t k = 0;
+  while (k < buffer.N)
+  {
+    buffer.cpuX[k] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+    buffer.cpuY[k] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+    ++k;
+  }
+}
+
+// Blocking upload of A (at element offset offa), x and y (both at offset 0)
+// to the device.  Throws via OPENCL_V_THROW if a transfer fails instead of
+// silently discarding the status.
+//
+// NOTE(review): x and y are uploaded at offset 0 while call_func() passes
+// buffer.offx / buffer.offy to clBLAS -- confirm both are always 0 here.
+template <typename T>
+void xSyr2<T>::initialize_gpu_buffer()
+{
+  cl_int err;
+
+  err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(T),
+                              buffer.N * buffer.lda*sizeof(T),
+                              buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer A");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.X, CL_TRUE, 0,
+                              buffer.N*sizeof(T),
+                              buffer.cpuX, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer X");
+
+  err = clEnqueueWriteBuffer(queue_, buffer.Y, CL_TRUE, 0,
+                              buffer.N*sizeof(T),
+                              buffer.cpuY, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "writing buffer Y");
+}
+
+// Re-uploads the host copy of A (the operand SYR2 overwrites) so the next
+// call starts from the original matrix.  Throws via OPENCL_V_THROW if the
+// transfer fails.
+template <typename T>
+void xSyr2<T>::reset_gpu_write_buffer()
+{
+  const cl_int err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                                          buffer.offa * sizeof(T),
+                                          buffer.N * buffer.lda * sizeof(T),
+                                          buffer.cpuA, 0, NULL, NULL);
+  OPENCL_V_THROW(err, "resetting buffer A");
+}
+
+// Times one single-precision SYR2 call: starts the timer, enqueues
+// clblasSsyr2 on queue_, waits on event_, then stops the timer.
+template <>
+void xSyr2<cl_float>::call_func()
+{
+  timer.Start(timer_id);
+  clblasSsyr2(buffer.order, buffer.uplo, buffer.N,
+              buffer.alpha,
+              buffer.X, buffer.offx, buffer.incx,
+              buffer.Y, buffer.offy, buffer.incy,
+              buffer.A, buffer.offa, buffer.lda,
+              1, &queue_,
+              0, NULL, &event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Times one double-precision SYR2 call: starts the timer, enqueues
+// clblasDsyr2 on queue_, waits on event_, then stops the timer.
+//
+// The double specialization must use the D routine: the S routine would
+// narrow alpha to float and treat the cl_double buffers as float data.
+template <>
+void xSyr2<cl_double>::call_func()
+{
+  timer.Start(timer_id);
+  clblasDsyr2(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.X, buffer.offx,
+    buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, 1, &queue_, 0, NULL,&event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XSYR2_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
new file mode 100644
index 0000000..4faa399
--- /dev/null
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -0,0 +1,460 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__
+#define CLBLAS_BENCHMARK_XSYR2K_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xSyr2kBuffer  // problem description plus host/device storage for one xSyr2k run
+{
+    clblasOrder order_;  // NOTE(review): never assigned in the visible xSyr2k code, which sets a separate `order_` (presumably inherited from clblasFunc)
+    size_t n_;  // N argument of setup_buffer; C holds N vectors
+    size_t k_;  // K argument of setup_buffer
+    size_t lda_;  // leading dimension of A, in elements
+    size_t ldb_;  // leading dimension of B, in elements
+    size_t ldc_;  // leading dimension of C, in elements
+    size_t offA_;  // element offset of A inside buf_a_
+    size_t offB_;  // element offset of B inside buf_b_
+    size_t offC_;  // element offset of C inside buf_c_
+    size_t a_num_vectors_;  // number of lda_-long vectors stored in a_ (N or K, depending on order and transpose)
+    size_t b_num_vectors_;  // number of ldb_-long vectors stored in b_ (always equal to a_num_vectors_)
+    size_t c_num_vectors_;  // number of ldc_-long vectors stored in c_ (always N)
+    clblasTranspose trans_;  // transpose mode handed to clblas?syr2k
+    clblasUplo uplo_;  // which triangle of C is referenced
+    T* a_;  // host copy of A, allocated with new T[lda_*a_num_vectors_]
+    T* b_;  // host copy of B, allocated with new T[ldb_*b_num_vectors_]
+    T* c_;  // host copy of C, allocated with new T[ldc_*c_num_vectors_]
+    cl_mem buf_a_;  // device buffer for A (payload size plus offA_ elements)
+    cl_mem buf_b_;  // device buffer for B (payload size plus offB_ elements)
+    cl_mem buf_c_;  // device buffer for C (payload size plus offC_ elements)
+    T alpha_;  // scalar built with makeScalar<T>(alpha)
+    T beta_;  // scalar built with makeScalar<T>(beta)
+}; // struct xSyr2kBuffer
+
+/**
+ * Benchmark client for the clBLAS SYR2K routines.
+ *
+ * setup_buffer() turns the integer options into clBLAS enums, resolves
+ * the leading dimensions and allocates host and device storage;
+ * initialize_cpu_buffer() and initialize_gpu_buffer() fill and upload the
+ * operands.  call_func() is specialised per element type after the class
+ * body and performs the timed library call.
+ */
+template <typename T>
+class xSyr2k : public clblasFunc
+{
+public:
+    xSyr2k(StatisticalTimer& _timer, cl_device_type devType) :
+        clblasFunc(_timer, devType)
+    {
+        timer.getUniqueID("clSyr2k", 0);
+    }
+
+    /** Frees the host arrays and releases the three device buffers. */
+    ~xSyr2k()
+    {
+        // The host arrays are created with new T[] in setup_buffer(), so
+        // they must be released with delete[]; plain delete is undefined.
+        delete[] buffer_.a_;
+        delete[] buffer_.b_;
+        delete[] buffer_.c_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
+                        "releasing buffer B");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+                        "releasing buffer C");
+    }
+
+    /** Generic fallback; the per-type specialisations do the real work. */
+    void call_func()
+    {
+    }
+
+    /**
+     * Rate figure derived from the problem size and time_in_ns().
+     * NOTE(review): only n_ enters the computation (k_ is unused) and the
+     * second term is not doubled, so this does not match the text
+     * returned by gflops_formula() - confirm which of the two is intended.
+     */
+    double gflops()
+    {
+        return 2.0*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns() +
+            buffer_.n_*(buffer_.n_+1)/time_in_ns();
+    }
+
+    /** Human-readable description of the rate formula. */
+    std::string gflops_formula()
+    {
+        return "2.0*(M*(M+1)*N+M*(M+1))/time";
+    }
+
+    /**
+     * Records the problem description and allocates all storage.
+     *
+     * order_option  0 selects row-major, anything else column-major.
+     * uplo_option   0 selects the upper triangle, anything else the lower.
+     * transA_option 0 = no transpose, 1 = transpose, 2 = conjugate
+     *               transpose (other values leave trans_ unset, as before).
+     * N, K          problem dimensions.
+     * lda/ldb/ldc   leading dimensions; 0 means "use the minimum".  A
+     *               non-zero value below the minimum prints a message and
+     *               terminates the process.
+     * offA/offBX/offCY  element offsets of A, B and C in their buffers.
+     * alpha, beta   scalars, converted with makeScalar<T>.
+     * side_option, diag_option, transB_option and M are unused.
+     */
+    void setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.n_ = N;
+        buffer_.k_ = K;
+        buffer_.offA_ = offA;
+        buffer_.offB_ = offBX;
+        buffer_.offC_ = offCY;
+
+        buffer_.uplo_ = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+        // C always holds N vectors of at least N elements.  An undersized
+        // ldc now terminates like lda/ldb instead of leaving ldc_ unset.
+        buffer_.ldc_ = resolve_ld("ldc", ldc, N);
+        buffer_.c_num_vectors_ = N;
+
+        const bool row_major = (order_option == 0);
+        const bool no_trans = (transA_option == 0);
+
+        order_ = row_major ? clblasRowMajor : clblasColumnMajor;
+
+        if (transA_option == 0)
+        {
+            buffer_.trans_ = clblasNoTrans;
+        }
+        else if (transA_option == 1)
+        {
+            buffer_.trans_ = clblasTrans;
+        }
+        else if (transA_option == 2)
+        {
+            buffer_.trans_ = clblasConjTrans;
+        }
+
+        // A and B share one shape.  Row-major/no-transpose and
+        // column-major/transpose store N vectors of K elements; the two
+        // remaining combinations store K vectors of N elements.
+        const bool n_vectors_of_k = (row_major == no_trans);
+        const size_t ab_num_vectors = n_vectors_of_k ? N : K;
+        const size_t ab_min_ld = n_vectors_of_k ? K : N;
+
+        buffer_.a_num_vectors_ = ab_num_vectors;
+        buffer_.b_num_vectors_ = ab_num_vectors;
+        buffer_.lda_ = resolve_ld("lda", lda, ab_min_ld);
+        buffer_.ldb_ = resolve_ld("ldb", ldb, ab_min_ld);
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer A");
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer B");
+
+        // C is both read and written by the routine, so it must not be
+        // created read-only.
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer C");
+    }
+
+    /** Fills the host copies of A, B and C (in that order) after srand(10). */
+    void initialize_cpu_buffer()
+    {
+        srand(10);
+        fill_random(buffer_.a_, buffer_.lda_ * buffer_.a_num_vectors_);
+        fill_random(buffer_.b_, buffer_.ldb_ * buffer_.b_num_vectors_);
+        fill_random(buffer_.c_, buffer_.ldc_ * buffer_.c_num_vectors_);
+    }
+
+    /** Uploads A, B and C to the device with blocking writes. */
+    void initialize_gpu_buffer()
+    {
+        cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer A");
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.b_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer B");
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.c_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer C");
+    }
+
+    /** Restores the device copy of C from the host copy. */
+    void reset_gpu_write_buffer()
+    {
+        cl_int err;
+
+        // Write at offC_, the same position initialize_gpu_buffer() uses
+        // and the library call reads from; offset 0 would leave the tail
+        // of the region stale whenever offC_ != 0.
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.c_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "resetting buffer C");
+    }
+
+    /** TODO: not implemented yet. */
+    void read_gpu_buffer()
+    {
+    }
+
+    /** TODO: not implemented yet. */
+    void roundtrip_func()
+    {
+    }
+
+    /** TODO: not implemented yet; all arguments are ignored. */
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+    }
+
+protected:
+    /** Converts the double scalars to T and stores them. */
+    void initialize_scalars(double alpha, double beta)
+    {
+        buffer_.alpha_ = makeScalar<T>(alpha);
+        buffer_.beta_ = makeScalar<T>(beta);
+    }
+
+private:
+    /**
+     * Resolves a leading dimension: 0 selects `minimum`, a value below
+     * `minimum` prints "<name>:wrong size" and exits with status 1, any
+     * other value is returned unchanged.
+     */
+    static size_t resolve_ld(const char* name, size_t requested,
+                             size_t minimum)
+    {
+        if (requested == 0)
+        {
+            return minimum;
+        }
+        if (requested < minimum)
+        {
+            std::cerr << name << ":wrong size\n";
+            exit(1);
+        }
+        return requested;
+    }
+
+    /** Writes `count` scaled random values to data[0..count). */
+    static void fill_random(T* data, size_t count)
+    {
+        for (size_t i = 0; i < count; ++i)
+        {
+            data[i] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+        }
+    }
+
+    xSyr2kBuffer<T> buffer_;
+
+}; // class xSyr2k
+
+// Single-precision real SYR2K: starts the timer, enqueues clblasSsyr2k on
+// queue_, waits for the returned event and stops the timer.
+template<>
+void xSyr2k<float>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasSsyr2k(order_, buffer_.uplo_, buffer_.trans_,
+                 buffer_.n_, buffer_.k_,
+                 buffer_.alpha_,
+                 buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                 buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                 buffer_.beta_,
+                 buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
+                 1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Double-precision real SYR2K: starts the timer, enqueues clblasDsyr2k on
+// queue_, waits for the returned event and stops the timer.
+template<>
+void xSyr2k<double>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasDsyr2k(order_, buffer_.uplo_, buffer_.trans_,
+                 buffer_.n_, buffer_.k_,
+                 buffer_.alpha_,
+                 buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                 buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                 buffer_.beta_,
+                 buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
+                 1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Single-precision complex SYR2K: starts the timer, enqueues clblasCsyr2k
+// on queue_, waits for the returned event and stops the timer.
+template<>
+void xSyr2k<cl_float2>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasCsyr2k(order_, buffer_.uplo_, buffer_.trans_,
+                 buffer_.n_, buffer_.k_,
+                 buffer_.alpha_,
+                 buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                 buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                 buffer_.beta_,
+                 buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
+                 1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Double-precision complex SYR2K: starts the timer, enqueues clblasZsyr2k
+// on queue_, waits for the returned event and stops the timer.
+template<>
+void xSyr2k<cl_double2>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasZsyr2k(order_, buffer_.uplo_, buffer_.trans_,
+                 buffer_.n_, buffer_.k_,
+                 buffer_.alpha_,
+                 buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                 buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                 buffer_.beta_,
+                 buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
+                 1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__
diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp
new file mode 100644
index 0000000..5bfd0e3
--- /dev/null
+++ b/src/client/clfunc_xsyrk.hpp
@@ -0,0 +1,372 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XSYRK_HXX__
+#define CLBLAS_BENCHMARK_XSYRK_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xSyrkBuffer  // problem description plus host/device storage for one xSyrk run
+{
+    clblasOrder order_;  // NOTE(review): never assigned in the visible xSyrk code, which sets a separate `order_` (presumably inherited from clblasFunc)
+    size_t n_;  // N argument of setup_buffer; C holds N vectors
+    size_t k_;  // K argument of setup_buffer
+    size_t lda_;  // leading dimension of A, in elements
+    size_t ldc_;  // leading dimension of C, in elements
+    size_t offA_;  // element offset of A inside buf_a_
+    size_t offC_;  // element offset of C inside buf_c_
+    size_t a_num_vectors_;  // number of lda_-long vectors stored in a_ (N or K, depending on order and transpose)
+    size_t c_num_vectors_;  // number of ldc_-long vectors stored in c_ (always N)
+    clblasTranspose trans_a_;  // transpose mode handed to clblas?syrk
+    clblasUplo uplo_;  // which triangle of C is referenced
+    T* a_;  // host copy of A, allocated with new T[lda_*a_num_vectors_]
+    T* c_;  // host copy of C, allocated with new T[ldc_*c_num_vectors_]
+    cl_mem buf_a_;  // device buffer for A (payload size plus offA_ elements)
+    cl_mem buf_c_;  // device buffer for C (payload size plus offC_ elements)
+    T alpha_;  // scalar built with makeScalar<T>(alpha)
+    T beta_;  // scalar built with makeScalar<T>(beta)
+}; // struct xSyrkBuffer
+
+/**
+ * Benchmark client for the clBLAS SYRK routines.
+ *
+ * setup_buffer() turns the integer options into clBLAS enums, resolves
+ * the leading dimensions and allocates host and device storage;
+ * initialize_cpu_buffer() and initialize_gpu_buffer() fill and upload the
+ * operands.  call_func() is specialised per element type after the class
+ * body and performs the timed library call.
+ */
+template <typename T>
+class xSyrk : public clblasFunc
+{
+public:
+    xSyrk(StatisticalTimer& _timer, cl_device_type devType) :
+        clblasFunc(_timer, devType)
+    {
+        timer.getUniqueID("clSyrk", 0);
+    }
+
+    /** Frees the host arrays and releases both device buffers. */
+    ~xSyrk()
+    {
+        // The host arrays are created with new T[] in setup_buffer(), so
+        // they must be released with delete[]; plain delete is undefined.
+        delete[] buffer_.a_;
+        delete[] buffer_.c_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+                        "releasing buffer C");
+    }
+
+    /** Generic fallback; the per-type specialisations do the real work. */
+    void call_func()
+    {
+    }
+
+    /**
+     * Rate figure matching gflops_formula(): (N*(N+1)*K + N*(N+1)) / time.
+     * The arithmetic is done in double so that large sizes cannot wrap
+     * around in size_t, and K is used for the inner dimension as the
+     * formula text states (the previous code used N there).
+     */
+    double gflops()
+    {
+        const double n = static_cast<double>(buffer_.n_);
+        const double k = static_cast<double>(buffer_.k_);
+        return (n*(n+1)*k + n*(n+1))/time_in_ns();
+    }
+
+    /** Human-readable description of the rate formula. */
+    std::string gflops_formula()
+    {
+        return "(N*(N+1)*K+N*(N+1))/time";
+    }
+
+    /**
+     * Records the problem description and allocates all storage.
+     *
+     * order_option  0 selects row-major, anything else column-major.
+     * uplo_option   0 selects the upper triangle, anything else the lower.
+     * transA_option 0 = no transpose, 1 = transpose, 2 = conjugate
+     *               transpose (other values leave trans_a_ unset, as
+     *               before).
+     * N, K          problem dimensions.
+     * lda/ldc       leading dimensions; 0 means "use the minimum".  A
+     *               non-zero value below the minimum prints a message and
+     *               terminates the process.
+     * offA/offCY    element offsets of A and C in their buffers.
+     * alpha, beta   scalars, converted with makeScalar<T>.
+     * side_option, diag_option, transB_option, M, ldb and offBX are
+     * unused.
+     */
+    void setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        DUMMY_ARGS_USAGE_2(ldb, offBX);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.n_ = N;
+        buffer_.k_ = K;
+        buffer_.offA_ = offA;
+        buffer_.offC_ = offCY;
+
+        buffer_.uplo_ = (uplo_option == 0) ? clblasUpper : clblasLower;
+
+        // C always holds N vectors of at least N elements.  An undersized
+        // ldc now terminates like lda instead of leaving ldc_ unset.
+        buffer_.ldc_ = resolve_ld("ldc", ldc, N);
+        buffer_.c_num_vectors_ = N;
+
+        const bool row_major = (order_option == 0);
+        const bool no_trans = (transA_option == 0);
+
+        order_ = row_major ? clblasRowMajor : clblasColumnMajor;
+
+        if (transA_option == 0)
+        {
+            buffer_.trans_a_ = clblasNoTrans;
+        }
+        else if (transA_option == 1)
+        {
+            buffer_.trans_a_ = clblasTrans;
+        }
+        else if (transA_option == 2)
+        {
+            buffer_.trans_a_ = clblasConjTrans;
+        }
+
+        // Row-major/no-transpose and column-major/transpose store A as N
+        // vectors of K elements; the two remaining combinations store K
+        // vectors of N elements.
+        const bool n_vectors_of_k = (row_major == no_trans);
+        buffer_.a_num_vectors_ = n_vectors_of_k ? N : K;
+        buffer_.lda_ = resolve_ld("lda", lda, n_vectors_of_k ? K : N);
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer A");
+
+        // C is both read and written by the routine, so it must not be
+        // created read-only.
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer C");
+    }
+
+    /** Fills the host copies of A and C (in that order) after srand(10). */
+    void initialize_cpu_buffer()
+    {
+        srand(10);
+        fill_random(buffer_.a_, buffer_.lda_ * buffer_.a_num_vectors_);
+        fill_random(buffer_.c_, buffer_.ldc_ * buffer_.c_num_vectors_);
+    }
+
+    /** Uploads A and C to the device with blocking writes. */
+    void initialize_gpu_buffer()
+    {
+        cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer A");
+
+        // C lives at offC_ (the buffer is sized for it and the library
+        // call reads it there); the earlier code used offA_ here.
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.c_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer C");
+    }
+
+    /** Restores the device copy of C from the host copy. */
+    void reset_gpu_write_buffer()
+    {
+        cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.c_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "resetting buffer C");
+    }
+
+    /** TODO: not implemented yet. */
+    void read_gpu_buffer()
+    {
+    }
+
+    /** TODO: not implemented yet. */
+    void roundtrip_func()
+    {
+    }
+
+    /** TODO: not implemented yet; all arguments are ignored. */
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+    }
+
+protected:
+    /** Converts the double scalars to T and stores them. */
+    void initialize_scalars(double alpha, double beta)
+    {
+        buffer_.alpha_ = makeScalar<T>(alpha);
+        buffer_.beta_ = makeScalar<T>(beta);
+    }
+
+private:
+    /**
+     * Resolves a leading dimension: 0 selects `minimum`, a value below
+     * `minimum` prints "<name>:wrong size" and exits with status 1, any
+     * other value is returned unchanged.
+     */
+    static size_t resolve_ld(const char* name, size_t requested,
+                             size_t minimum)
+    {
+        if (requested == 0)
+        {
+            return minimum;
+        }
+        if (requested < minimum)
+        {
+            std::cerr << name << ":wrong size\n";
+            exit(1);
+        }
+        return requested;
+    }
+
+    /** Writes `count` scaled random values to data[0..count). */
+    static void fill_random(T* data, size_t count)
+    {
+        for (size_t i = 0; i < count; ++i)
+        {
+            data[i] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+        }
+    }
+
+    xSyrkBuffer<T> buffer_;
+
+}; // class xSyrk
+
+// Single-precision real SYRK: starts the timer, enqueues clblasSsyrk on
+// queue_, waits for the returned event and stops the timer.
+template<>
+void xSyrk<float>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_,
+                buffer_.n_, buffer_.k_,
+                buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.beta_,
+                buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
+                1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Double-precision real SYRK: starts the timer, enqueues clblasDsyrk on
+// queue_, waits for the returned event and stops the timer.
+template<>
+void xSyrk<double>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasDsyrk(order_, buffer_.uplo_, buffer_.trans_a_,
+                buffer_.n_, buffer_.k_,
+                buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.beta_,
+                buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
+                1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Single-precision complex SYRK: starts the timer, enqueues clblasCsyrk
+// on queue_, waits for the returned event and stops the timer.
+template<>
+void xSyrk<cl_float2>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasCsyrk(order_, buffer_.uplo_, buffer_.trans_a_,
+                buffer_.n_, buffer_.k_,
+                buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.beta_,
+                buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
+                1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Double-precision complex SYRK: starts the timer, enqueues clblasZsyrk
+// on queue_, waits for the returned event and stops the timer.
+template<>
+void xSyrk<cl_double2>::call_func()
+{
+    timer.Start(timer_id);
+
+    clblasZsyrk(order_, buffer_.uplo_, buffer_.trans_a_,
+                buffer_.n_, buffer_.k_,
+                buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.beta_,
+                buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
+                1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+#endif // ifndef CLBLAS_BENCHMARK_XSYRK_HXX__
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
new file mode 100644
index 0000000..d47ddfd
--- /dev/null
+++ b/src/client/clfunc_xtrmm.hpp
@@ -0,0 +1,785 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XTRMM_HXX__
+#define CLBLAS_BENCHMARK_XTRMM_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xTrmmBuffer  // problem description plus host/device storage for one xTrmm run
+{
+    clblasOrder order_;  // NOTE(review): xTrmm::setup_buffer assigns this only in its column-major branch; the row-major branch sets a separate `order_` instead
+    size_t m_;  // M argument of setup_buffer
+    size_t n_;  // N argument of setup_buffer
+    size_t lda_;  // leading dimension of A, in elements
+    size_t ldb_;  // leading dimension of B, in elements
+    size_t offA_;  // element offset of A inside buf_a_
+    size_t offB_;  // element offset of B inside buf_b_
+    size_t a_num_vectors_;  // number of lda_-long vectors in a_: M for the left side, N for the right side
+    size_t b_num_vectors_;  // number of ldb_-long vectors in b_: M for row-major, N for column-major
+    clblasTranspose trans_a_;  // transpose mode selected from transA_option
+    clblasSide side_;  // clblasLeft when side_option is 0, otherwise clblasRight
+    clblasUplo uplo_;  // clblasUpper when uplo_option is 0, otherwise clblasLower
+    clblasDiag diag_;  // clblasUnit when diag_option is 0, otherwise clblasNonUnit
+    T* a_;  // host copy of A, allocated with new T[lda_*a_num_vectors_]
+    T* b_;  // host copy of B, allocated with new T[ldb_*b_num_vectors_]
+    cl_mem buf_a_;  // device buffer for A (payload size plus offA_ elements)
+    cl_mem buf_b_;  // device buffer for B (payload size plus offB_ elements)
+    T alpha_;  // scalar; presumably filled by initialize_scalars (not visible here)
+}; // struct xTrmmBuffer
+
+template <typename T>
+class xTrmm : public clblasFunc
+{
+public:
+    xTrmm(StatisticalTimer& timer, cl_device_type devType) :
+        clblasFunc(timer, devType)
+    {
+        timer.getUniqueID("clTrmm", 0);
+    }
+
+    ~xTrmm()
+    {
+        delete buffer_.a_;
+        delete buffer_.b_;
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
+                       "releasing buffer A");
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
+                       "releasing buffer B");
+    }
+
+    void call_func()
+    {
+        std::cout << "xtrmm::call_func\n";
+    }
+
+    double gflops()
+    {
+        if (buffer_.side_ == clblasLeft)
+        {
+            return buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns();
+        }
+        else
+        {
+            return 20*buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+        }
+    }
+
+    std::string gflops_formula()
+    {
+        if (buffer_.side_ == clblasLeft)
+        {
+            return "M*(M+1)*N/time";
+        }
+        else
+        {
+            return "M*(N+1)*N/time";
+        }
+    }
+
+    void setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_3(transB_option, K, beta);
+        DUMMY_ARGS_USAGE_2(ldc, offCY);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.m_ = M;
+        buffer_.n_ = N;
+        buffer_.offA_ = offA;
+        buffer_.offB_ = offBX;
+
+        if (transA_option == 0)
+        {
+            buffer_.trans_a_ = clblasNoTrans;
+        }
+        else if (transA_option == 1)
+        {
+            buffer_.trans_a_ = clblasTrans;
+        }
+        else if (transA_option == 2)
+        {
+            buffer_.trans_a_ = clblasConjTrans;
+        }
+
+        if (side_option == 0)
+        {
+            buffer_.side_ = clblasLeft;
+            buffer_.a_num_vectors_ = M;
+        }
+        else
+        {
+            buffer_.side_ = clblasRight;
+            buffer_.a_num_vectors_ = N;
+        }
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+
+        if (diag_option == 0)
+        {
+            buffer_.diag_ = clblasUnit;
+        }
+        else
+        {
+            buffer_.diag_ = clblasNonUnit;
+        }
+
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            buffer_.b_num_vectors_ = M;
+            if (ldb == 0)
+            {
+                buffer_.ldb_ = N;
+            }
+            else
+            {
+                if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            buffer_.order_ = clblasColumnMajor;
+            buffer_.b_num_vectors_ = N;
+            if (ldb == 0)
+            {
+                buffer_.ldb_ = M;
+            }
+            else
+            {
+                if (ldb < M)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        if (lda == 0)
+        {
+            if (side_option == 0)
+            {
+                buffer_.lda_ = M;
+            }
+            else
+            {
+                buffer_.lda_ = N;
+            }
+        }
+        else
+        {
+            if( side_option == 0 && lda < M )
+            {
+                std::cerr << "ERROR: when side is 0, lda must be set to 0 "
+                             "or a value >= M" << std::endl;
+            }
+            else if(side_option == 0 && lda >= M )
+            {
+                buffer_.lda_ = lda;
+            }
+            else if(side_option != 0 && lda < N)
+            {
+                std::cerr << "ERROR: when side is 1, lda must be set to 0 "
+                             "or a value >= N" << std::endl;
+            }
+            else if (side_option != 0 && lda >= N)
+            {
+                buffer_.lda_ = lda;
+            }
+
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+    }
+
+    // Populate the host arrays b_ and a_ with generated values.
+    // The C library generator is re-seeded with the constant 10 first.
+    // B is filled before A. When diag_ is clblasUnit, every entry of A whose
+    // position inside its vector equals the vector index is set to ONE<T>()
+    // instead of a generated value.
+    void initialize_cpu_buffer()
+    {
+        srand(10);
+
+        for (size_t vec = 0; vec < buffer_.b_num_vectors_; ++vec)
+        {
+            // Vectors of B are ldb_ elements apart.
+            T* const b_vec = buffer_.b_ + vec*buffer_.ldb_;
+            for (size_t pos = 0; pos < buffer_.ldb_; ++pos)
+            {
+                b_vec[pos] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+            }
+        }
+
+        const bool unit_diagonal = (buffer_.diag_ == clblasUnit);
+        for (size_t vec = 0; vec < buffer_.a_num_vectors_; ++vec)
+        {
+            // Vectors of A are lda_ elements apart.
+            T* const a_vec = buffer_.a_ + vec*buffer_.lda_;
+            for (size_t pos = 0; pos < buffer_.lda_; ++pos)
+            {
+                if (unit_diagonal && pos == vec)
+                {
+                    a_vec[pos] = ONE<T>();
+                    continue;
+                }
+                a_vec[pos] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+            }
+        }
+    }
+
+    // Upload the host arrays a_ and b_ into buf_a_ and buf_b_ using blocking
+    // writes that start offA_ / offB_ elements into the device buffers.
+    // The status of each enqueue is stored in err but not examined.
+    void initialize_gpu_buffer()
+    {
+        const size_t a_offset_bytes = buffer_.offA_ * sizeof(T);
+        const size_t a_copy_bytes   = buffer_.lda_ * buffer_.a_num_vectors_ *
+                                          sizeof(T);
+        const size_t b_offset_bytes = buffer_.offB_ * sizeof(T);
+        const size_t b_copy_bytes   = buffer_.ldb_ *buffer_.b_num_vectors_ *
+                                          sizeof(T);
+
+        cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   a_offset_bytes, a_copy_bytes,
+                                   buffer_.a_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   b_offset_bytes, b_copy_bytes,
+                                   buffer_.b_, 0, NULL, NULL);
+    }
+
+    // Re-upload the host array b_ into buf_b_ (blocking write that starts
+    // offB_ elements into the device buffer). The enqueue status is stored
+    // in err but not examined.
+    void reset_gpu_write_buffer()
+    {
+        const size_t dst_offset_bytes = buffer_.offB_ * sizeof(T);
+        const size_t copy_bytes       = buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                            sizeof(T);
+
+        cl_int err;
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   dst_offset_bytes, copy_bytes,
+                                   buffer_.b_, 0, NULL, NULL);
+    }
+    // Copy buf_b_ back into the host array b_ with a blocking read that
+    // starts offB_ elements into the device buffer. The enqueue status is
+    // stored in err but not examined.
+    void read_gpu_buffer()
+    {
+        const size_t src_offset_bytes = buffer_.offB_ * sizeof(T);
+        const size_t copy_bytes       = buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                            sizeof(T);
+
+        cl_int err;
+        err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                  src_offset_bytes, copy_bytes,
+                                  buffer_.b_, 0, NULL, NULL);
+    }
+    // Generic fallback for element types without a roundtrip_func()
+    // specialization: it performs no work and only reports that it was
+    // reached. The message names this class (it previously said "xGemm").
+    void roundtrip_func()
+    {
+        std::cout << "xTrmm::roundtrip_func\n";
+    }
+    // Host-side setup for the round-trip benchmark: records the problem
+    // description in buffer_ and allocates the host arrays a_ and b_.
+    // No OpenCL buffers are created here.
+    //
+    //   order_option  0 -> clblasRowMajor, otherwise clblasColumnMajor
+    //   side_option   0 -> clblasLeft (A has M vectors), otherwise
+    //                 clblasRight (A has N vectors)
+    //   uplo_option   0 -> clblasUpper, otherwise clblasLower
+    //   diag_option   0 -> clblasUnit, otherwise clblasNonUnit
+    //   transA_option 0/1/2 -> NoTrans/Trans/ConjTrans; any other value
+    //                 leaves trans_a_ untouched
+    //   lda, ldb      0 selects the smallest accepted leading dimension; a
+    //                 non-zero value below that minimum prints an error and
+    //                 terminates the process
+    //   transB_option, K, ldc, offCY are not used by TRMM.
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_3(transB_option, K, beta);
+        DUMMY_ARGS_USAGE_2(ldc, offCY);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.m_ = M;
+        buffer_.n_ = N;
+        buffer_.offA_ = offA;
+        buffer_.offB_ = offBX;
+
+        if (transA_option == 0)
+        {
+            buffer_.trans_a_ = clblasNoTrans;
+        }
+        else if (transA_option == 1)
+        {
+            buffer_.trans_a_ = clblasTrans;
+        }
+        else if (transA_option == 2)
+        {
+            buffer_.trans_a_ = clblasConjTrans;
+        }
+
+        const bool left_side = (side_option == 0);
+        buffer_.side_ = left_side ? clblasLeft : clblasRight;
+        buffer_.a_num_vectors_ = left_side ? M : N;
+
+        buffer_.uplo_ = (uplo_option == 0) ? clblasUpper : clblasLower;
+        buffer_.diag_ = (diag_option == 0) ? clblasUnit : clblasNonUnit;
+
+        // Set BOTH order fields in BOTH cases: the clblas calls read order_,
+        // so leaving it untouched for column-major would silently benchmark
+        // whatever order was set previously.
+        const bool row_major = (order_option == 0);
+        order_ = row_major ? clblasRowMajor : clblasColumnMajor;
+        buffer_.order_ = row_major ? clblasRowMajor : clblasColumnMajor;
+
+        buffer_.b_num_vectors_ = row_major ? M : N;
+        const size_t min_ldb = row_major ? N : M;
+        if (ldb == 0)
+        {
+            buffer_.ldb_ = min_ldb;
+        }
+        else if (ldb < min_ldb)
+        {
+            std::cerr << "ldb:wrong size\n";
+            exit(1);
+        }
+        else
+        {
+            buffer_.ldb_ = ldb;
+        }
+
+        const size_t min_lda = left_side ? M : N;
+        if (lda == 0)
+        {
+            buffer_.lda_ = min_lda;
+        }
+        else if (lda < min_lda)
+        {
+            if (left_side)
+            {
+                std::cerr << "ERROR: when side is 0, lda must be set to 0 "
+                             "or a value >= M" << std::endl;
+            }
+            else
+            {
+                std::cerr << "ERROR: when side is 1, lda must be set to 0 "
+                             "or a value >= N" << std::endl;
+            }
+            // Stop here, as the ldb check does: continuing would size the
+            // allocation below from an lda_ that was never assigned.
+            exit(1);
+        }
+        else
+        {
+            buffer_.lda_ = lda;
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+    }
+protected:
+    // Store alpha in buffer_.alpha_ after converting it with makeScalar<T>.
+    // beta is only handed to DUMMY_ARG_USAGE; nothing else reads it here.
+    void initialize_scalars(double alpha, double beta)
+    {
+        buffer_.alpha_ = makeScalar<T>(alpha);
+        DUMMY_ARG_USAGE(beta);
+    }
+
+private:
+    xTrmmBuffer<T> buffer_;
+
+}; // class xTrmm
+
+// Single-precision real TRMM: time one clblasStrmm call. The timer runs
+// from just before the call until the event it returns has completed.
+template<>
+void
+xTrmm<cl_float>::
+call_func()
+{
+    const xTrmmBuffer<cl_float>& args = buffer_;
+
+    timer.Start(timer_id);
+
+    clblasStrmm(order_,
+                args.side_, args.uplo_, args.trans_a_, args.diag_,
+                args.m_, args.n_, args.alpha_,
+                args.buf_a_, args.offA_, args.lda_,
+                args.buf_b_, args.offB_, args.ldb_,
+                1, &queue_,   // one command queue
+                0, NULL,      // empty wait list
+                &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Timed round trip for single-precision real TRMM: create the device
+// buffers, upload A and B, run clblasStrmm, then read B back into b_.
+// The timer covers all of these steps.
+//
+// buf_b_ is created CL_MEM_READ_WRITE because TRMM stores its result in B
+// and the host reads it back afterwards; a CL_MEM_READ_ONLY buffer must not
+// be written by a kernel. buf_a_ is input only and stays CL_MEM_READ_ONLY.
+//
+// NOTE: each call assigns newly created cl_mem objects to buf_a_/buf_b_
+// without releasing the previous values, and no status code is checked.
+template<>
+void
+xTrmm<cl_float>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+
+    // set up buffers
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_float),
+                                    NULL, &err);
+
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_float),
+                                    NULL, &err);
+
+    // initialize gpu buffers
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                               buffer_.offA_ * sizeof(cl_float),
+                               buffer_.lda_ * buffer_.a_num_vectors_ *
+                                   sizeof(cl_float),
+                               buffer_.a_, 0, NULL, NULL);
+
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                               buffer_.offB_ * sizeof(cl_float),
+                               buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                   sizeof(cl_float),
+                               buffer_.b_, 0, NULL, NULL);
+
+    // call_func
+    clblasStrmm(order_, buffer_.side_, buffer_.uplo_,
+                buffer_.trans_a_, buffer_.diag_,
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                1, &queue_, 0, NULL, NULL);
+
+    // read gpu buffer; the read's event is what the timer waits on
+    err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                              buffer_.offB_ * sizeof(cl_float),
+                              buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                  sizeof(cl_float),
+                              buffer_.b_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Double-precision real TRMM: time one clblasDtrmm call. The timer runs
+// from just before the call until the event it returns has completed.
+template<>
+void
+xTrmm<cl_double>::
+call_func()
+{
+    timer.Start(timer_id);
+
+    // buf_a_ is paired with offA_ (not offB_), matching the uploads and the
+    // other three precision variants.
+    clblasDtrmm(order_, buffer_.side_, buffer_.uplo_,
+                     buffer_.trans_a_, buffer_.diag_,
+                     buffer_.m_, buffer_.n_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Timed round trip for double-precision real TRMM: create the device
+// buffers, upload A and B, run clblasDtrmm, then read B back into b_.
+// The timer covers all of these steps.
+//
+// buf_b_ is created CL_MEM_READ_WRITE because TRMM stores its result in B
+// and the host reads it back afterwards; a CL_MEM_READ_ONLY buffer must not
+// be written by a kernel. buf_a_ is input only and stays CL_MEM_READ_ONLY.
+//
+// NOTE: each call assigns newly created cl_mem objects to buf_a_/buf_b_
+// without releasing the previous values, and no status code is checked.
+template<>
+void
+xTrmm<cl_double>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+
+    // set up buffers
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_double),
+                                    NULL, &err);
+
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_double),
+                                    NULL, &err);
+
+    // initialize gpu buffers
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                               buffer_.offA_ * sizeof(cl_double),
+                               buffer_.lda_ * buffer_.a_num_vectors_ *
+                                   sizeof(cl_double),
+                               buffer_.a_, 0, NULL, NULL);
+
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                               buffer_.offB_ * sizeof(cl_double),
+                               buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                   sizeof(cl_double),
+                               buffer_.b_, 0, NULL, NULL);
+
+    // call_func
+    clblasDtrmm(order_, buffer_.side_, buffer_.uplo_,
+                buffer_.trans_a_, buffer_.diag_,
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                1, &queue_, 0, NULL, NULL);
+
+    // read gpu buffer; the read's event is what the timer waits on
+    err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                              buffer_.offB_ * sizeof(cl_double),
+                              buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                  sizeof(cl_double),
+                              buffer_.b_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Single-precision complex TRMM: time one clblasCtrmm call. The timer runs
+// from just before the call until the event it returns has completed.
+template<>
+void
+xTrmm<cl_float2>::
+call_func()
+{
+    const xTrmmBuffer<cl_float2>& args = buffer_;
+
+    timer.Start(timer_id);
+
+    clblasCtrmm(order_,
+                args.side_, args.uplo_, args.trans_a_, args.diag_,
+                args.m_, args.n_, args.alpha_,
+                args.buf_a_, args.offA_, args.lda_,
+                args.buf_b_, args.offB_, args.ldb_,
+                1, &queue_,   // one command queue
+                0, NULL,      // empty wait list
+                &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Timed round trip for single-precision complex TRMM: create the device
+// buffers, upload A and B, run clblasCtrmm, then read B back into b_.
+// The timer covers all of these steps.
+//
+// buf_b_ is created CL_MEM_READ_WRITE because TRMM stores its result in B
+// and the host reads it back afterwards; a CL_MEM_READ_ONLY buffer must not
+// be written by a kernel. buf_a_ is input only and stays CL_MEM_READ_ONLY.
+//
+// NOTE: each call assigns newly created cl_mem objects to buf_a_/buf_b_
+// without releasing the previous values, and no status code is checked.
+template<>
+void
+xTrmm<cl_float2>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+
+    // set up buffers
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_float2),
+                                    NULL, &err);
+
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_float2),
+                                    NULL, &err);
+
+    // initialize gpu buffers
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                               buffer_.offA_ * sizeof(cl_float2),
+                               buffer_.lda_ * buffer_.a_num_vectors_ *
+                                   sizeof(cl_float2),
+                               buffer_.a_, 0, NULL, NULL);
+
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                               buffer_.offB_ * sizeof(cl_float2),
+                               buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                   sizeof(cl_float2),
+                               buffer_.b_, 0, NULL, NULL);
+
+    // call_func
+    clblasCtrmm(order_, buffer_.side_, buffer_.uplo_,
+                buffer_.trans_a_, buffer_.diag_,
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                1, &queue_, 0, NULL, NULL);
+
+    // read gpu buffer; the read's event is what the timer waits on
+    err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                              buffer_.offB_ * sizeof(cl_float2),
+                              buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                  sizeof(cl_float2),
+                              buffer_.b_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Double-precision complex TRMM: time one clblasZtrmm call. The timer runs
+// from just before the call until the event it returns has completed.
+template<>
+void
+xTrmm<cl_double2>::
+call_func()
+{
+    const xTrmmBuffer<cl_double2>& args = buffer_;
+
+    timer.Start(timer_id);
+
+    clblasZtrmm(order_,
+                args.side_, args.uplo_, args.trans_a_, args.diag_,
+                args.m_, args.n_, args.alpha_,
+                args.buf_a_, args.offA_, args.lda_,
+                args.buf_b_, args.offB_, args.ldb_,
+                1, &queue_,   // one command queue
+                0, NULL,      // empty wait list
+                &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Timed round trip for double-precision complex TRMM: create the device
+// buffers, upload A and B, run clblasZtrmm, then read B back into b_.
+// The timer covers all of these steps.
+//
+// buf_b_ is created CL_MEM_READ_WRITE because TRMM stores its result in B
+// and the host reads it back afterwards; a CL_MEM_READ_ONLY buffer must not
+// be written by a kernel. buf_a_ is input only and stays CL_MEM_READ_ONLY.
+//
+// NOTE: each call assigns newly created cl_mem objects to buf_a_/buf_b_
+// without releasing the previous values, and no status code is checked.
+template<>
+void
+xTrmm<cl_double2>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    cl_int err;
+
+    // set up buffers
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_double2),
+                                    NULL, &err);
+
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_double2),
+                                    NULL, &err);
+
+    // initialize gpu buffers
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                               buffer_.offA_ * sizeof(cl_double2),
+                               buffer_.lda_ * buffer_.a_num_vectors_ *
+                                   sizeof(cl_double2),
+                               buffer_.a_, 0, NULL, NULL);
+
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                               buffer_.offB_ * sizeof(cl_double2),
+                               buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                   sizeof(cl_double2),
+                               buffer_.b_, 0, NULL, NULL);
+
+    // call_func
+    clblasZtrmm(order_, buffer_.side_, buffer_.uplo_,
+                buffer_.trans_a_, buffer_.diag_,
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                1, &queue_, 0, NULL, NULL);
+
+    // read gpu buffer; the read's event is what the timer waits on
+    err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                              buffer_.offB_ * sizeof(cl_double2),
+                              buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                  sizeof(cl_double2),
+                              buffer_.b_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+// Throughput figure for single-precision complex TRMM:
+//   side == clblasLeft : 4*M*(M+1)*N / time_in_ns()
+//   otherwise          : 4*M*(N+1)*N / time_in_ns()
+// The operation count is divided by a time in nanoseconds, which is why no
+// extra 1e9 scaling appears in the expression.
+template<>
+double
+xTrmm<cl_float2>::
+gflops()
+{
+    // Only the "(dim + 1)" factor depends on the side.
+    const size_t tri_dim =
+        (buffer_.side_ == clblasLeft) ? buffer_.m_ : buffer_.n_;
+
+    return 4.0*buffer_.m_*(tri_dim+1)*buffer_.n_/time_in_ns();
+}
+
+// Throughput figure for double-precision complex TRMM:
+//   side == clblasLeft : 4*M*(M+1)*N / time_in_ns()
+//   otherwise          : 4*M*(N+1)*N / time_in_ns()
+template<>
+double
+xTrmm<cl_double2>::
+gflops()
+{
+    // Only the "(dim + 1)" factor depends on the side.
+    const size_t tri_dim =
+        (buffer_.side_ == clblasLeft) ? buffer_.m_ : buffer_.n_;
+
+    return 4.0*buffer_.m_*(tri_dim+1)*buffer_.n_/time_in_ns();
+}
+
+// Text form of the expression evaluated by gflops() for cl_float2; it
+// depends on which side the triangular matrix is applied from.
+template<>
+std::string
+xTrmm<cl_float2>::
+gflops_formula()
+{
+    const bool left_side = (buffer_.side_ == clblasLeft);
+    return std::string(left_side ? "4.0*M*(M+1)*N/time"
+                                 : "4.0*M*(N+1)*N/time");
+}
+
+// Text form of the expression evaluated by gflops() for cl_double2; it
+// depends on which side the triangular matrix is applied from.
+template<>
+std::string
+xTrmm<cl_double2>::
+gflops_formula()
+{
+    const bool left_side = (buffer_.side_ == clblasLeft);
+    return std::string(left_side ? "4.0*M*(M+1)*N/time"
+                                 : "4.0*M*(N+1)*N/time");
+}
+
+
+#endif // ifndef CLBLAS_BENCHMARK_XTRMM_HXX__
diff --git a/src/client/clfunc_xtrmv.hpp b/src/client/clfunc_xtrmv.hpp
new file mode 100644
index 0000000..725e9f3
--- /dev/null
+++ b/src/client/clfunc_xtrmv.hpp
@@ -0,0 +1,427 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XTRMV_HXX__
+#define CLBLAS_BENCHMARK_XTRMV_HXX__
+
+#include "clfunc_common.hpp"
+
+// Problem description and storage handles used by xTrmv<T>.
+template <typename T>
+struct xTrmvBuffer
+{
+  size_t m_;               // problem size M as passed to setup_buffer()
+  size_t lda_;             // leading dimension of A (M when the caller passes 0)
+  size_t a_num_vectors_;   // number of lda_-long vectors in A; set equal to m_
+  clblasTranspose trans_a_;
+  clblasUplo uplo_;
+  clblasDiag diag_;
+  T* a_;                   // host copy of A, lda_ * a_num_vectors_ elements
+  T* x_;                   // host copy of x, m_ elements
+  cl_mem buf_a_;           // device buffer for A
+  cl_mem buf_x_;           // device buffer for x
+  cl_mem scratch_;         // m_-element device buffer passed as the scratch argument of clblas?trmv
+}; // struct buffer
+
+// Benchmark wrapper for the clblas{S,D,C,Z}trmv routines.
+//
+// The generic template owns the host arrays and OpenCL buffers described by
+// xTrmvBuffer<T>. call_func() is an empty placeholder here; the timed calls
+// are supplied by the per-type specializations that follow the class.
+// OpenCL status codes are reported through OPENCL_V_THROW.
+template <typename T>
+class xTrmv : public clblasFunc
+{
+public:
+  xTrmv(StatisticalTimer& timer,  cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clTrmv", 0);
+  }
+
+  ~xTrmv()
+  {
+    // a_ and x_ are allocated with new[] in setup_buffer(), so they must be
+    // released with delete[].
+    delete[] buffer_.a_;
+    delete[] buffer_.x_;
+    OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A");
+    OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_), "releasing buffer X");
+    OPENCL_V_THROW( clReleaseMemObject(buffer_.scratch_), "releasing scratch buffer");
+  }
+
+  // Placeholder; see the specializations after the class.
+  void call_func() {}
+
+  // M*M divided by the elapsed time in nanoseconds.
+  double gflops()
+  {
+    return static_cast<double>(buffer_.m_ * buffer_.m_ )/time_in_ns();
+  }
+
+  std::string gflops_formula()
+  {
+    return "M*M/time";
+  }
+
+  // Record the problem description, allocate the host arrays and create the
+  // three device buffers (A, x and scratch).
+  //
+  //   transA_option 0/1/2 -> NoTrans/Trans/ConjTrans; other values leave
+  //                 trans_a_ untouched
+  //   uplo_option   0 -> clblasUpper, otherwise clblasLower
+  //   diag_option   0 -> clblasUnit, otherwise clblasNonUnit
+  //   order_option  0 -> clblasRowMajor, otherwise clblasColumnMajor
+  //   lda           0 selects M; a non-zero value below M prints an error
+  //                 and terminates the process
+  // The remaining parameters are not read by this function, apart from
+  // alpha and beta being forwarded to initialize_scalars().
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+  {
+    initialize_scalars(alpha, beta);
+
+    buffer_.m_ = M;
+
+    if (transA_option == 0)
+    {
+      buffer_.trans_a_ = clblasNoTrans;
+    }
+    else if (transA_option == 1)
+    {
+      buffer_.trans_a_ = clblasTrans;
+    }
+    else if (transA_option == 2)
+    {
+      buffer_.trans_a_ = clblasConjTrans;
+    }
+
+    buffer_.uplo_ = (uplo_option == 0) ? clblasUpper : clblasLower;
+    buffer_.diag_ = (diag_option == 0) ? clblasUnit : clblasNonUnit;
+    order_ = (order_option == 0) ? clblasRowMajor : clblasColumnMajor;
+
+    if (lda == 0)
+    {
+      buffer_.lda_ = M;
+    }
+    else if (lda < M)
+    {
+      std::cerr << "ERROR: lda must be set to 0 or a value >= M" << std::endl;
+      // Continuing would size the allocations below from an lda_ that was
+      // never assigned.
+      exit(1);
+    }
+    else
+    {
+      buffer_.lda_ = lda;
+    }
+
+    buffer_.a_num_vectors_ = buffer_.m_;
+
+    buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+    buffer_.x_ = new T[buffer_.m_];
+
+    cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    buffer_.lda_*buffer_.a_num_vectors_*sizeof(T),
+                                    NULL, &err);
+    OPENCL_V_THROW( err, "creating buffer A");
+
+    buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    buffer_.m_*sizeof(T),
+                                    NULL, &err);
+    OPENCL_V_THROW( err, "creating buffer X");
+
+    buffer_.scratch_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    buffer_.m_*sizeof(T),
+                                    NULL, &err);
+    OPENCL_V_THROW( err, "creating scratch buffer");
+  }
+
+  // Fill x with rand()/RAND_MAX values and A with zeros except where the
+  // position inside a vector equals the vector index: those entries are 1
+  // when diag_ is clblasUnit and rand()/RAND_MAX otherwise.
+  // The generator is re-seeded with the constant 10 first.
+  void initialize_cpu_buffer()
+  {
+    srand(10);
+
+    for (size_t i = 0; i < buffer_.m_; ++i)
+    {
+      buffer_.x_[i] = static_cast<T>(rand())/static_cast<T>(RAND_MAX);
+    }
+
+    for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+    {
+      for (size_t j = 0; j < buffer_.lda_; ++j)
+      {
+        if (i == j)
+        {
+          if (buffer_.diag_ == clblasUnit)
+          {
+            buffer_.a_[i*buffer_.lda_+j] = static_cast<T>(1.0);
+          }
+          else
+          {
+            buffer_.a_[i*buffer_.lda_+j] =
+              static_cast<T>(rand())/static_cast<T>(RAND_MAX);
+          }
+        }
+        else
+        {
+          buffer_.a_[i*buffer_.lda_+j] = static_cast<T>(0.0);
+        }
+      }
+    }
+  }
+
+  // Blocking upload of the host arrays a_ and x_ to their device buffers.
+  void initialize_gpu_buffer()
+  {
+    cl_int err;
+
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE, 0,
+                               buffer_.lda_*buffer_.a_num_vectors_*sizeof(T),
+                               buffer_.a_, 0, NULL, NULL);
+    OPENCL_V_THROW( err, "writing buffer A");
+
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_x_, CL_TRUE, 0,
+                               buffer_.m_*sizeof(T),
+                               buffer_.x_, 0, NULL, NULL);
+    OPENCL_V_THROW( err, "writing buffer X");
+  }
+
+  // Blocking re-upload of x_ to buf_x_. The size argument is in bytes, so
+  // all m_ elements are m_*sizeof(T) bytes.
+  void reset_gpu_write_buffer()
+  {
+    cl_int err;
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_x_, CL_TRUE, 0,
+                               buffer_.m_*sizeof(T),
+                               buffer_.x_, 0, NULL, NULL);
+    OPENCL_V_THROW( err, "resetting buffer X");
+  }
+
+  // Not implemented yet.
+  void read_gpu_buffer()
+  {
+		//cl_int err;
+		//to-do need to fill up
+  }
+
+  // Not implemented yet.
+  void roundtrip_func()
+	{//to-do need to fill up
+	}
+
+  // Not implemented yet: no parameter is read.
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+		{}
+protected:
+  // The clblas?trmv calls used by this class take no scalar arguments, so
+  // there is nothing to store.
+  void initialize_scalars(double alpha, double beta)
+  {
+  }
+
+private:
+  xTrmvBuffer<T> buffer_;
+
+}; // class xtrmv
+
+// cl_float2 variant: intentionally empty, neither argument is read.
+template<>
+void
+xTrmv<cl_float2>::
+initialize_scalars(double /*alpha*/, double /*beta*/)
+{
+  // nothing to store
+}
+
+// cl_double2 variant: intentionally empty, neither argument is read.
+template<>
+void
+xTrmv<cl_double2>::
+initialize_scalars(double /*alpha*/, double /*beta*/)
+{
+  // nothing to store
+}
+
+// Single-precision real TRMV: time one clblasStrmv call. A and x are used
+// from offset 0 with an x increment of 1; the timer runs until the event
+// returned by the call has completed.
+template<>
+void
+xTrmv<cl_float>::
+call_func()
+{
+  const xTrmvBuffer<cl_float>& args = buffer_;
+
+  timer.Start(timer_id);
+  clblasStrmv(order_, args.uplo_, args.trans_a_, args.diag_,
+              args.m_,
+              args.buf_a_, 0, args.lda_,   // A, offset, leading dimension
+              args.buf_x_, 0, 1,           // x, offset, increment
+              args.scratch_,
+              1, &queue_, 0, NULL, &event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Double-precision real TRMV: time one clblasDtrmv call. A and x are used
+// from offset 0 with an x increment of 1; the timer runs until the event
+// returned by the call has completed.
+template<>
+void
+xTrmv<cl_double>::
+call_func()
+{
+  const xTrmvBuffer<cl_double>& args = buffer_;
+
+  timer.Start(timer_id);
+  clblasDtrmv(order_, args.uplo_, args.trans_a_, args.diag_,
+              args.m_,
+              args.buf_a_, 0, args.lda_,   // A, offset, leading dimension
+              args.buf_x_, 0, 1,           // x, offset, increment
+              args.scratch_,
+              1, &queue_, 0, NULL, &event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Single-precision complex TRMV: time one clblasCtrmv call. A and x are
+// used from offset 0 with an x increment of 1; the timer runs until the
+// event returned by the call has completed.
+template<>
+void
+xTrmv<cl_float2>::
+call_func()
+{
+  const xTrmvBuffer<cl_float2>& args = buffer_;
+
+  timer.Start(timer_id);
+  clblasCtrmv(order_, args.uplo_, args.trans_a_, args.diag_,
+              args.m_,
+              args.buf_a_, 0, args.lda_,   // A, offset, leading dimension
+              args.buf_x_, 0, 1,           // x, offset, increment
+              args.scratch_,
+              1, &queue_, 0, NULL, &event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// Double-precision complex TRMV: time one clblasZtrmv call. A and x are
+// used from offset 0 with an x increment of 1; the timer runs until the
+// event returned by the call has completed.
+template<>
+void
+xTrmv<cl_double2>::
+call_func()
+{
+  const xTrmvBuffer<cl_double2>& args = buffer_;
+
+  timer.Start(timer_id);
+  clblasZtrmv(order_, args.uplo_, args.trans_a_, args.diag_,
+              args.m_,
+              args.buf_a_, 0, args.lda_,   // A, offset, leading dimension
+              args.buf_x_, 0, 1,           // x, offset, increment
+              args.scratch_,
+              1, &queue_, 0, NULL, &event_);
+  clWaitForEvents(1, &event_);
+  timer.Stop(timer_id);
+}
+
+// cl_float2 host initialisation. After re-seeding with the constant 10:
+//  - every element of x gets rand()/RAND_MAX in s[0], then in s[1];
+//  - every element of A is (0, 0) unless its position inside the vector
+//    equals the vector index, where it is (1, 0) for clblasUnit and a pair
+//    of rand()/RAND_MAX values (s[0] first) otherwise.
+template<>
+void
+xTrmv<cl_float2>::
+initialize_cpu_buffer()
+{
+  srand(10);
+
+  const cl_float rand_max = static_cast<cl_float>(RAND_MAX);
+
+  for (size_t k = 0; k < buffer_.m_; ++k)
+  {
+    cl_float2& xk = buffer_.x_[k];
+    xk.s[0] = static_cast<cl_float>(rand())/rand_max;
+    xk.s[1] = static_cast<cl_float>(rand())/rand_max;
+  }
+
+  const bool unit_diag = (buffer_.diag_ == clblasUnit);
+
+  for (size_t vec = 0; vec < buffer_.a_num_vectors_; ++vec)
+  {
+    cl_float2* const vec_base = buffer_.a_ + vec*buffer_.lda_;
+
+    for (size_t pos = 0; pos < buffer_.lda_; ++pos)
+    {
+      cl_float2& entry = vec_base[pos];
+
+      if (pos != vec)
+      {
+        entry.s[0] = 0.0f;
+        entry.s[1] = 0.0f;
+      }
+      else if (unit_diag)
+      {
+        entry.s[0] = 1.0f;
+        entry.s[1] = 0.0f;
+      }
+      else
+      {
+        entry.s[0] = static_cast<cl_float>(rand())/rand_max;
+        entry.s[1] = static_cast<cl_float>(rand())/rand_max;
+      }
+    }
+  }
+}
+
+// cl_double2 host initialisation. After re-seeding with the constant 10:
+//  - every element of x gets rand()/RAND_MAX in s[0], then in s[1];
+//  - every element of A is (0, 0) unless its position inside the vector
+//    equals the vector index, where it is (1, 0) for clblasUnit and a pair
+//    of rand()/RAND_MAX values (s[0] first) otherwise.
+template<>
+void
+xTrmv<cl_double2>::
+initialize_cpu_buffer()
+{
+  srand(10);
+
+  const cl_double rand_max = static_cast<cl_double>(RAND_MAX);
+
+  for (size_t k = 0; k < buffer_.m_; ++k)
+  {
+    cl_double2& xk = buffer_.x_[k];
+    xk.s[0] = static_cast<cl_double>(rand())/rand_max;
+    xk.s[1] = static_cast<cl_double>(rand())/rand_max;
+  }
+
+  const bool unit_diag = (buffer_.diag_ == clblasUnit);
+
+  for (size_t vec = 0; vec < buffer_.a_num_vectors_; ++vec)
+  {
+    cl_double2* const vec_base = buffer_.a_ + vec*buffer_.lda_;
+
+    for (size_t pos = 0; pos < buffer_.lda_; ++pos)
+    {
+      cl_double2& entry = vec_base[pos];
+
+      if (pos != vec)
+      {
+        entry.s[0] = 0.0;
+        entry.s[1] = 0.0;
+      }
+      else if (unit_diag)
+      {
+        entry.s[0] = 1.0;
+        entry.s[1] = 0.0;
+      }
+      else
+      {
+        entry.s[0] = static_cast<cl_double>(rand())/rand_max;
+        entry.s[1] = static_cast<cl_double>(rand())/rand_max;
+      }
+    }
+  }
+}
+
+// cl_float2 throughput figure: 4*M*M divided by the elapsed nanoseconds.
+// The count is formed in integer arithmetic before the conversion to double.
+template<>
+double
+xTrmv<cl_float2>::
+gflops()
+{
+  const size_t op_count = 4 * buffer_.m_ * buffer_.m_;
+  return static_cast<double>(op_count)/time_in_ns();
+}
+
+// cl_double2 throughput figure: 4*M*M divided by the elapsed nanoseconds.
+// The count is formed in integer arithmetic before the conversion to double.
+template<>
+double
+xTrmv<cl_double2>::
+gflops()
+{
+  const size_t op_count = 4 * buffer_.m_ * buffer_.m_;
+  return static_cast<double>(op_count)/time_in_ns();
+}
+
+// Text form of the expression evaluated by gflops() for cl_float2.
+template<>
+std::string
+xTrmv<cl_float2>::
+gflops_formula()
+{
+  const std::string formula("4*M*M/time");
+  return formula;
+}
+
+// Text form of the expression evaluated by gflops() for cl_double2.
+template<>
+std::string
+xTrmv<cl_double2>::
+gflops_formula()
+{
+  const std::string formula("4*M*M/time");
+  return formula;
+}
+
+
+#endif // ifndef CLBLAS_BENCHMARK_XTRMV_HXX__
diff --git a/src/client/clfunc_xtrsm.hpp b/src/client/clfunc_xtrsm.hpp
new file mode 100644
index 0000000..8ae85c3
--- /dev/null
+++ b/src/client/clfunc_xtrsm.hpp
@@ -0,0 +1,785 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XTRSM_HXX__
+#define CLBLAS_BENCHMARK_XTRSM_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xTrsmBuffer // Problem description plus host/device storage used by xTrsm<T>.
+{
+    clblasOrder order_;           // matrix storage order
+    size_t m_;                    // M as passed to setup_buffer
+    size_t n_;                    // N as passed to setup_buffer
+    size_t lda_;                  // leading dimension of A
+    size_t ldb_;                  // leading dimension of B
+    size_t offA_;                 // offset into buf_a_, counted in elements of T
+    size_t offB_;                 // offset into buf_b_, counted in elements of T
+    size_t a_num_vectors_;        // M for a left-side solve, N for a right-side one
+    size_t b_num_vectors_;        // M for row-major storage, N for column-major
+    clblasTranspose trans_a_;     // transpose mode applied to A
+    clblasSide side_;             // side on which A appears
+    clblasUplo uplo_;             // upper or lower triangle of A
+    clblasDiag diag_;             // unit or non-unit diagonal
+    T* a_;                        // host copy of A, lda_ * a_num_vectors_ elements
+    T* b_;                        // host copy of B, ldb_ * b_num_vectors_ elements
+    cl_mem buf_a_;                // device buffer for A
+    cl_mem buf_b_;                // device buffer for B
+    T alpha_;                     // scalar built by makeScalar<T>(alpha)
+}; // struct buffer
+
+template <typename T>
+class xTrsm : public clblasFunc // TRSM benchmark driver; T is the element type
+{
+public:
+    xTrsm(StatisticalTimer& timer, cl_device_type devType) :
+        clblasFunc(timer, devType)
+    {
+        timer.getUniqueID("clTrsm", 0);
+    }
+    // Frees the host arrays and releases both device buffers.
+    ~xTrsm()
+    {
+        delete[] buffer_.a_; // allocated with new T[...], so delete[] is required
+        delete[] buffer_.b_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
+                        "releasing buffer B");
+    }
+    // Generic fallback; the real work is done by the per-type specializations.
+    void call_func()
+    {
+        std::cout << "xtrsm::call_func\n";
+    }
+    // Operation count divided by time_in_ns(); agrees with gflops_formula().
+    double gflops()
+    {
+        if (buffer_.side_ == clblasLeft)
+        {
+            return buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns();
+        }
+        else
+        {
+            return buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+        }
+    }
+    // Text form of the expression gflops() evaluates for the current side.
+    std::string gflops_formula()
+    {
+        if (buffer_.side_ == clblasLeft)
+        {
+            return "M*(M+1)*N/time";
+        }
+        else
+        {
+            return "M*(N+1)*N/time";
+        }
+    }
+    // Decodes the option codes, validates lda/ldb, allocates host and device buffers.
+    void setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {
+        DUMMY_ARGS_USAGE_3(transB_option, K, beta);
+        DUMMY_ARGS_USAGE_2(ldc, offCY);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.m_ = M;
+        buffer_.n_ = N;
+        buffer_.offA_ = offA;
+        buffer_.offB_ = offBX;
+
+        if (transA_option == 0)
+        {
+            buffer_.trans_a_ = clblasNoTrans;
+        }
+        else if (transA_option == 1)
+        {
+            buffer_.trans_a_ = clblasTrans;
+        }
+        else if (transA_option == 2)
+        {
+            buffer_.trans_a_ = clblasConjTrans;
+        }
+
+        if (side_option == 0)
+        {
+            buffer_.side_ = clblasLeft;
+            buffer_.a_num_vectors_ = M;
+        }
+        else
+        {
+            buffer_.side_ = clblasRight;
+            buffer_.a_num_vectors_ = N;
+        }
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+
+        if (diag_option == 0)
+        {
+            buffer_.diag_ = clblasUnit;
+        }
+        else
+        {
+            buffer_.diag_ = clblasNonUnit;
+        }
+
+        if (order_option == 0)
+        {
+            order_ = buffer_.order_ = clblasRowMajor; // keep both copies in sync
+            buffer_.b_num_vectors_ = M;
+            if (ldb == 0)
+            {
+                buffer_.ldb_ = N;
+            }
+            else
+            {
+                if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            order_ = buffer_.order_ = clblasColumnMajor; // order_ is what the clBLAS calls pass
+            buffer_.b_num_vectors_ = N;
+            if (ldb == 0)
+            {
+                buffer_.ldb_ = M;
+            }
+            else
+            {
+                if (ldb < M)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        if (lda == 0)
+        {
+            if (side_option == 0)
+            {
+                buffer_.lda_ = M;
+            }
+            else
+            {
+                buffer_.lda_ = N;
+            }
+        }
+        else
+        {
+            // An undersized lda is fatal (as for ldb above): continuing would
+            // leave buffer_.lda_ unset and size the allocations from garbage.
+            if( side_option == 0 && lda < M )
+            {
+                std::cerr << "ERROR: when side is 0, lda must be set to 0 "
+                             "or a value >= M" << std::endl;
+                exit(1);
+            }
+            else if(side_option != 0 && lda < N)
+            {
+                std::cerr << "ERROR: when side is 1, lda must be set to 0 "
+                             "or a value >= N" << std::endl;
+                exit(1);
+            }
+            else
+            {
+                buffer_.lda_ = lda;
+            }
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer A");
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // the solve writes B
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+        OPENCL_V_THROW(err, "creating buffer B");
+    }
+    // Fills host B with random values and host A as a diagonal matrix (fixed seed).
+    void initialize_cpu_buffer()
+    {
+        srand(10);
+
+        for (size_t i = 0; i < buffer_.b_num_vectors_; ++i)
+        {
+            for (size_t j = 0; j < buffer_.ldb_; ++j)
+            {
+                buffer_.b_[i*buffer_.ldb_+j] = random<T>(UPPER_BOUND<T>()) /
+                    randomScale<T>();
+            }
+        }
+
+        for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+        {
+            for (size_t j = 0; j < buffer_.lda_; ++j)
+            {
+                if (i == j)
+                {
+                    if (buffer_.diag_ == clblasUnit)
+                    {
+                        buffer_.a_[i*buffer_.lda_+j] = ONE<T>();
+                    }
+                    else
+                    {
+                        buffer_.a_[i*buffer_.lda_+j] =
+                            random<T>(UPPER_BOUND<T>()) /
+                            randomScale<T>();
+                    }
+                }
+                else
+                {
+                    buffer_.a_[i*buffer_.lda_+j] = ZERO<T>();
+                }
+            }
+        }
+    }
+    // Blocking upload of host A and B into the device buffers at their offsets.
+    void initialize_gpu_buffer()
+    {
+        cl_int err;
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer A");
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.b_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "writing buffer B");
+    }
+    // Blocking re-upload of host B into the device buffer.
+    void reset_gpu_write_buffer()
+    {
+        cl_int err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                          buffer_.offB_ * sizeof(T),
+                                          buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                              sizeof(T),
+                                          buffer_.b_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "resetting buffer B");
+    }
+    void read_gpu_buffer() // blocking read of device B back into buffer_.b_
+    {
+        cl_int err = clEnqueueReadBuffer(
+            queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T),
+            buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T),
+            buffer_.b_, 0, NULL, NULL);
+        OPENCL_V_THROW(err, "reading buffer B");
+    }
+    void roundtrip_func() // generic fallback; specialized per element type
+    {
+        std::cout << "xtrsm::roundtrip_func\n";
+    }
+    void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+    {   // Same decoding and validation as setup_buffer(), but only host arrays are allocated.
+        DUMMY_ARGS_USAGE_3(transB_option, K, beta);
+        DUMMY_ARGS_USAGE_2(ldc, offCY);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.m_ = M;
+        buffer_.n_ = N;
+        buffer_.offA_ = offA;
+        buffer_.offB_ = offBX;
+
+        if (transA_option == 0)
+        {
+            buffer_.trans_a_ = clblasNoTrans;
+        }
+        else if (transA_option == 1)
+        {
+            buffer_.trans_a_ = clblasTrans;
+        }
+        else if (transA_option == 2)
+        {
+            buffer_.trans_a_ = clblasConjTrans;
+        }
+
+        if (side_option == 0)
+        {
+            buffer_.side_ = clblasLeft;
+            buffer_.a_num_vectors_ = M;
+        }
+        else
+        {
+            buffer_.side_ = clblasRight;
+            buffer_.a_num_vectors_ = N;
+        }
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+
+        if (diag_option == 0)
+        {
+            buffer_.diag_ = clblasUnit;
+        }
+        else
+        {
+            buffer_.diag_ = clblasNonUnit;
+        }
+
+        if (order_option == 0)
+        {
+            order_ = buffer_.order_ = clblasRowMajor; // keep both copies in sync
+            buffer_.b_num_vectors_ = M;
+            if (ldb == 0)
+            {
+                buffer_.ldb_ = N;
+            }
+            else
+            {
+                if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            order_ = buffer_.order_ = clblasColumnMajor; // order_ is what the clBLAS calls pass
+            buffer_.b_num_vectors_ = N;
+            if (ldb == 0)
+            {
+                buffer_.ldb_ = M;
+            }
+            else
+            {
+                if (ldb < M)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        if (lda == 0)
+        {
+            if (side_option == 0)
+            {
+                buffer_.lda_ = M;
+            }
+            else
+            {
+                buffer_.lda_ = N;
+            }
+        }
+        else
+        {
+            // An undersized lda is fatal (as for ldb above): continuing would
+            // leave buffer_.lda_ unset and size the allocations from garbage.
+            if( side_option == 0 && lda < M )
+            {
+                std::cerr << "ERROR: when side is 0, lda must be set to 0 "
+                             "or a value >= M" << std::endl;
+                exit(1);
+            }
+            else if(side_option != 0 && lda < N)
+            {
+                std::cerr << "ERROR: when side is 1, lda must be set to 0 "
+                             "or a value >= N" << std::endl;
+                exit(1);
+            }
+            else
+            {
+                buffer_.lda_ = lda;
+            }
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+    }
+protected:
+    void initialize_scalars(double alpha, double beta)
+    {
+        DUMMY_ARG_USAGE(beta);
+        buffer_.alpha_ = makeScalar<T>(alpha);
+    }
+
+private:
+    xTrsmBuffer<T> buffer_;
+
+}; // class xtrsm
+
+template<> // Times one clblasStrsm call: start timer, enqueue, wait on event_, stop timer.
+void
+xTrsm<cl_float>::
+call_func()
+{
+    timer.Start(timer_id);
+
+    clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
+                     buffer_.trans_a_, buffer_.diag_,
+                     buffer_.m_, buffer_.n_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_); // wait for the event the call above produced before stopping the timer
+    timer.Stop(timer_id);
+}
+
+template<> // Timed round trip: create buffers, upload A and B, solve, read B back.
+void
+xTrsm<cl_float>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    // set up buffers; NOTE(review): a fresh pair is created on every call and never released here
+    cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_float),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // the solve writes B
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_float),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+    // initialize gpu buffers
+    err = clEnqueueWriteBuffer(
+        queue_, buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_float),
+        buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_float),
+        buffer_.a_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer A");
+    err = clEnqueueWriteBuffer(
+        queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_float),
+        buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float),
+        buffer_.b_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer B");
+    // call func
+    clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
+                buffer_.trans_a_, buffer_.diag_,
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                1, &queue_, 0, NULL, NULL);
+    // read gpu buffer
+    err = clEnqueueReadBuffer(
+        queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_float),
+        buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float),
+        buffer_.b_, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer B");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<> // Times one clblasDtrsm call: start timer, enqueue, wait on event_, stop timer.
+void
+xTrsm<cl_double>::
+call_func()
+{
+    timer.Start(timer_id);
+
+    clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
+                     buffer_.trans_a_, buffer_.diag_,
+                     buffer_.m_, buffer_.n_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_); // wait for the event the call above produced before stopping the timer
+    timer.Stop(timer_id);
+}
+
+template<> // Timed round trip: create buffers, upload A and B, solve, read B back.
+void
+xTrsm<cl_double>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    // set up buffers; NOTE(review): a fresh pair is created on every call and never released here
+    cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_double),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // the solve writes B
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_double),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+    // initialize gpu buffers
+    err = clEnqueueWriteBuffer(
+        queue_, buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_double),
+        buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_double),
+        buffer_.a_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer A");
+    err = clEnqueueWriteBuffer(
+        queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_double),
+        buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double),
+        buffer_.b_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer B");
+    // call func
+    clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
+                buffer_.trans_a_, buffer_.diag_,
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                1, &queue_, 0, NULL, NULL);
+    // read gpu buffer
+    err = clEnqueueReadBuffer(
+        queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_double),
+        buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double),
+        buffer_.b_, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer B");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<> // Times one clblasCtrsm call: start timer, enqueue, wait on event_, stop timer.
+void
+xTrsm<cl_float2>::
+call_func()
+{
+    timer.Start(timer_id);
+
+    clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
+                     buffer_.trans_a_, buffer_.diag_,
+                     buffer_.m_, buffer_.n_, buffer_.alpha_,
+                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                     1, &queue_, 0, NULL, &event_);
+
+  clWaitForEvents(1, &event_); // wait for the event the call above produced before stopping the timer
+  timer.Stop(timer_id);
+}
+
+template<> // Timed round trip: create buffers, upload A and B, solve, read B back.
+void
+xTrsm<cl_float2>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    // set up buffers; NOTE(review): a fresh pair is created on every call and never released here
+    cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_float2),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // the solve writes B
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_float2),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+    // initialize gpu buffers
+    err = clEnqueueWriteBuffer(
+        queue_, buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_float2),
+        buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_float2),
+        buffer_.a_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer A");
+    err = clEnqueueWriteBuffer(
+        queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_float2),
+        buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float2),
+        buffer_.b_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer B");
+    // call func
+    clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
+                buffer_.trans_a_, buffer_.diag_,
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                1, &queue_, 0, NULL, NULL);
+    // read gpu buffer
+    err = clEnqueueReadBuffer(
+        queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_float2),
+        buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float2),
+        buffer_.b_, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer B");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<> // Times one clblasZtrsm call: start timer, enqueue, wait on event_, stop timer.
+void
+xTrsm<cl_double2>::
+call_func()
+{
+  timer.Start(timer_id);
+
+  clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
+                   buffer_.trans_a_, buffer_.diag_,
+                   buffer_.m_, buffer_.n_, buffer_.alpha_,
+                   buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                   buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                   1, &queue_, 0, NULL, &event_);
+
+      clWaitForEvents(1, &event_); // wait for the event the call above produced before stopping the timer
+      timer.Stop(timer_id);
+}
+
+template<> // Timed round trip: create buffers, upload A and B, solve, read B back.
+void
+xTrsm<cl_double2>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+    // set up buffers; NOTE(review): a fresh pair is created on every call and never released here
+    cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                        buffer_.offA_) * sizeof(cl_double2),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // the solve writes B
+                                    (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                        buffer_.offB_) * sizeof(cl_double2),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer B");
+    // initialize gpu buffers
+    err = clEnqueueWriteBuffer(
+        queue_, buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_double2),
+        buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_double2),
+        buffer_.a_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer A");
+    err = clEnqueueWriteBuffer(
+        queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_double2),
+        buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double2),
+        buffer_.b_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer B");
+    // call func
+    clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
+                buffer_.trans_a_, buffer_.diag_,
+                buffer_.m_, buffer_.n_, buffer_.alpha_,
+                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
+                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                1, &queue_, 0, NULL, NULL);
+    // read gpu buffer
+    err = clEnqueueReadBuffer(
+        queue_, buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_double2),
+        buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double2),
+        buffer_.b_, 0, NULL, &event_);
+    OPENCL_V_THROW(err, "reading buffer B");
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<> // cl_float2 specialization: the side-dependent count scaled by 4.0, over time_in_ns().
+double
+xTrsm<cl_float2>::
+gflops()
+{
+    if (buffer_.side_ == clblasLeft) // left side: 4.0*M*(M+1)*N
+    {
+        return 4.0*buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns();
+    }
+    else // otherwise: 4.0*M*(N+1)*N
+    {
+        return 4.0*buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+    }
+}
+
+
+template<> // cl_double2 specialization: the side-dependent count scaled by 4.0, over time_in_ns().
+double
+xTrsm<cl_double2>::
+gflops()
+{
+    if (buffer_.side_ == clblasLeft) // left side: 4.0*M*(M+1)*N
+    {
+        return 4.0*buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns();
+    }
+    else // otherwise: 4.0*M*(N+1)*N
+    {
+        return 4.0*buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+    }
+}
+
+template<> // Text form of the expression used by the cl_float2 gflops() specialization.
+std::string
+xTrsm<cl_float2>::
+gflops_formula()
+{
+    if (buffer_.side_ == clblasLeft) // same side test as gflops()
+    {
+        return "4.0*M*(M+1)*N/time";
+    }
+    else
+    {
+        return "4.0*M*(N+1)*N/time";
+    }
+}
+
+template<> // Text form of the expression used by the cl_double2 gflops() specialization.
+std::string
+xTrsm<cl_double2>::
+gflops_formula()
+{
+    if (buffer_.side_ == clblasLeft) // same side test as gflops()
+    {
+        return "4.0*M*(M+1)*N/time";
+    }
+    else
+    {
+        return "4.0*M*(N+1)*N/time";
+    }
+}
+
+
+#endif // ifndef CLBLAS_BENCHMARK_XTRSM_HXX__
diff --git a/src/client/clfunc_xtrsv.hpp b/src/client/clfunc_xtrsv.hpp
new file mode 100644
index 0000000..f0b728a
--- /dev/null
+++ b/src/client/clfunc_xtrsv.hpp
@@ -0,0 +1,420 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XTRSV_HXX__
+#define CLBLAS_BENCHMARK_XTRSV_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xTrsvBuffer // Problem description plus host/device storage used by xTrsv<T>.
+{
+  size_t m_;                  // M as passed to setup_buffer
+  size_t lda_;                // leading dimension of A
+  size_t a_num_vectors_;      // set equal to m_ by setup_buffer
+  clblasTranspose trans_a_;   // transpose mode applied to A
+  clblasUplo uplo_;           // upper or lower triangle of A
+  clblasDiag diag_;           // unit or non-unit diagonal
+  T* a_;                      // host copy of A, lda_ * a_num_vectors_ elements
+  T* x_;                      // host copy of x, m_ elements
+  cl_mem buf_a_;              // device buffer for A
+  cl_mem buf_x_;              // device buffer for x
+}; // struct buffer
+
+template <typename T>
+class xTrsv : public clblasFunc // TRSV benchmark driver; T is the element type
+{
+public:
+  xTrsv(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clTrsv", 0);
+  }
+  // Frees the host arrays and releases both device buffers.
+  ~xTrsv()
+  {
+    delete[] buffer_.a_; // allocated with new T[...], so delete[] is required
+    delete[] buffer_.x_;
+    OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A");
+    OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_), "releasing buffer X");
+  }
+  // Generic fallback does nothing; the per-type specializations do the work.
+  void call_func() {}
+  // M*M (M = buffer_.m_) divided by time_in_ns().
+  double gflops()
+  {
+    return static_cast<double>(buffer_.m_ * buffer_.m_ )/time_in_ns();
+  }
+  // Text form of the expression gflops() evaluates.
+  std::string gflops_formula()
+  {
+    return "M*M/time";
+  }
+  // Decodes the option codes, validates lda, allocates host and device buffers.
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+  {
+    initialize_scalars(alpha, beta);
+
+    buffer_.m_ = M;
+
+    if (transA_option == 0)
+    {
+      buffer_.trans_a_ = clblasNoTrans;
+    }
+    else if (transA_option == 1)
+    {
+      buffer_.trans_a_ = clblasTrans;
+    }
+    else if (transA_option == 2)
+    {
+      buffer_.trans_a_ = clblasConjTrans;
+    }
+
+    if (uplo_option == 0)
+    {
+      buffer_.uplo_ = clblasUpper;
+    }
+    else
+    {
+      buffer_.uplo_ = clblasLower;
+    }
+
+    if (diag_option == 0)
+    {
+      buffer_.diag_ = clblasUnit;
+    }
+    else
+    {
+      buffer_.diag_ = clblasNonUnit;
+    }
+
+    if (order_option == 0)
+    {
+      order_ = clblasRowMajor;
+    }
+    else
+    {
+      order_ = clblasColumnMajor;
+    }
+
+    // An undersized lda is fatal: continuing would leave buffer_.lda_ unset.
+    if (lda == 0)
+    {
+      buffer_.lda_ = M;
+    }
+    else
+    {
+      if( lda < M )
+      {
+        std::cerr << "ERROR: lda must be set to 0 or a value >= M" << std::endl;
+        exit(1);
+      }
+      else
+      {
+        buffer_.lda_ = lda;
+      }
+    }
+
+    buffer_.a_num_vectors_ = buffer_.m_;
+
+    buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+    buffer_.x_ = new T[buffer_.m_];
+
+    cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    buffer_.lda_*buffer_.a_num_vectors_*sizeof(T),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer A");
+    buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    buffer_.m_*sizeof(T),
+                                    NULL, &err);
+    OPENCL_V_THROW(err, "creating buffer X");
+  }
+  // Fills host x with random values and host A as a diagonal matrix (fixed seed).
+  void initialize_cpu_buffer()
+  {
+    srand(10);
+
+    for (size_t i = 0; i < buffer_.m_; ++i)
+    {
+      buffer_.x_[i] = static_cast<T>(rand())/static_cast<T>(RAND_MAX);
+    }
+
+    for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+    {
+      for (size_t j = 0; j < buffer_.lda_; ++j)
+      {
+        if (i == j)
+        {
+          if (buffer_.diag_ == clblasUnit)
+          {
+            buffer_.a_[i*buffer_.lda_+j] = static_cast<T>(1.0);
+          }
+          else
+          {
+            buffer_.a_[i*buffer_.lda_+j] =
+              static_cast<T>(rand())/static_cast<T>(RAND_MAX);
+          }
+        }
+        else
+        {
+          buffer_.a_[i*buffer_.lda_+j] = static_cast<T>(0.0);
+        }
+      }
+    }
+  }
+  // Blocking upload of host A and x into the device buffers.
+  void initialize_gpu_buffer()
+  {
+    cl_int err;
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE, 0,
+                               buffer_.lda_*buffer_.a_num_vectors_*sizeof(T),
+                               buffer_.a_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer A");
+    err = clEnqueueWriteBuffer(queue_, buffer_.buf_x_, CL_TRUE, 0,
+                               buffer_.m_*sizeof(T),
+                               buffer_.x_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "writing buffer X");
+  }
+  // Blocking re-upload of host x; the size is in bytes, hence m_*sizeof(T).
+  void reset_gpu_write_buffer()
+  {
+    cl_int err = clEnqueueWriteBuffer(queue_, buffer_.buf_x_, CL_TRUE, 0,
+                                      buffer_.m_*sizeof(T),
+                                      buffer_.x_, 0, NULL, NULL);
+    OPENCL_V_THROW(err, "resetting buffer X");
+  }
+  void read_gpu_buffer()
+  {
+    //cl_int err;
+    // TODO: not implemented yet
+  }
+  void roundtrip_func()
+  { // TODO: not implemented yet
+  }
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+  {} // TODO: not implemented yet
+protected:
+  void initialize_scalars(double alpha, double beta)
+  {
+  }
+
+private:
+  xTrsvBuffer<T> buffer_;
+
+}; // class xtrsv
+
+template<> // cl_float2 specialization: no-op, neither alpha nor beta is read.
+void
+xTrsv<cl_float2>::
+initialize_scalars(double alpha, double beta)
+{
+}
+
+template<> // cl_double2 specialization: no-op, neither alpha nor beta is read.
+void
+xTrsv<cl_double2>::
+initialize_scalars(double alpha, double beta)
+{
+}
+
+template<> // Times one clblasStrsv call: start timer, enqueue, wait on event_, stop timer.
+void
+xTrsv<cl_float>::
+call_func()
+{
+  timer.Start(timer_id);
+  clblasStrsv(order_, buffer_.uplo_, buffer_.trans_a_,
+                 buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0,
+                 buffer_.lda_, buffer_.buf_x_, 0, 1, 1, &queue_, 0, NULL,
+                 &event_);
+  clWaitForEvents(1, &event_); // wait for the event the call above produced before stopping the timer
+  timer.Stop(timer_id);
+}
+
+template<> // Times one clblasDtrsv call: start timer, enqueue, wait on event_, stop timer.
+void
+xTrsv<cl_double>::
+call_func()
+{
+  timer.Start(timer_id);
+  clblasDtrsv(order_, buffer_.uplo_, buffer_.trans_a_,
+                 buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0,
+                 buffer_.lda_, buffer_.buf_x_, 0, 1, 1, &queue_, 0, NULL,
+                 &event_);
+  clWaitForEvents(1, &event_); // wait for the event the call above produced before stopping the timer
+  timer.Stop(timer_id);
+}
+
+template<> // Times one clblasCtrsv call: start timer, enqueue, wait on event_, stop timer.
+void
+xTrsv<cl_float2>::
+call_func()
+{
+  timer.Start(timer_id);
+  clblasCtrsv(order_, buffer_.uplo_, buffer_.trans_a_,
+                 buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0,
+                 buffer_.lda_, buffer_.buf_x_, 0, 1, 1, &queue_, 0, NULL,
+                 &event_);
+  clWaitForEvents(1, &event_); // wait for the event the call above produced before stopping the timer
+  timer.Stop(timer_id);
+}
+
+template<> // Times one clblasZtrsv call: start timer, enqueue, wait on event_, stop timer.
+void
+xTrsv<cl_double2>::
+call_func()
+{
+  timer.Start(timer_id);
+  clblasZtrsv(order_, buffer_.uplo_, buffer_.trans_a_,
+                 buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0,
+                 buffer_.lda_, buffer_.buf_x_, 0, 1, 1, &queue_, 0, NULL,
+                 &event_);
+  clWaitForEvents(1, &event_); // wait for the event the call above produced before stopping the timer
+  timer.Stop(timer_id);
+}
+
+template<> // cl_float2 specialization: fills host x with random values and A as a diagonal matrix.
+void
+xTrsv<cl_float2>::
+initialize_cpu_buffer()
+{
+  srand(10); // fixed seed: every run generates the same sequence
+  for (size_t i = 0; i < buffer_.m_; ++i) // both components of x[i] get rand()/RAND_MAX
+  {
+    buffer_.x_[i].s[0] =
+      static_cast<cl_float>(rand())/static_cast<cl_float>(RAND_MAX);
+    buffer_.x_[i].s[1] =
+      static_cast<cl_float>(rand())/static_cast<cl_float>(RAND_MAX);
+  }
+
+  for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+  {
+    for (size_t j = 0; j < buffer_.lda_; ++j)
+    {
+      if (i == j) // diagonal entry
+      {
+        if (buffer_.diag_ == clblasUnit) // unit diagonal requested: store (1, 0)
+        {
+          buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0f;
+          buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f;
+        }
+        else
+        {
+          buffer_.a_[i*buffer_.lda_+j].s[0] =
+            static_cast<cl_float>(rand())/static_cast<cl_float>(RAND_MAX);
+          buffer_.a_[i*buffer_.lda_+j].s[1] =
+            static_cast<cl_float>(rand())/static_cast<cl_float>(RAND_MAX);
+        }
+      }
+      else // every off-diagonal entry is (0, 0)
+      {
+        buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0f;
+        buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f;
+      }
+    }
+  }
+
+
+}
+
+template<> // cl_double2 specialization: fills host x with random values and A as a diagonal matrix.
+void
+xTrsv<cl_double2>::
+initialize_cpu_buffer()
+{
+  srand(10); // fixed seed: every run generates the same sequence
+  for (size_t i = 0; i < buffer_.m_; ++i) // both components of x[i] get rand()/RAND_MAX
+  {
+    buffer_.x_[i].s[0] =
+      static_cast<cl_double>(rand())/static_cast<cl_double>(RAND_MAX);
+    buffer_.x_[i].s[1] =
+      static_cast<cl_double>(rand())/static_cast<cl_double>(RAND_MAX);
+  }
+
+  for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+  {
+    for (size_t j = 0; j < buffer_.lda_; ++j)
+    {
+      if (i == j) // diagonal entry
+      {
+        if (buffer_.diag_ == clblasUnit) // unit diagonal requested: store (1, 0)
+        {
+          buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0;
+          buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0;
+        }
+        else
+        {
+          buffer_.a_[i*buffer_.lda_+j].s[0] =
+            static_cast<cl_double>(rand())/static_cast<cl_double>(RAND_MAX);
+          buffer_.a_[i*buffer_.lda_+j].s[1] =
+            static_cast<cl_double>(rand())/static_cast<cl_double>(RAND_MAX);
+        }
+      }
+      else // every off-diagonal entry is (0, 0)
+      {
+        buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0;
+        buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0;
+      }
+    }
+  }
+}
+
+template<> // cl_float2 specialization: 4*M*M (M = buffer_.m_) divided by time_in_ns().
+double
+xTrsv<cl_float2>::
+gflops()
+{
+  return static_cast<double>(4 * buffer_.m_ * buffer_.m_ )/time_in_ns();
+}
+
+template<> // cl_double2 specialization: 4*M*M (M = buffer_.m_) divided by time_in_ns().
+double
+xTrsv<cl_double2>::
+gflops()
+{
+  return static_cast<double>(4 * buffer_.m_ * buffer_.m_ )/time_in_ns();
+}
+
+template<> // Text form of the expression used by the cl_float2 gflops() specialization.
+std::string
+xTrsv<cl_float2>::
+gflops_formula()
+{
+  return "4*M*M/time";
+}
+
+template<> // Text form of the expression used by the cl_double2 gflops() specialization.
+std::string
+xTrsv<cl_double2>::
+gflops_formula()
+{
+  return "4*M*M/time";
+}
+
+
+#endif // ifndef CLBLAS_BENCHMARK_XTRSV_HXX__
diff --git a/src/client/client.cpp b/src/client/client.cpp
new file mode 100644
index 0000000..8f60a07
--- /dev/null
+++ b/src/client/client.cpp
@@ -0,0 +1,531 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <iostream>
+#include <clBLAS.h>
+#include <boost/program_options.hpp>
+#include "statisticalTimer.h"
+#include "clfunc_xgemm.hpp"
+#include "clfunc_xtrmm.hpp"
+#include "clfunc_xtrsm.hpp"
+#include "clfunc_xgemv.hpp"
+#include "clfunc_xsymv.hpp"
+#include "clfunc_xsyrk.hpp"
+#include "clfunc_xsyr2k.hpp"
+#include "clfunc_xtrsv.hpp"
+#include "clfunc_xtrmv.hpp"
+#include "clfunc_xtrsv.hpp"
+#include "clfunc_xger.hpp"
+#include "clfunc_xsyr.hpp"
+#include "clfunc_xsyr2.hpp"
+#include "clfunc_xgeru.hpp"
+#include "clfunc_xgerc.hpp"
+#include "clfunc_xher.hpp"
+#include "clfunc_xher2.hpp"
+#include "clfunc_xhemv.hpp"
+#include "clfunc_xhemm.hpp"
+#include "clfunc_xsymm.hpp"
+
+namespace po = boost::program_options;
+
+// Entry point of the clBLAS benchmark client.
+//
+// Flow: parse and validate the command line, pick the OpenCL device type,
+// instantiate the benchmark object matching --function / --precision, run one
+// untimed warm-up call, then run profileCount timed iterations in round-trip
+// and/or kernel-only mode (selected by --roundtrip) and print time and Gflops.
+// Timing results are only printed when profileCount > 1.
+//
+// Returns 0 on success (or after printing --help), -1 for an invalid or
+// unparsable option value, and 1 for mutually-exclusive device options or
+// when an exception is thrown while running the benchmark.
+int main(int argc, char *argv[])
+{
+  size_t M;
+  size_t N;
+  size_t K;
+  cl_double alpha;
+  cl_double beta;
+  cl_uint profileCount;
+  cl_uint commandQueueFlags = 0;
+  cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+  int order_option;
+  int transA_option;
+  int transB_option;
+  size_t lda;
+  size_t ldb;
+  size_t ldc;
+  size_t offA;
+  size_t offBX;
+  size_t offCY;
+  std::string function;
+  std::string precision;
+  std::string roundtrip;
+  int side_option;
+  int uplo_option;
+  int diag_option;
+
+  po::options_description desc( "clBLAS client command line options" );
+  desc.add_options()
+    ( "help,h", "produces this help message" )
+    ( "gpu,g", "Force instantiation of an OpenCL GPU device" )
+    ( "cpu,c", "Force instantiation of an OpenCL CPU device" )
+    ( "all,a", "Force instantiation of all OpenCL devices" )
+    ( "useimages", "Use an image-based kernel" )
+    ( "sizem,m", po::value<size_t>( &M )->default_value(128), "number of rows in A and C" )
+    ( "sizen,n", po::value<size_t>( &N )->default_value(128), "number of columns in B and C" )
+    ( "sizek,k", po::value<size_t>( &K )->default_value(128), "number of columns in A and rows in B" )
+    ( "lda", po::value<size_t>( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" )
+    ( "ldb", po::value<size_t>( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" )
+    ( "ldc", po::value<size_t>( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" )
+    ( "offA", po::value<size_t>( &offA )->default_value(0), "offset of the matrix A in memory object" )
+    ( "offBX", po::value<size_t>( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" )
+    ( "offCY", po::value<size_t>( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" )
+    ( "alpha", po::value<cl_double>( &alpha )->default_value(1.0f), "specifies the scalar alpha" )
+    ( "beta", po::value<cl_double>( &beta )->default_value(1.0f), "specifies the scalar beta" )
+    ( "order,o", po::value<int>( &order_option )->default_value(0), "0 = row major, 1 = column major" )
+    ( "transposeA", po::value<int>( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
+    ( "transposeB", po::value<int>( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
+    // The option list mirrors the names accepted by the validation below.
+    ( "function,f", po::value<std::string>( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k, trsv, trmv, ger, syr, syr2, geru, gerc, her, her2, hemv, hemm, symm" )
+    ( "precision,r", po::value<std::string>( &precision )->default_value("s"), "Options: s,d,c,z" )
+    ( "side", po::value<int>( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm
+    ( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" )  // xsymv xsyrk xsyr2k xtrsm xtrmm
+    ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
+    ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: profiling off)" )
+    ( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"), "calculate the time for round trips. Options: roundtrip, noroundtrip, both" )
+    ;
+
+  // Parsing throws on unknown options or malformed values; report the problem
+  // together with the usage text instead of letting the exception escape main().
+  po::variables_map vm;
+  try
+  {
+    po::store( po::parse_command_line( argc, argv, desc ), vm );
+    po::notify( vm );
+  }
+  catch( std::exception& exc )
+  {
+    std::cerr << exc.what( ) << std::endl;
+    std::cerr << desc << std::endl;
+    return -1;
+  }
+
+  if( vm.count( "help" ) )
+  {
+    std::cout << desc << std::endl;
+    return 0;
+  }
+
+  if( function != "gemm"
+      && function != "trsm"
+      && function != "trmm"
+      && function != "gemv"
+      && function != "symv"
+      && function != "syrk"
+      && function != "syr2k"
+      && function != "trsv"
+      && function != "trmv"
+      && function != "ger"
+      && function != "syr"
+      && function != "syr2"
+      && function != "geru"
+      && function != "gerc"
+      && function != "her"
+      && function != "her2"
+      && function != "hemv"
+      && function != "hemm"
+      && function != "symm"
+      )
+  {
+    std::cerr << "Invalid value for --function" << std::endl;
+    return -1;
+  }
+
+  if( precision != "s" && precision != "d" && precision != "c" && precision != "z" )
+  {
+    std::cerr << "Invalid value for --precision" << std::endl;
+    return -1;
+  }
+
+  // Without this check an unrecognised value would skip both timing sections
+  // below and the program would exit successfully without measuring anything.
+  if( roundtrip != "roundtrip" && roundtrip != "noroundtrip" && roundtrip != "both" )
+  {
+    std::cerr << "Invalid value for --roundtrip" << std::endl;
+    return -1;
+  }
+
+  // One bit per device switch; more than one bit set means conflicting options.
+  size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0)
+    | ((vm.count( "cpu" ) > 0) ? 2 : 0)
+    | ((vm.count( "all" ) > 0) ? 4 : 0);
+  if((mutex & (mutex-1)) != 0) {
+    std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl;
+    if (vm.count ( "gpu" )  > 0) std::cerr << "    gpu,g   Force instantiation of an OpenCL GPU device" << std::endl;
+    if (vm.count ( "cpu" )  > 0) std::cerr << "    cpu,c   Force instantiation of an OpenCL CPU device" << std::endl;
+    if (vm.count ( "all" )  > 0) std::cerr << "    all,a   Force instantiation of all OpenCL devices" << std::endl;
+    return 1;
+  }
+
+  if( vm.count( "gpu" ) )
+  {
+    deviceType = CL_DEVICE_TYPE_GPU;
+  }
+
+  if( vm.count( "cpu" ) )
+  {
+    deviceType = CL_DEVICE_TYPE_CPU;
+  }
+
+  if( vm.count( "all" ) )
+  {
+    deviceType = CL_DEVICE_TYPE_ALL;
+  }
+
+  if( profileCount > 1 )
+  {
+    commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE;
+  }
+
+  // The switch is parsed but nothing in this function consumes it yet.
+  const bool useimages = vm.count( "useimages" ) > 0;
+  (void)useimages;
+
+  StatisticalTimer& timer = StatisticalTimer::getInstance( );
+  timer.Reserve( 3, profileCount );
+  timer.setNormalize( true );
+
+  // Instantiate the benchmark for the requested routine and precision.
+  // Routines that do not exist for a given precision fall into the else branch.
+  clblasFunc *my_function = NULL;
+  if (function == "gemm")
+  {
+    if (precision == "s")
+      my_function = new xGemm<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xGemm<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xGemm<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xGemm<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown gemm function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "trsm")
+  {
+    if (precision == "s")
+      my_function = new xTrsm<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xTrsm<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xTrsm<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xTrsm<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown trsm function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "trmm")
+  {
+    if (precision == "s")
+      my_function = new xTrmm<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xTrmm<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xTrmm<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xTrmm<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown trmm function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "gemv")
+  {
+    if (precision == "s")
+      my_function = new xGemv<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xGemv<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xGemv<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xGemv<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown gemv function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "symv")
+  {
+    if (precision == "s")
+      my_function = new xSymv<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xSymv<cl_double>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown symv function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "syrk")
+  {
+    if (precision == "s")
+      my_function = new xSyrk<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xSyrk<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xSyrk<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xSyrk<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown syrk function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "syr2k")
+  {
+    if (precision == "s")
+      my_function = new xSyr2k<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xSyr2k<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xSyr2k<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xSyr2k<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown syr2k function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "trsv")
+  {
+    if (precision == "s")
+      my_function = new xTrsv<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xTrsv<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xTrsv<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xTrsv<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown trsv function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "trmv")
+  {
+    if (precision == "s")
+      my_function = new xTrmv<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xTrmv<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xTrmv<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xTrmv<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown trmv function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "ger")
+  {
+    if (precision == "s")
+      my_function = new xGer<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xGer<cl_double>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown ger function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "syr")
+  {
+    if (precision == "s")
+      my_function = new xSyr<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xSyr<cl_double>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown syr function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "syr2")
+  {
+    if (precision == "s")
+      my_function = new xSyr2<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xSyr2<cl_double>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown syr2 function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "geru")
+  {
+    if (precision == "c")
+      my_function = new xGeru<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xGeru<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown geru function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "gerc")
+  {
+    if (precision == "c")
+      my_function = new xGerc<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xGerc<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown gerc function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "her")
+  {
+    if (precision == "c")
+      my_function = new xHer<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xHer<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown her function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "her2")
+  {
+    if (precision == "c")
+      my_function = new xHer2<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xHer2<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown her2 function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "hemv")
+  {
+    if (precision == "c")
+      my_function = new xHemv<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xHemv<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown hemv function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "hemm")
+  {
+    if (precision == "c")
+      my_function = new xHemm<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xHemm<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown hemm function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "symm")
+  {
+    if (precision == "s")
+      my_function = new xSymm<cl_float>(timer, deviceType);
+    else if (precision == "d")
+      my_function = new xSymm<cl_double>(timer, deviceType);
+    else if (precision == "c")
+      my_function = new xSymm<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xSymm<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown symm function" << std::endl;
+      return -1;
+    }
+  }
+
+  // Defensive: keeps the dispatch above and the name validation from drifting apart.
+  if (my_function == NULL)
+  {
+    std::cerr << "Invalid value for --function" << std::endl;
+    return -1;
+  }
+
+  // The warm-up and the timed loops share one handler so that an exception
+  // thrown in any of them is reported instead of terminating the process.
+  try
+  {
+    my_function->setup_buffer( order_option, side_option, uplo_option,
+                               diag_option, transA_option, transB_option,
+                               M, N, K, lda, ldb, ldc, offA, offBX, offCY,
+                               alpha, beta );
+
+    my_function->initialize_cpu_buffer();
+    my_function->initialize_gpu_buffer();
+
+    my_function->call_func(); // do a calculation first to get any compilation out of the way
+    my_function->reset_gpu_write_buffer(); // reset GPU write buffer
+
+    if( roundtrip == "roundtrip" || roundtrip == "both" )
+    {
+      timer.Reset();
+      for( cl_uint i = 0; i < profileCount; ++i )
+      {
+        my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option,
+                                             diag_option, transA_option, transB_option,
+                                             M, N, K, lda, ldb, ldc, offA, offBX, offCY,
+                                             alpha, beta );
+
+        my_function->initialize_cpu_buffer();
+        my_function->roundtrip_func();
+        my_function->reset_gpu_write_buffer();
+      }
+
+      if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
+      {
+        timer.pruneOutliers( 3.0 );
+        std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl;
+        std::cout << "BLAS (round trip) execution Gflops < " <<
+          my_function->gflops_formula() << " >: " << my_function->gflops() <<
+          std::endl;
+      }
+    }
+
+    if( roundtrip == "noroundtrip" || roundtrip == "both" )
+    {
+      timer.Reset();
+      for( cl_uint i = 0; i < profileCount; ++i )
+      {
+        my_function->setup_buffer( order_option, side_option, uplo_option,
+                                   diag_option, transA_option, transB_option,
+                                   M, N, K, lda, ldb, ldc, offA, offBX, offCY,
+                                   alpha, beta );
+
+        my_function->initialize_cpu_buffer();
+        my_function->initialize_gpu_buffer();
+        my_function->call_func();
+        my_function->read_gpu_buffer();
+        my_function->reset_gpu_write_buffer();
+      }
+
+      if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
+      {
+        timer.pruneOutliers( 3.0 );
+        std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() << std::endl;
+        std::cout << "BLAS kernel execution Gflops < " <<
+          my_function->gflops_formula() << " >: " << my_function->gflops() <<
+          std::endl;
+      }
+    }
+  }
+  catch( std::exception& exc )
+  {
+    std::cerr << exc.what( ) << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
diff --git a/src/client/ctimer.h b/src/client/ctimer.h
new file mode 100644
index 0000000..6a55d75
--- /dev/null
+++ b/src/client/ctimer.h
@@ -0,0 +1,42 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef C_TIMER_HXX__
+#define C_TIMER_HXX__
+
+/*
+ * C-callable interface to a timer object.
+ *
+ * Timer is an opaque handle: a pointer to "class timer" when this header is
+ * compiled as C++ and a pointer to "struct timer" when it is compiled as C.
+ * The functions are only declared here, with C linkage; their definitions are
+ * not part of this header.
+ */
+#if defined(__cplusplus)
+typedef class timer *Timer;
+#else
+typedef struct timer *Timer;
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * CreateTimer is declared with (void): in C an empty parameter list declares a
+ * function with unspecified arguments rather than one that takes none.
+ * NOTE(review): presumably every handle returned by CreateTimer() must be
+ * released with DeleteTimer() - confirm against the implementation.
+ */
+extern Timer CreateTimer(void);
+extern void DeleteTimer(Timer timer);
+extern double GetTime(Timer timer);
+extern void PauseTimer(Timer timer);
+extern void RestartTimer(Timer timer);
+extern void ResetTimer(Timer timer);
+extern void ResetDelayTimer(Timer timer, double delay_time);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // ifndef C_TIMER_HXX__
diff --git a/src/client/makefile b/src/client/makefile
new file mode 100644
index 0000000..2eeb288
--- /dev/null
+++ b/src/client/makefile
@@ -0,0 +1,14 @@
+# Stand-alone makefile for the benchmark client: compiles each .cpp into an
+# object file and links the clblas_client executable.
+SHELL		=	/bin/bash
+CXX		=	g++
+CXXFLAGS	=	-O3 -fomit-frame-pointer -finline-functions -I../include -I../tests/include
+LIBS		=	-lclblas -lOpenCL -lboost_program_options -lrt
+.PHONY: clean
+
+# Pattern rule: build foo.o from foo.cpp.
+%.o:%.cpp
+	${CXX} ${CXXFLAGS} $< -c
+
+# NOTE(review): this needs clblas_client.cpp and timer.cpp to exist for the
+# pattern rule above; the client's main() in this commit lives in client.cpp,
+# so confirm the prerequisite names.
+clblas_client: clblas_client.o statisticalTimer.o timer.o
+	${CXX} ${CXXFLAGS} $^ ${LIBS} -o $@ 
+
+# Removes the object files only; the linked executable is left in place.
+clean:
+	rm -rf *.o
diff --git a/src/client/statisticalTimer.cpp b/src/client/statisticalTimer.cpp
new file mode 100644
index 0000000..a2ee941
--- /dev/null
+++ b/src/client/statisticalTimer.cpp
@@ -0,0 +1,341 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// StatTimer.cpp : Defines the exported functions for the DLL application.
+//
+
+#include "stdafx.h"
+#include <cassert>
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+#include <string>
+#include "statisticalTimer.h"
+
+#if defined( __GNUC__ )
+	#include <sys/time.h>
+#endif
+
+//	Functor that adds every value it is called with to the running total 'acc'.
+//	Meant to be passed to std::for_each, which hands the functor (and with it the
+//	total) back to the caller by value.
+//	The argument_type/result_type typedefs are spelled out by hand instead of being
+//	inherited from std::unary_function, which is deprecated in C++11 and removed in C++17.
+template< typename T >
+struct Accumulator
+{
+	typedef T		argument_type;
+	typedef void	result_type;
+
+	//	Running total of all values seen so far
+	T acc;
+
+	Accumulator( ): acc( 0 ) {}
+	void operator( )(T x) { acc += x; }
+};
+
+//	Unary predicate for std::remove_if(): answers true for every value that lies
+//	outside the closed interval [ mean-stdev, mean+stdev ].
+//	RangeType is expected to be a floating point type and ValType an integer type.
+template< typename RangeType, typename ValType >
+struct PruneRange
+{
+	RangeType lower, upper;
+
+	PruneRange( RangeType mean, RangeType stdev ): lower( mean-stdev ), upper( mean+stdev ) {}
+
+	bool operator( )( ValType val )
+	{
+		//	Compare in RangeType (floating, signed) so that an unsigned ValType
+		//	cannot turn a negative bound into a huge positive number
+		const RangeType	sample	= static_cast< RangeType >( val );
+
+		return ( sample < lower ) || ( sample > upper );
+	}
+};
+
+//	Returns the single shared StatisticalTimer, constructed on first use.
+StatisticalTimer&
+StatisticalTimer::getInstance( )
+{
+	static	StatisticalTimer	theInstance;
+
+	return	theInstance;
+}
+
+//	Starts with no reserved events or samples and with normalization enabled,
+//	and records how many clock ticks make up one second on this platform.
+StatisticalTimer::StatisticalTimer( )
+	: nEvents( 0 )
+	, nSamples( 0 )
+	, normalize( true )
+{
+#if !defined( _WIN32 )
+	//	Start( )/Stop( ) build their timestamps from gettimeofday( ), i.e. in microseconds
+	clkFrequency = 1000000;
+#else
+	//	OS call to get ticks per second
+	::QueryPerformanceFrequency( reinterpret_cast<LARGE_INTEGER*>( &clkFrequency ) );
+#endif
+}
+
+//	Nothing to release explicitly; the member containers clean up after themselves.
+StatisticalTimer::~StatisticalTimer( )
+{
+}
+
+//	Drops every recorded sample, every pending start stamp and every registered label.
+void
+StatisticalTimer::Clear( )
+{
+	clkTicks.clear( );
+	clkStart.clear( );
+	labelID.clear( );
+}
+
+//	Throws away all samples and start stamps and re-creates the per-event storage
+//	with the sizes remembered from Reserve( ). Registered labels are kept.
+//	Throws std::runtime_error when Reserve( ) has not been called yet.
+void
+StatisticalTimer::Reset( )
+{
+	const bool	reserved	= ( nEvents != 0 ) && ( nSamples != 0 );
+	if( !reserved )
+		throw	std::runtime_error( "StatisticalTimer::Reserve( ) was not called before Reset( )" );
+
+	clkTicks.clear( );
+	clkStart.clear( );
+
+	clkTicks.resize( nEvents );
+	clkStart.resize( nEvents );
+
+	for( std::vector< clkVector >::iterator event = clkTicks.begin( ); event != clkTicks.end( ); ++event )
+	{
+		event->reserve( nSamples );
+	}
+}
+
+//	The caller can pre-allocate memory, to improve performance.
+//	nEvents is an approximate value for how many separate events the caller thinks
+//	they will need, and nSamples is a hint on how many samples we think we will take
+//	per event. Both values are clamped to at least 1; any previously recorded
+//	labels and samples are discarded.
+void
+StatisticalTimer::Reserve( unsigned int nEvents, unsigned int nSamples )
+{
+	this->nEvents	= std::max<unsigned int> (1, nEvents);
+	this->nSamples	= std::max<unsigned int> (1, nSamples);
+
+	Clear( );
+
+	//	Size the containers from the clamped members, not from the raw parameters
+	//	that shadow them; otherwise Reserve( 0, n ) would remember one event but
+	//	allocate storage for none, and Reset( ) would later disagree with Reserve( ).
+	labelID.reserve( this->nEvents );
+
+	clkStart.resize( this->nEvents );
+	clkTicks.resize( this->nEvents );
+
+	for( unsigned int i = 0; i < this->nEvents; ++i )
+	{
+		clkTicks.at( i ).reserve( this->nSamples );
+	}
+}
+
+//	Selects whether getAverageTime( ) and getMinimumTime( ) divide tick counts by
+//	the clock frequency (true) or report raw ticks (false).
+void
+StatisticalTimer::setNormalize( bool norm )
+{
+	this->normalize = norm;
+}
+
+//	Stores the current clock reading as the start stamp of event 'id'.
+//	clkStart.at( ) throws std::out_of_range for an id without reserved storage.
+void
+StatisticalTimer::Start( sTimerID id )
+{
+#if !defined( _WIN32 )
+	struct timeval now;
+	gettimeofday( &now, 0 );
+
+	//	Fold seconds and microseconds into one microsecond count
+	const unsigned long long	seconds	= static_cast< unsigned long long >( now.tv_sec );
+	const unsigned long long	micros	= static_cast< unsigned long long >( now.tv_usec );
+	clkStart.at( id ) = seconds * 1000000 + micros;
+#else
+	::QueryPerformanceCounter( reinterpret_cast<LARGE_INTEGER*>( &clkStart.at( id ) ) );
+#endif
+}
+
+//	Reads the clock, records the ticks elapsed since the matching Start( id ) as a
+//	new sample of event 'id', and zeroes the start stamp.
+void
+StatisticalTimer::Stop( sTimerID id )
+{
+	unsigned long long now;
+
+#if !defined( _WIN32 )
+	struct timeval tv;
+	gettimeofday( &tv, 0 );
+	now = static_cast< unsigned long long >( tv.tv_sec ) * 1000000 + static_cast< unsigned long long >( tv.tv_usec );
+#else
+	::QueryPerformanceCounter( reinterpret_cast<LARGE_INTEGER*>( &now ) );
+#endif
+
+	const unsigned long long	elapsed	= now - clkStart.at( id );
+
+	clkStart.at( id )	= 0;
+	AddSample( id, elapsed );
+}
+
+//	Appends the tick count 'n' to the sample history of event 'id'.
+void
+StatisticalTimer::AddSample( const sTimerID id, const unsigned long long n )
+{
+	clkVector&	samples	= clkTicks.at( id );
+
+	samples.push_back( n );
+}
+
+//	This function's purpose is to provide a mapping from a 'friendly' human readable text string
+//	to an index into internal data structures. A ( label, groupID ) pair that was seen
+//	before yields the same index again; a new pair is appended and gets the next index.
+StatisticalTimer::sTimerID
+StatisticalTimer::getUniqueID( const std::string& label, unsigned int groupID )
+{
+	//	I expect labelID will hardly ever grow beyond 30, so it's not of any use
+	//	to keep this sorted and do a binary search
+
+	const labelPair	sItem	= std::make_pair( label, groupID );
+
+	stringVector::iterator	iter	= std::find( labelID.begin(), labelID.end(), sItem );
+
+	if( iter != labelID.end( ) )
+		return	std::distance( labelID.begin( ), iter );
+
+	labelID.push_back( sItem );
+
+	//	The event count passed to Reserve( ) is only an estimate. Grow the per-event
+	//	storage when more labels are registered than were reserved, so that the index
+	//	returned here is always valid for Start( ), Stop( ) and the statistics, and
+	//	remember the new count so that Reset( ) keeps room for every label.
+	if( labelID.size( ) > clkTicks.size( ) )
+	{
+		nEvents	= static_cast< unsigned int >( labelID.size( ) );
+
+		clkStart.resize( labelID.size( ) );
+		clkTicks.resize( labelID.size( ) );
+		clkTicks.back( ).reserve( nSamples );
+	}
+
+	return	labelID.size( ) - 1;
+
+}
+
+//	Arithmetic mean, in clock ticks, of the samples recorded for event 'id'.
+//	Returns 0 when nothing has been reserved or when the event has no samples yet.
+double
+StatisticalTimer::getMean( sTimerID id ) const
+{
+	if( clkTicks.empty( ) )
+		return	0;
+
+	const clkVector&	samples	= clkTicks.at( id );
+
+	//	Without this guard an event with no samples would evaluate 0.0 / 0 and
+	//	hand a NaN to every statistic derived from the mean
+	if( samples.empty( ) )
+		return	0;
+
+	Accumulator<unsigned long long> sum = std::for_each( samples.begin(), samples.end(), Accumulator<unsigned long long>() );
+
+	return	static_cast<double>( sum.acc ) / samples.size( );
+}
+
+//	Population variance (divides by N, not N-1), in squared clock ticks, of the
+//	samples recorded for event 'id'.
+//	Returns 0 when nothing has been reserved or when the event has no samples yet.
+double
+StatisticalTimer::getVariance( sTimerID id ) const
+{
+	if( clkTicks.empty( ) )
+		return	0;
+
+	const clkVector&	samples	= clkTicks.at( id );
+
+	//	Avoid the 0.0 / 0 division (NaN) for an event that has no samples
+	if( samples.empty( ) )
+		return	0;
+
+	const double	mean	= getMean( id );
+	double	sum	= 0;
+
+	//	Index with the container's own size type so the comparison is not mixed-width
+	for( clkVector::size_type i = 0; i < samples.size( ); ++i )
+	{
+		const double	diff	= samples[ i ] - mean;
+		sum		+= diff * diff;
+	}
+
+	return	 sum / samples.size( );
+}
+
+//	Standard deviation, in clock ticks, of the samples recorded for event 'id':
+//	the square root of getVariance( id ).
+double
+StatisticalTimer::getStdDev( sTimerID id ) const
+{
+	const double	variance	= getVariance( id );
+
+	//	Qualified call backed by <cmath>, rather than relying on an unqualified
+	//	sqrt being visible through some other header
+	return	std::sqrt( variance );
+}
+
+//	Mean of the samples of event 'id': divided by the clock frequency when
+//	normalization is enabled, in raw ticks otherwise.
+double
+StatisticalTimer::getAverageTime( sTimerID id ) const
+{
+	const double	meanTicks	= getMean( id );
+
+	return	normalize ? meanTicks / clkFrequency : meanTicks;
+}
+
+//	Smallest sample of event 'id': divided by the clock frequency when normalization
+//	is enabled, in raw ticks otherwise. Returns 0 when the event has no samples.
+double
+StatisticalTimer::getMinimumTime( sTimerID id ) const
+{
+	const clkVector&	samples	= clkTicks.at( id );
+
+	if( samples.empty( ) )
+		return	0;
+
+	const double	fewestTicks	= static_cast<double>( *std::min_element( samples.begin( ), samples.end( ) ) );
+
+	return	normalize ? fewestTicks / clkFrequency : fewestTicks;
+}
+
+//	Removes from event 'id' every sample further than multiple standard deviations
+//	away from the mean, and returns how many samples were removed.
+unsigned int
+StatisticalTimer::pruneOutliers( sTimerID id , double multiple )
+{
+	if( clkTicks.empty( ) )
+		return	0;
+
+	const double	mean	= getMean( id );
+	const double	stdDev	= getStdDev( id );
+
+	clkVector&	clks = clkTicks.at( id );
+	const clkVector::size_type	before	= clks.size( );
+
+	//	Erase-remove: std::remove_if only compacts the kept elements to the front and
+	//	returns the new logical end; erase( ) then drops the leftover tail
+	PruneRange< double,unsigned long long >	isOutlier( mean, multiple*stdDev );
+	clks.erase( std::remove_if( clks.begin( ), clks.end( ), isOutlier ), clks.end( ) );
+
+	const clkVector::size_type	removed	= before - clks.size( );
+
+	assert( removed < std::numeric_limits< unsigned int >::max( ) );
+
+	return static_cast< unsigned int >( removed );
+}
+
+//	Prunes the outliers of every registered label, logging the per-label count to
+//	std::clog, and returns the total number of samples removed.
+unsigned int
+StatisticalTimer::pruneOutliers( double multiple )
+{
+	unsigned int	total	= 0;
+	unsigned int	label	= 0;
+
+	while( label < labelID.size( ) )
+	{
+		const unsigned int	pruned	= pruneOutliers( label , multiple );
+
+		std::clog << "\tStatisticalTimer:: Pruning " << pruned << " samples from " << labelID[label].first << std::endl;
+
+		total += pruned;
+		++label;
+	}
+
+	return	total;
+}
+
+//	Output print operator: for every registered label writes the label and group,
+//	the minimum, mean and standard deviation in ticks, the average and minimum time,
+//	and then the raw samples as a comma separated list.
+//	The stream's format flags are restored before returning.
+std::ostream&
+operator<<( std::ostream& os, const StatisticalTimer& st )
+{
+	if( st.clkTicks.empty( ) )
+		return	os;
+
+	const std::ios::fmtflags	savedFlags	= os.flags( );
+
+	for( unsigned int l = 0; l < st.labelID.size( ); ++l )
+	{
+		const StatisticalTimer::clkVector&	samples	= st.clkTicks.at( l );
+
+		//	Stays 0 when the label has no samples
+		unsigned long long	fewestTicks	= 0;
+		if( !samples.empty( ) )
+			fewestTicks	= *std::min_element( samples.begin( ), samples.end( ) );
+
+		os << st.labelID[l].first << ", " << st.labelID[l].second << std::fixed << std::endl;
+		os << "Min:," << fewestTicks << std::endl;
+		os << "Mean:," << st.getMean( l ) << std::endl;
+		os << "StdDev:," << st.getStdDev( l ) << std::endl;
+		os << "AvgTime:," << st.getAverageTime( l ) << std::endl;
+		os << "MinTime:," << st.getMinimumTime( l ) << std::endl;
+
+		for( StatisticalTimer::clkVector::const_iterator tick = samples.begin( ); tick != samples.end( ); ++tick )
+		{
+			os << *tick << ",";
+		}
+		os << "\n" << std::endl;
+	}
+
+	os.flags( savedFlags );
+
+	return	os;
+}
diff --git a/src/client/statisticalTimer.h b/src/client/statisticalTimer.h
new file mode 100644
index 0000000..bba8f8e
--- /dev/null
+++ b/src/client/statisticalTimer.h
@@ -0,0 +1,170 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+#ifndef _STATISTICALTIMER_H_
+#define _STATISTICALTIMER_H_
+#include <iosfwd>
+#include <vector>
+#include <algorithm>
+
+/**
+ * \file statisticalTimer.h
+ * \brief A timer class that provides a cross platform timer for use
+ * in timing code progress with a high degree of accuracy.
+ *	This header only declares the class; its member functions are defined out of line,
+ *	so the corresponding source file must be compiled alongside it.
+ */
+
+/**
+ * \class StatisticalTimer
+ * \brief Counter that provides a fairly accurate timing mechanism for both
+ * windows and linux. This timer is used extensively in all the samples.
+ */
+
+//	Singleton that records clock tick samples per labelled event and reports statistics on them.
+//	Construction, destruction, copy and assignment are private; use getInstance( ) to reach it.
+class StatisticalTimer
+{
+	//	Private typedefs
+	typedef std::vector< unsigned long long > clkVector;
+	typedef	std::pair< std::string, unsigned int > labelPair;
+	typedef	std::vector< labelPair > stringVector;
+
+	//	In order to calculate statistics <std. dev.>, we need to keep a history of our timings
+	//	labelID holds one ( label, group id ) pair per event; clkTicks holds one sample vector per event
+	stringVector	labelID;
+	clkVector	clkStart;
+	std::vector< clkVector >	clkTicks;
+
+	//	How many clockticks in a second
+	unsigned long long	clkFrequency;
+
+	//	Saved sizes for our vectors, used in Reset() to reallocate vectors
+	clkVector::size_type	nEvents, nSamples;
+
+	//	This setting controls whether the Timer should convert samples into time by dividing by the
+	//	clock frequency
+	bool normalize;
+
+	/**
+	 * \fn StatisticalTimer()
+	 * \brief Constructor for StatisticalTimer that initializes the class
+	 *	This is private so that user code cannot create their own instantiation.  Instead, you
+	 *	must go through getInstance( ) to get a reference to the class.
+	 */
+	StatisticalTimer( );
+
+	/**
+	 * \fn ~StatisticalTimer()
+	 * \brief Destructor for StatisticalTimer that cleans up the class
+	 */
+	~StatisticalTimer( );
+
+	/**
+	 * \fn StatisticalTimer(const StatisticalTimer& )
+	 * \brief Copy constructors do not make sense for a singleton, disallow copies
+	 */
+	StatisticalTimer( const StatisticalTimer& );
+
+	/**
+	 * \fn operator=( const StatisticalTimer& )
+	 * \brief Assignment operator does not make sense for a singleton, disallow assignments
+	 */
+	StatisticalTimer& operator=( const StatisticalTimer& );
+
+	//	The stream operator reads the private label and sample vectors directly
+	friend std::ostream& operator<<( std::ostream& os, const StatisticalTimer& s );
+
+public:
+	//	Public typedefs
+	typedef stringVector::difference_type sTimerID;
+
+	/**
+	 * \fn getInstance()
+	 * \brief This returns a reference to the singleton timer.  Guarantees only 1 timer class is ever
+	 *	instantiated within a compilable executable.
+	 */
+	static StatisticalTimer& getInstance( );
+
+	/**
+	 * \fn void Start( sTimerID id )
+	 * \brief Start the timer
+	 * \sa Stop(), Reset()
+	 */
+	void Start( sTimerID id );
+
+	/**
+	 * \fn void Stop( sTimerID id )
+	 * \brief Stop the timer
+	 * \sa Start(), Reset()
+	 */
+	void Stop( sTimerID id );
+
+	/**
+	 * \fn void AddSample( const sTimerID id, const unsigned long long n )
+	 * \brief Explicitly add a timing sample into the class
+	 */
+	void AddSample( const sTimerID id, const unsigned long long n );
+
+	/**
+	 * \fn void Clear(void)
+	 * \brief NOTE(review): previously documented with the text of Reset(); the implementation is
+	 *	not visible in this header, presumably it discards the recorded data - TODO confirm
+	 * \sa Reset()
+	 */
+	void Clear( );
+
+	/**
+	 * \fn void Reset(void)
+	 * \brief Reset the timer to 0
+	 * \sa Start(), Stop()
+	 */
+	void Reset( );
+
+	//	NOTE(review): presumably records the sizes kept in the nEvents/nSamples members and
+	//	pre-allocates storage accordingly - confirm against the implementation
+	void Reserve( unsigned int nEvents, unsigned int nSamples );
+
+	//	NOTE(review): presumably returns the id to pass to Start( )/Stop( ) for the given
+	//	label and group - confirm against the implementation
+	sTimerID getUniqueID( const std::string& label, unsigned int groupID );
+
+	//	Set the 'normalize' flag: whether samples are converted into time by dividing by the
+	//	clock frequency
+	void	setNormalize( bool norm );
+
+	//	Calculate the average/mean of data for a given event
+	double	getMean( sTimerID id ) const;
+
+	//	Calculate the variance of data for a given event
+	//	Variance - average of the squared differences between data points and the mean
+	double	getVariance( sTimerID id ) const;
+
+	//	Sqrt of variance, also in units of the original data
+	double	getStdDev( sTimerID id ) const;
+
+	/**
+	 * \fn double getAverageTime(sTimerID id) const
+	 * \return Return the arithmetic mean of all the samples that have been saved
+	 */
+	double getAverageTime( sTimerID id ) const;
+
+	/**
+	 * \fn double getMinimumTime(sTimerID id) const
+	 * \return Return the arithmetic min of all the samples that have been saved
+	 */
+	double getMinimumTime( sTimerID id ) const;
+
+	//	Using the stdDev of the entire population (of an id), eliminate those samples that fall
+	//	outside some specified multiple of the stdDev.  This assumes that the population
+	//	form a gaussian curve.
+	//	Both overloads return the number of samples removed; the one without an id processes
+	//	every label and returns the total.
+	unsigned int	pruneOutliers( double multiple );
+	unsigned int	pruneOutliers( sTimerID id , double multiple );
+};
+
+#endif // _STATISTICALTIMER_H_
diff --git a/src/client/stdafx.cpp b/src/client/stdafx.cpp
new file mode 100644
index 0000000..5f5f9a1
--- /dev/null
+++ b/src/client/stdafx.cpp
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// stdafx.cpp : source file that includes just the standard includes
+// clAmdFft.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
diff --git a/src/client/stdafx.h b/src/client/stdafx.h
new file mode 100644
index 0000000..9e1e80f
--- /dev/null
+++ b/src/client/stdafx.h
@@ -0,0 +1,40 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+//
+
+#pragma once
+
+#include "targetver.h"
+
+#include <iostream>
+#include <stdexcept>
+#include <iomanip>
+#include <complex>
+#include <valarray>
+#include <stdarg.h>
+#if defined( _WIN32 )
+	#define NOMINMAX
+	#define WIN32_LEAN_AND_MEAN			// Exclude rarely-used stuff from Windows headers
+
+	#include <tchar.h>
+	#include <windows.h>
+#endif
+
diff --git a/src/client/targetver.h b/src/client/targetver.h
new file mode 100644
index 0000000..b4b4cc8
--- /dev/null
+++ b/src/client/targetver.h
@@ -0,0 +1,27 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#pragma once
+
+// Including SDKDDKVer.h defines the highest available Windows platform.
+
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+
+#if defined( _WIN32 )
+	#include <SDKDDKVer.h>
+#endif
diff --git a/src/client/testPerfWrapper.cpp b/src/client/testPerfWrapper.cpp
new file mode 100644
index 0000000..3df0af3
--- /dev/null
+++ b/src/client/testPerfWrapper.cpp
@@ -0,0 +1,206 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <iostream>
+#include <stdio.h>
+#include <clBLAS.h>
+#include <boost/program_options.hpp>
+
+
+#if defined( _WIN32 )
+#define popen _popen
+#define pclose _pclose
+#pragma warning (disable:4996)
+#endif
+
+namespace po = boost::program_options;
+
+/*
+ * Wrapper around the 'test-performance' executable: translates client style
+ * command line options into a gtest filter, runs the tool through popen( ),
+ * scans its output for the "average performance = <value>" line and prints
+ * that value.
+ *
+ * Returns 0 on success (and after printing help), -1 when the command line is
+ * invalid or the tool cannot be started, and 1 when no performance figure
+ * could be read from the tool's output.
+ */
+int
+main(int argc, char *argv[])
+{
+    size_t M;
+    size_t N;
+    size_t K;
+    cl_double alpha;
+    cl_double beta;
+    cl_uint profileCount;
+    int order_option;
+    int transA_option;
+    int transB_option;
+    int uplo_option;
+    int side_option;
+    int diag_option;
+    size_t lda;
+    size_t ldb;
+    size_t ldc;
+    size_t offA;
+    size_t offBX;
+    size_t offCY;
+    std::string function;
+    std::string perf_options;
+    std::string precision;
+    std::string command_line;
+    FILE *perf_pipe;
+    float perfGFL;
+    int test_case;
+
+    perf_options = "";
+    po::options_description desc( "clBLAS client command line options" );
+    desc.add_options()
+        ( "help,h", "produces this help message" )
+        ( "gpu,g", "Force instantiation of an OpenCL GPU device" )
+        ( "cpu,c", "Force instantiation of an OpenCL CPU device" )
+        ( "all,a", "Force instantiation of all OpenCL devices" )
+        ( "useimages", "Use an image-based kernel" )
+        ( "sizem,m", po::value<size_t>( &M )->default_value(128), "number of rows in A and C" )
+        ( "sizen,n", po::value<size_t>( &N )->default_value(128), "number of columns in B and C" )
+        ( "sizek,k", po::value<size_t>( &K )->default_value(128), "number of columns in A and rows in B" )
+        ( "lda", po::value<size_t>( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" )
+        ( "ldb", po::value<size_t>( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" )
+        ( "ldc", po::value<size_t>( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" )
+        ( "offA", po::value<size_t>( &offA )->default_value(0), "offset of the matrix A in memory object (ignored, just for compatibility with the python script)" )
+        ( "offBX", po::value<size_t>( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object (ignored, just for compatibility with the python script)" )
+        ( "offCY", po::value<size_t>( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object (ignored, just for compatibility with the python script)" )
+        ( "alpha", po::value<cl_double>( &alpha )->default_value(1.0f), "specifies the scalar alpha" )
+        ( "beta", po::value<cl_double>( &beta )->default_value(1.0f), "specifies the scalar beta" )
+        ( "order,o", po::value<int>( &order_option )->default_value(0), "0 = row major, 1 = column major" )
+        ( "transposeA", po::value<int>( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
+        ( "transposeB", po::value<int>( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" )
+        ( "function,f", po::value<std::string>( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" )
+        ( "precision,r", po::value<std::string>( &precision )->default_value("s"), "Options: s,d,c,z" )
+        ( "side", po::value<int>( &side_option )->default_value(0), "0 = left, 1 = right. only used with trmm, trsm" )
+        ( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with trmm, trsm, syrk, syr2k, symv" )
+        ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with trmm, trsm" )
+        ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(1), "Time and report the kernel speed (default: profiling off)" )
+        ;
+
+    // Parsing throws on unknown options or malformed values; report the
+    // problem instead of letting the exception terminate the process.
+    po::variables_map vm;
+    try
+    {
+        po::store( po::parse_command_line( argc, argv, desc ), vm );
+        po::notify( vm );
+    }
+    catch( const std::exception& e )
+    {
+        std::cerr << "Invalid command line: " << e.what( ) << std::endl;
+        std::cerr << desc << std::endl;
+        return -1;
+    }
+
+    if( vm.count( "help" ) )
+    {
+        std::cout << desc << std::endl;
+        return 0;
+    }
+
+    if( vm.count( "cpu" ) )
+    {
+        perf_options += " --device cpu";
+    }
+    else
+    {
+        perf_options += " --device gpu";
+    }
+
+    // NOTE(review): this assignment discards the " --device ..." text built
+    // just above, so the device selection never reaches the command line.
+    // Kept as is because it is not visible here whether test-performance
+    // accepts that option in this form - confirm before changing to '+='.
+    perf_options = " --gtest_filter=Custom/";
+
+    // test_case is appended after '/' in the gtest filter; it is a mixed
+    // radix encoding of the option values (presumably the index of the
+    // matching parameterised test instance - TODO confirm).
+    test_case = 0;
+    if( function == "gemm" )
+    {
+        perf_options += "GEMM.";
+        test_case += transB_option;
+        test_case += 3 * transA_option;
+        test_case += 9 * (1 - order_option);
+    }
+    else if( function == "trmm" )
+    {
+        perf_options += "TRMM.";
+        test_case += diag_option;
+        test_case += 2 * transA_option;
+        test_case += 6 * uplo_option;
+        test_case += 12 * side_option;
+        test_case += 24 * (1 - order_option);
+    }
+    else if( function == "trsm" )
+    {
+        perf_options += "TRSM.";
+        test_case += diag_option;
+        test_case += 2 * transA_option;
+        test_case += 6 * uplo_option;
+        test_case += 12 * side_option;
+        test_case += 24 * (1 - order_option);
+    }
+    else if( function == "syrk" )
+    {
+        perf_options += "SYRK.";
+        test_case += transA_option;
+        test_case += 3 * uplo_option;
+        test_case += 6 * (1 - order_option);
+    }
+    else if( function == "syr2k" )
+    {
+        perf_options += "SYR2K.";
+        test_case += transA_option;
+        test_case += 3 * uplo_option;
+        test_case += 6 * (1 - order_option);
+    }
+    else if( function == "gemv" )
+    {
+        perf_options += "GEMV.";
+        test_case += transA_option;
+        test_case += 3 * (1 - order_option);
+    }
+    else if( function == "symv" )
+    {
+        perf_options += "SYMV.";
+        test_case += uplo_option;
+        test_case += 2 * (1 - order_option);
+    }
+    else {
+        std::cerr << "Invalid value for --function" << std::endl;
+        return -1;
+    }
+    perf_options += precision + function;
+
+    std::stringstream sizes_str;
+    sizes_str << "/" <<  test_case << " " << M << " " << N << " " << K;
+    perf_options += sizes_str.str();
+
+    command_line = "test-performance" + perf_options;
+
+    std::cerr << "Command line: " << command_line << std::endl;
+
+    perfGFL = 0;
+    perf_pipe = popen( command_line.c_str(), "r" );
+    if (perf_pipe == NULL) {
+        perror(command_line.c_str());
+        std::cerr << "Could not run " << command_line << std::endl;
+        return -1;
+    }
+
+    // Read the tool's output line by line until the performance line shows
+    // up. Driving the loop with fgets( ) rather than feof( ) makes the
+    // "no result" outcome independent of whether the last line of output is
+    // terminated by a newline.
+    char strbuf[512];
+    bool found = false;
+    while (fgets(strbuf, sizeof(strbuf), perf_pipe) != NULL) {
+        if (sscanf(strbuf, "average performance = %f", &perfGFL) == 1) {
+            found = true;
+            break;
+        }
+    }
+    pclose(perf_pipe);
+
+    if (!found) {
+        std::cout << "[ERROR]: Read from the pipe has failed!" <<
+                     std::endl;
+        return 1;
+    }
+
+    std::cout << "BLAS kernel execution Gflops < >: " << perfGFL << std::endl;
+    return 0;
+}
+
diff --git a/src/client/timer.cpp b/src/client/timer.cpp
new file mode 100644
index 0000000..73ef2cc
--- /dev/null
+++ b/src/client/timer.cpp
@@ -0,0 +1,103 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include "ctimer.h"
+#include "timer.hpp"
+
+// Construct a running timer: nothing accumulated yet, current interval starts now.
+timer::timer()
+  : init_time_(get_walltime()),
+    elapsed_time_(0.0)
+{
+}
+
+// Nothing to release.
+timer::~timer()
+{
+}
+
+// Accumulated time plus the time elapsed since the current interval started.
+double
+timer::get()
+{
+  const double now = get_walltime();
+
+  return elapsed_time_ + now - init_time_;
+}
+
+// Fold the current interval into the accumulated time. The interval start is
+// left untouched here; restart() moves it.
+void
+timer::pause()
+{
+  const double total = get();
+
+  elapsed_time_ = total;
+}
+
+// Begin a new interval at the current time, keeping the accumulated time.
+void
+timer::restart()
+{
+  init_time_ = get_walltime();
+}
+
+// Drop the accumulated time and begin a new interval at the current time.
+void
+timer::reset()
+{
+  init_time_ = get_walltime();
+  elapsed_time_ = 0.0;
+}
+
+// Same as reset(), but the accumulated time starts at 'delay_time' instead of 0.
+void
+timer::reset_delay(double delay_time)
+{
+  init_time_ = get_walltime();
+  elapsed_time_ = delay_time;
+}
+
+// Allocate a new timer object; release it with DeleteTimer().
+Timer
+CreateTimer()
+{
+  return new timer();
+}
+
+// Destroy a timer obtained from CreateTimer().
+void
+DeleteTimer(Timer timer)
+{
+  delete timer;
+}
+
+// Forward to timer::get().
+double
+GetTime(Timer timer)
+{
+  const double seconds = timer->get();
+
+  return seconds;
+}
+
+// Forward to timer::reset().
+void
+ResetTimer(Timer timer)
+{
+  timer->reset();
+}
+
+// Forward to timer::restart().
+void
+RestartTimer(Timer timer)
+{
+  timer->restart();
+}
+
+// Forward to timer::pause().
+void
+PauseTimer(Timer timer)
+{
+  timer->pause();
+}
+
+// Forward to timer::reset_delay().
+void
+ResetDelayTimer(Timer timer, double delay_time)
+{
+  timer->reset_delay(delay_time);
+}
+
diff --git a/src/client/timer.hpp b/src/client/timer.hpp
new file mode 100644
index 0000000..b167397
--- /dev/null
+++ b/src/client/timer.hpp
@@ -0,0 +1,50 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef TIMER_HXX__
+#define TIMER_HXX__
+
+#include <time.h>
+
+class timer
+{
+public:
+  double get();
+  void pause();
+  void restart();
+  void reset();
+  void reset_delay(double delay_time);
+
+private:
+  inline double get_walltime()
+  {
+    struct timespec ts;
+
+    clock_gettime(CLOCK_REALTIME, &ts);
+    return static_cast<double>(ts.tv_sec) +
+      static_cast<double>(ts.tv_nsec) * 1.0e-9;
+  }
+
+private:
+  double init_time_;
+  double elapsed_time_;
+
+public:
+    timer();
+  ~timer();
+}; // class timer
+
+#endif // ifndef TIMER_HXX__
diff --git a/src/include/clblas_stddef.h b/src/include/clblas_stddef.h
new file mode 100644
index 0000000..821ef3c
--- /dev/null
+++ b/src/include/clblas_stddef.h
@@ -0,0 +1,131 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef CLBLAS_STDDEF_H_
+#define CLBLAS_STDDEF_H_
+
+/* Return the smaller of two size_t values. */
+static __inline size_t
+szmin(size_t a, size_t b)
+{
+    if (b < a) {
+        return b;
+    }
+
+    return a;
+}
+
+/* Return the larger of two size_t values. */
+static __inline size_t
+szmax(size_t a, size_t b)
+{
+    if (b > a) {
+        return b;
+    }
+
+    return a;
+}
+
+/* Return the smaller of two unsigned int values. */
+static __inline unsigned int
+umin(unsigned int a, unsigned int b)
+{
+    if (b < a) {
+        return b;
+    }
+
+    return a;
+}
+
+/* Return the larger of two unsigned int values. */
+static __inline unsigned int
+umax(unsigned int a, unsigned int b)
+{
+    if (b > a) {
+        return b;
+    }
+
+    return a;
+}
+
+/* Exchange the two unsigned int values pointed to by 'a' and 'b'. */
+static __inline void
+uswap(unsigned int *a, unsigned int *b)
+{
+    const unsigned int first = *a;
+
+    *a = *b;
+    *b = first;
+}
+
+/* Round 'a' down to the nearest multiple of 'b'; 'b' must not be zero. */
+static __inline size_t
+roundDown(size_t a, size_t b)
+{
+    const size_t remainder = a % b;
+
+    return a - remainder;
+}
+
+/*
+ * Round 'a' up to the nearest multiple of 'b'; 'b' must not be zero.
+ *
+ * The padding is derived from the remainder rather than from 'a + b - 1',
+ * so the intermediate value cannot wrap around when the rounded result
+ * itself is representable (e.g. when 'a' is already a multiple of 'b'
+ * close to the maximum of size_t).
+ */
+static __inline size_t
+roundUp(size_t a, size_t b)
+{
+    const size_t remainder = a % b;
+
+    if (remainder == 0) {
+        return a;
+    }
+
+    return a + (b - remainder);
+}
+
+/* Divide 'a' by 'b', rounding the quotient up; 'b' must not be zero. */
+static __inline size_t
+divRoundUp(size_t a, size_t b)
+{
+    size_t quotient = a / b;
+
+    if (a % b != 0) {
+        quotient++;
+    }
+
+    return quotient;
+}
+
+/*
+ * Return 1 if 'a' has at most one bit set, 0 otherwise.
+ * Note that 0 is reported as a power of two as well.
+ */
+static __inline int
+isRoundedPow2(size_t a)
+{
+    const size_t withoutLowestBit = a & (a - 1);
+
+    return !withoutLowestBit;
+}
+
+/*
+ * Return the zero based index of the highest set bit of the
+ * number. If the number is 0, then the function returns -1.
+ */
+static __inline int
+findHighestSetBit(size_t a)
+{
+    int index = -1;
+
+    /* every shift that still leaves something behind moves the index up */
+    while (a != 0) {
+        a >>= 1;
+        index++;
+    }
+
+    return index;
+}
+
+/*
+ * Round 'a' down to the nearest power of two, i.e. keep only its highest
+ * set bit. Values with at most one bit set (including 0) are returned as is.
+ */
+static __inline size_t
+roundDownPow2(size_t a)
+{
+    // drop the lowest set bit until a single bit (or nothing) is left
+    while ((a & (a - 1)) != 0) {
+        a &= a - 1;
+    }
+
+    return a;
+}
+
+/*
+ * Round 'a' up to the nearest power of two. Values with at most one bit set
+ * are returned unchanged.
+ *
+ * With BLAS we never deal with numbers large enough to overflow here,
+ * so the final shift is safe.
+ */
+static __inline size_t
+roundUpPow2(size_t a)
+{
+    if (!isRoundedPow2(a)) {
+        // the next power of two is twice the highest set bit
+        return (roundDownPow2(a) << 1);
+    }
+
+    return a;
+}
+
+#endif /* CLBLAS_STDDEF_H_ */
diff --git a/src/include/clkern.h b/src/include/clkern.h
new file mode 100644
index 0000000..0570adc
--- /dev/null
+++ b/src/include/clkern.h
@@ -0,0 +1,199 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef CLARGS_H_
+#define CLARGS_H_
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <string.h>
+#include <dis_warning.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Store a value into a kernel argument descriptor: copies sizeof(val) bytes
+ * of 'val' into (karg)->arg.data and records that size in (karg)->typeSize.
+ *
+ * 'val' must be an lvalue because its address is taken, and 'karg' is
+ * expanded twice. The macro does not check that sizeof(val) fits into the
+ * destination buffer.
+ */
+#define INIT_KARG(karg, val)                            \
+do {                                                    \
+    memcpy((karg)->arg.data, &(val), sizeof(val));      \
+    (karg)->typeSize = sizeof(val);                     \
+} while (0)
+
+/*
+ * Fixed limits used by the structures below:
+ * MAX_KERNEL_ARGS sizes the argument array of a kernel descriptor,
+ * MAX_ARG_SIZE sizes the raw byte storage of a single argument value,
+ * MAX_WORK_DIM sizes the global/local thread count arrays.
+ */
+enum {
+    MAX_KERNEL_ARGS = 32,
+    MAX_ARG_SIZE = sizeof(cl_double2),
+    MAX_WORK_DIM = 3
+};
+
+// memory object data transfer direction
+// NOTE(review): the values are distinct bits, so they can presumably be
+// OR-ed together for objects that are both read and written - confirm
+typedef enum MemobjDir {
+    MEMOBJ_READ = 0x1,
+    MEMOBJ_WRITE = 0x2
+} MemobjDir;
+
+// Phase of a kernel launch; reported through KernelErrorInfo.phase
+// (presumably the phase in which an error occurred - TODO confirm)
+typedef enum KernelLaunchPhase {
+    PHASE_SET_ARGS,
+    PHASE_ENQUEUE_WRITE,
+    PHASE_ENQUEUE_KERNEL,
+    PHASE_PROFILING,
+    PHASE_ENQUEUE_READ
+} KernelLaunchPhase;
+
+// Storage for the value of a single kernel argument: a memory object handle,
+// an int, or up to MAX_ARG_SIZE raw bytes sharing the same storage
+typedef union KernelArgValue {
+    cl_mem mem;
+    int ival;
+    unsigned char data[MAX_ARG_SIZE];
+} KernelArgValue;
+
+/*
+ * Structure describing an argument to be passed to a kernel
+ *
+ * @arg:        argument value (memory object handle, int or raw bytes)
+ * @typeSize:   size of the argument type
+ * @hostBuf:    host buffer used together with an OpenCL memory object
+ * @hostBufLen: length of 'hostBuf' (presumably in bytes - TODO confirm)
+ * @dir:        memory object data transfer direction
+ */
+typedef struct KernelArg {
+    KernelArgValue arg;
+    unsigned int typeSize;  // argument type size, ignored for mem objects
+    void *hostBuf;          // host buffer for using with OpenCL memory objects
+    size_t hostBufLen;
+    MemobjDir dir;
+} KernelArg;
+
+/*
+ * Descriptor of a kernel to be launched; this is what launchClKernel()
+ * takes as its 'kernDesc' argument.
+ *
+ * The thread count arrays hold up to MAX_WORK_DIM entries and 'args' holds
+ * up to MAX_KERNEL_ARGS argument descriptors.
+ * NOTE(review): the exact meaning of 'nowait' and 'needExecTime' and the
+ * unit of 'execTime' are not visible in this header - confirm against the
+ * implementation of launchClKernel().
+ */
+typedef struct KernelDesc {
+    cl_kernel kernel;
+    size_t globalThreads[MAX_WORK_DIM];
+    size_t localThreads[MAX_WORK_DIM];
+    size_t workDim;
+    const cl_event *eventWaitList;
+    size_t waitListSize;
+    cl_event *event;
+    int nowait;
+    int needExecTime;
+    KernelArg args[MAX_KERNEL_ARGS];
+    unsigned long execTime;
+} KernelDesc;
+
+// Error details filled in by launchClKernel() through its 'errInfo' argument
+// NOTE(review): 'wrongArg' is presumably the index of the offending kernel
+// argument - confirm against the implementation
+typedef struct KernelErrorInfo {
+    unsigned int wrongArg;
+    KernelLaunchPhase phase;
+} KernelErrorInfo;
+
+/*
+ * Store the kernel arguments, launch the kernel and read its results
+ *
+ * @kernDesc:    descriptor of the kernel to be launched
+ * @queue:       command queue associated with the device
+ * @errInfo:     location to store info about an error that occurred,
+ *               ignored if NULL
+ *
+ * The function itself obtains the number of kernel arguments
+ * using the OpenCL API
+ */
+cl_int launchClKernel(
+    KernelDesc *kernDesc,
+    cl_command_queue queue,
+    KernelErrorInfo *errInfo);
+
+/*
+ * build a program from source
+ *
+ * @source:     program source
+ * @buildOpts:  options to the opencl program builder
+ * @devID:      ID of device to create program for
+ * @logBuf:     buffer to store build log at error
+ * @status:     location to store OpenCL status at error
+ *
+ * On success returns a built program object.
+ * On error returns <NULL>, and stores the opencl status to the 'status'
+ * location; if <NULL> is returned, but 'status' contains 'CL_SUCCESS',
+ * it means a file I/O or memory allocation failure has occurred.
+ * If 'status' is set to NULL, it is ignored
+ */
+cl_program
+buildClProgram(
+    const char *source,
+    const char *buildOpts,
+    cl_context ctx,
+    cl_device_id devID,
+    char *logBuf,
+    size_t logBufSize,
+    cl_int *status);
+
+/*
+ * TODO: Doxygen-style comments
+ */
+cl_program
+createClProgramWithBinary(
+    cl_context ctx,
+    cl_device_id devID,
+    unsigned char *binary,
+    size_t binSize,
+    cl_int *status);
+
+/*
+ * TODO: Doxygen-style comments
+ */
+size_t
+getProgramBinarySize(cl_program program);
+
+/*
+ * TODO: Doxygen-style comments
+ */
+unsigned char
+*getProgramBinary(cl_program program);
+
+/*
+ * Set a kernel argument from a value of the size_t type.
+ *
+ * The argument is stored as a cl_uint. The value is narrowed explicitly
+ * before being copied, so the stored bytes are those of the numeric value
+ * on any host; copying the first sizeof(cl_uint) bytes of the size_t object
+ * directly would pick up its high-order bytes on a big-endian host.
+ * Values that do not fit into a cl_uint are truncated.
+ */
+static __inline void
+initSizeKarg(KernelArg *arg, size_t value)
+{
+    const cl_uint narrowed = (cl_uint)value;
+
+    memcpy(arg->arg.data, &narrowed, sizeof(narrowed));
+    arg->typeSize = sizeof(narrowed);
+}
+
+/*
+ * Fill a kernel argument descriptor for a memory object
+ *
+ * @karg:       descriptor to fill
+ * @memobj:     memory object stored as the argument value
+ * @hostBuf:    host buffer recorded in the descriptor
+ * @hostBufLen: length of the host buffer
+ * @dir:        memory object data transfer direction
+ */
+static __inline void
+initMemobjKarg(
+    KernelArg *karg,
+    cl_mem memobj,
+    void *hostBuf,
+    size_t hostBufLen,
+    MemobjDir dir)
+{
+    /* host side bookkeeping */
+    karg->hostBuf = hostBuf;
+    karg->hostBufLen = hostBufLen;
+    karg->dir = dir;
+
+    /* the argument value itself */
+    karg->typeSize = sizeof(cl_mem);
+    karg->arg.mem = memobj;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CLARGS_H_ */
diff --git a/src/include/cltypes.h b/src/include/cltypes.h
new file mode 100644
index 0000000..2c603af
--- /dev/null
+++ b/src/include/cltypes.h
@@ -0,0 +1,79 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef CLTYPES_H_
+#define CLTYPES_H_
+
+#include <defbool.h>
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+/**
+ * @internal
+ * @defgroup DTYPES Data types
+ */
+/*@{*/
+
+/**
+ * @brief Identifiers of the data types handled by the library
+ */
+typedef enum DataType {
+    TYPE_FLOAT,             /**< single precision floating point type */
+    TYPE_DOUBLE,            /**< double precision floating point type */
+    TYPE_COMPLEX_FLOAT,     /**< single precision complex type */
+    TYPE_COMPLEX_DOUBLE,    /**< double precision complex type */
+    TYPE_UNSIGNED_INT       /**< Unsigned int, for output buffer for iAMAX routine */
+} DataType;
+
+/*@}*/
+
+/* Number of cl_float elements in one cl_float4 vector */
+enum {
+    FLOAT4_VECLEN = sizeof(cl_float4) / sizeof(cl_float)
+};
+
+/*
+ * return size of a BLAS related data type
+ */
+#ifdef __cplusplus
+extern "C"
+#endif
+unsigned int
+dtypeSize(DataType type);
+
+/*
+ * width of the matrix (block) in float4 words
+ */
+size_t
+fl4RowWidth(size_t width, size_t typeSize);
+
+/*
+ * Tell whether a data type is built on double precision, i.e. is either
+ * the real or the complex double type
+ */
+static __inline bool
+isDoubleBasedType(DataType dtype)
+{
+    const int realDouble = (dtype == TYPE_DOUBLE);
+    const int complexDouble = (dtype == TYPE_COMPLEX_DOUBLE);
+
+    return (realDouble || complexDouble);
+}
+
+/*
+ * Tell whether a data type is one of the complex types, single or double
+ * precision
+ */
+static __inline bool
+isComplexType(DataType dtype)
+{
+    const int complexFloat = (dtype == TYPE_COMPLEX_FLOAT);
+    const int complexDouble = (dtype == TYPE_COMPLEX_DOUBLE);
+
+    return (complexFloat || complexDouble);
+}
+
+#endif /* CLTYPES_H_ */
diff --git a/src/include/dblock_kgen.h b/src/include/dblock_kgen.h
new file mode 100644
index 0000000..88ddd5e
--- /dev/null
+++ b/src/include/dblock_kgen.h
@@ -0,0 +1,220 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Common generators for functions manipulating
+ * with data blocks placed in the global, local,
+ * or private memory.
+ */
+
+/*
+ * TODO: add the unroll option to 'rwMatrBlockGen'
+ *       and 'smulMatrBlockGen'
+ */
+
+#ifndef DBLOCK_KGEN_H_
+#define DBLOCK_KGEN_H_
+
+#include <cltypes.h>
+#include <kerngen.h>
+
+/**
+ * @internal
+ * @defgroup MAJOR_GENS Major common used generators
+ */
+/*@{*/
+
+/**
+ * @internal
+ * @brief Data block copying directions
+ */
+typedef enum DBlockCopyDirection {
+    /** Copy from the global to the local memory */
+    DBLOCK_GLOBAL_TO_LOCAL,
+    /** Copy from the local to the global memory */
+    DBLOCK_LOCAL_TO_GLOBAL,
+    /** Copy from the global memory to an image */
+    DBLOCK_GLOBAL_TO_IMAGE,
+    /** Copy from the local memory to an image */
+    DBLOCK_LOCAL_TO_IMAGE
+} DBlockCopyDirection;
+
+/**
+ * @internal
+ * @brief Data block copying flags
+ */
+typedef enum DBlockCopyFlags {
+    DBLOCK_COPY_TRANSPOSE = 0x1,        /**< Transpose 2D block */
+    /** pack several rows into single image row */
+    DBLOCK_COPY_PACKED_IMAGE = 0x2,
+    DBLOCK_COPY_CONJUGATE = 0x4,        /**< Conjugate complex elements */
+    DBLOCK_COPY_NOT_VECTORIZE = 0x8     /**< Disable vectorized copying */
+} DBlockCopyFlags;
+
+/**
+ * @internal
+ * @brief Generator to copy data blocks between different kinds
+ *        of memory
+ *
+ * @param[out] ctx              Generator context
+ * @param[in] dim               Subproblem dimension to generate a function for
+ * @param[in] pgran             Data parallelism granularity
+ * @param[in] dtype             Data type
+ * @param[in] dir               Copying direction
+ * @param[in] flags             Copying flags; when an image is used as destination
+ *                              block transposing is prohibited
+ *
+ * If 'dim' is set to NULL, a generic version working with a subproblem
+ * of any dimension is generated. In that case specific work group
+ * sizes are ignored; only the work group dimension is used.
+ *
+ * 'x' field of the passed SubproblemDim structure should contain
+ *     the block width
+ * 'y' should contain the block height
+ *
+ * Copied blocks can be as well one as two dimensional. For any one
+ * dimensional block 'y' field of the dimension structure should be
+ * set to 1. If a block is two dimensional, and the local memory is \n
+ * the source or destination memory, the block's rows must be aligned
+ * to float4 boundary.
+ *
+ * Rows of the matrix block must be aligned to float4 boundary. \n
+ *
+ * Generated functions have the following definitions: \n
+ *\n
+ * Buffer-buffer copying function for optimal blocks: \n
+ * @code
+ * void
+ * funcName(
+ *     <Unified pointer type> dst,
+ *     <Unified pointer type> src,
+ *     size_t startRow,
+ *     size_t startCol,
+ *     size_t ld)
+ * @endcode
+ *
+ * The unified pointer type is GPtr if the global memory is used, or LPtr
+ * if the local memory is used
+ * (see the "Data types in kernels" section). The function naming rule is as follows: \n
+ * (type prefix)copyDBlock['Transp']['Conj']['Nvec'][src mem][dst mem]
+ * [block height][block width] \n
+ * The 'Nvec' suffix is added if vectorized copying is prohibited.\n
+ *\n
+ * Buffer-buffer copying function, generic version: \n
+ * @code
+ * void
+ * funcName(
+ *     <Unified pointer type> dst,
+ *     <Unified pointer type> src,
+ *     size_t startRow,
+ *     size_t startCol,
+ *     size_t nrRows,
+ *     size_t nrCols,
+ *     size_t dstLD,
+ *     size_t srcLD)
+ * @endcode
+ *
+ * Here "dstLD" is destination leading dimension, "srcLD" - source leading
+ * dimension. \n
+ * Naming rule is the same as for the function above except block sizes. \n
+ *\n
+ * Function copying optimal blocks from the global memory to an image: \n
+ * @code
+ * void
+ * funcName(
+ *     __write_only image2d_t dst,
+ *     size_t startX,
+ *     size_t startY,
+ *     GPtr src,
+ *     size_t startRow,
+ *     size_t startCol,
+ *     size_t ld)
+ * @endcode
+ * The 'startX' and 'startY' arguments are the X and Y coordinates in the image to
+ * start writing from. The generic version has an analogous definition, and takes two
+ * additional arguments 'nrRows' and 'nrCols' of the size_t type following just
+ * after the 'startCol' argument. \n
+ *\n
+ * Function copying optimal blocks from the local memory to an image: \n
+ * @code
+ * void
+ * funcName(
+ *     __write_only image2d_t dst,
+ *     size_t startX,
+ *     size_t startY,
+ *     LPtr src)
+ * @endcode
+ * The generic version takes two additional arguments 'nrRows' and 'nrCols' of the
+ * size_t type following just after the 'src' argument.
+ *
+ * @return 0 on success; on error returns negated error code:
+ *
+ *      - -EINVAL: unsupported data type is passed, or
+ *               'DBLOCK_COPY_TRANSPOSE' is set when
+ *               an image is used as destination
+ *      - -ENOTSUP: unsupported copying direction is passed
+ *      - -EOVERFLOW: code buffer overflowed
+ */
+int
+copyDataBlockGen(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    const PGranularity *pgran,
+    DataType dtype,
+    DBlockCopyDirection dir,
+    DBlockCopyFlags flags);
+
+/*@}*/
+
+/*
+ * Zero data block in the local or global memory
+ *
+ * @ctx: generator context
+ * @dim: Subproblem dimension to generate the function for
+ * @pgran: data parallelism granularity
+ * @memPrefix: type of memory to generate the function for
+ *
+ * 'memPrefix' should name the type of memory the buffer is stored in
+ * (it is a plain argument here, not a field of a BlasKernExtra structure).
+ * It can take either the "__local" or the "__global" value.
+ *
+ * 'x' field of the passed SubproblemDim structure should contain
+ * the block width in float4 words. In that case the function takes only
+ * a buffer pointer. If the field is set to 'SUBDIM_UNUSED'
+ * the function is generated without any loop unrolling. In that case
+ * the function takes the buffer length as the second argument.
+ *
+ * If 'unroll' is set, the 'bwidth' field of the structure should
+ * contain the maximum width of a block zeroed with loop unrolling.
+ * If 'unroll' is set but 'bwidth' is 'SUBDIM_UNUSED', the generator
+ * doesn't restrict loop unrolling. The parameter is ignored if the 'x'
+ * field of 'dim' is 'SUBDIM_UNUSED'. NOTE(review): this prototype has
+ * no 'unroll' parameter - the paragraph looks stale; confirm.
+ *
+ * On success returns 0, on error returns negated error code:
+ *
+ *      -EINVAL: wrong memory prefix is passed
+ *      -EOVERFLOW: code buffer overflowed
+ */
+int
+f4zeroBlockGen(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    const PGranularity *pgran,
+    const char *memPrefix);
+
+#endif /* DBLOCK_KGEN_H_ */
diff --git a/src/include/defbool.h b/src/include/defbool.h
new file mode 100644
index 0000000..e90736d
--- /dev/null
+++ b/src/include/defbool.h
@@ -0,0 +1,57 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef DEFBOOL_H_
+#define DEFBOOL_H_
+
+#if defined(_MSC_VER) && _MSC_VER <= 1600
+
+/*
+FIX for windows compilation
+#if !defined(__cplusplus)
+
+typedef int _Bool;
+#define bool _Bool
+enum {
+    false,
+    true
+};
+#endif
+*/
+
+#define  __bool_true_false_are_defined  1
+
+#ifndef __cplusplus
+
+#define  bool  _Bool
+#if __STDC_VERSION__ < 199901L && __GNUC__ < 3
+#define  false  0
+#define  true  1
+
+typedef  int  _Bool;
+#endif
+
+#endif /* !__cplusplus */
+
+
+#else /* defined(_MSC_VER) && _MSC_VER <= 1600 */
+
+#include <stdbool.h>
+
+#endif /* defined(_MSC_VER) && _MSC_VER <= 1600 */
+
+#endif /* DEFBOOL_H_ */
diff --git a/src/include/devinfo.h b/src/include/devinfo.h
new file mode 100644
index 0000000..ef179e3
--- /dev/null
+++ b/src/include/devinfo.h
@@ -0,0 +1,99 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef DEVINFO_H_
+#define DEVINFO_H_
+
+#include <defbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * TODO: Expand these enumerations in respect with known
+ *       vendors and devices
+ */
+
+typedef enum DeviceVendor {
+    VENDOR_UNKNOWN,
+    VENDOR_AMD,
+    VENDOR_NVIDIA
+} DeviceVendor;
+
+typedef enum DeviceFamily {
+    DEVICE_FAMILY_UNKNOWN,
+    GPU_FAMILY_EVERGREEN,
+    GPU_FAMILY_FERMI
+} DeviceFamily;
+
+typedef enum DeviceChip {
+    CHIP_UNKNOWN,
+    REDWOOD,
+    JUNIPER,
+    CYPRESS,
+    HEMLOCK,
+    CAYMAN,
+    TAHITI,
+    GEFORCE_GTX_480,
+    GEFORCE_GTX_580,
+    NUM_DEVICE_CHIPS
+} DeviceChip;
+
+typedef struct DeviceIdent {
+    DeviceVendor vendor;
+    DeviceFamily family;
+    DeviceChip chip;
+} DeviceIdent;
+
+typedef struct DeviceHwInfo {
+    unsigned int wavefront;
+    unsigned int channelSize;
+    unsigned int bankSize;
+    unsigned int l1CacheAssoc;
+} DeviceHwInfo;
+
+typedef struct TargetDevice {
+    cl_device_id id;        /* NOTE(review): OpenCL type; this header includes only defbool.h */
+    DeviceIdent ident;
+    bool hwInfoValid;       /* presumably set when hwInfo holds valid data - confirm */
+    DeviceHwInfo hwInfo;
+} TargetDevice;
+
+cl_int
+identifyDevice(TargetDevice *target);
+
+cl_uint  deviceComputeUnits    (cl_device_id device, cl_int *error);
+cl_ulong deviceLDSSize         (cl_device_id device, cl_int *error);
+cl_uint  deviceWavefront       (cl_device_id device, cl_int *error);
+cl_uint  deviceDataAlignment   (cl_device_id device, cl_int *error);
+cl_uint  deviceAddressBits     (cl_device_id device, cl_int *error);
+bool     deviceHasNativeDouble (cl_device_id device, cl_int *error);
+bool     deviceHasNativeComplex(cl_device_id device, cl_int *error);
+
+cl_ulong deviceL2CacheSize     (cl_device_id device, cl_int *error);
+cl_ulong deviceL1CacheSize     (cl_device_id device, cl_ulong l2CacheSize,
+                                cl_int *error);
+cl_uint  deviceL1CacheAssoc    (cl_device_id device, cl_ulong l1CacheSize,
+                                cl_int *error);
+size_t  deviceMaxWorkgroupSize (cl_device_id device, cl_int *error);
+
+#ifdef __cplusplus
+}       /* extern "C" { */
+#endif
+
+#endif  /* DEVINFO_H_ */
diff --git a/src/include/dis_warning.h b/src/include/dis_warning.h
new file mode 100644
index 0000000..3b9536c
--- /dev/null
+++ b/src/include/dis_warning.h
@@ -0,0 +1,65 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef DIS_WARNING_H_
+#define DIS_WARNING_H_
+
+#if _MSC_VER
+
+#pragma warning (disable:4204)
+#pragma warning (disable:4127)
+#define MAY_ALIAS
+
+#else                       /* _MSC_VER */
+
+#define MAY_ALIAS __attribute__((__may_alias__))
+
+#endif
+
+
+/*
+ * Macros to silence compiler warnings about intentionally unused
+ * function arguments; every argument is parenthesized before the void cast
+ */
+
+#define DUMMY_ARG_USAGE(arg)                            \
+do {                                                    \
+    (void)(arg);                                        \
+} while (0)
+
+#define DUMMY_ARGS_USAGE_2(arg1, arg2)                  \
+do {                                                    \
+    (void)(arg1);                                       \
+    (void)(arg2);                                       \
+} while (0)
+
+#define DUMMY_ARGS_USAGE_3(arg1, arg2, arg3)            \
+do {                                                    \
+    (void)(arg1);                                       \
+    (void)(arg2);                                       \
+    (void)(arg3);                                       \
+} while (0)
+
+#define DUMMY_ARGS_USAGE_4(arg1, arg2, arg3, arg4)      \
+do {                                                    \
+    (void)(arg1);                                       \
+    (void)(arg2);                                       \
+    (void)(arg3);                                       \
+    (void)(arg4);                                       \
+} while (0)
+
+#endif /* DIS_WARNING_H_ */
diff --git a/src/include/granulation.h b/src/include/granulation.h
new file mode 100644
index 0000000..215ec90
--- /dev/null
+++ b/src/include/granulation.h
@@ -0,0 +1,79 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Data and execution granulation
+ */
+
+#ifndef GRANULATION_H_
+#define GRANULATION_H_
+
+/**
+ * @internal
+ * @brief Decomposition axis
+ * @ingroup PROBLEM_DECOMPOSITION
+ */
+typedef enum DecompositionAxis {
+    DECOMP_AXIS_Y,
+    DECOMP_AXIS_X
+} DecompositionAxis;
+
+/**
+ * @internal
+ * @brief Data parallelism granularity
+ * @ingroup PROBLEM_DECOMPOSITION
+ */
+typedef struct PGranularity {
+    /** work group sizes */
+    unsigned int wgSize[2];
+    /** work group dimension */
+    unsigned int wgDim;
+    /** wavefront size */
+    unsigned int wfSize;
+    /** Record number of work-groups spawned */
+    unsigned int numWGSpawned[2];
+} PGranularity;
+
+/**
+ * @internal
+ * @brief Subproblem dimensions
+ *
+ * The structure represents how a problem is decomposed during
+ * the computation. The decomposition is made in terms of
+ * resulting data. It describes both what portion of work each
+ * computing item gets and what chunk it evaluates at a time.
+ * The chunk processed at a time is typically bounded by the amount
+ * of resources consumed at this level of decomposition, while
+ * the whole portion is bounded by the amount of higher-level resources
+ * available, and can also be used for the purpose of work
+ * balancing.
+ *
+ * @ingroup PROBLEM_DECOMPOSITION
+ */
+typedef struct SubproblemDim {
+    size_t x;       /**< Subproblem step size in X dimension */
+    size_t y;       /**< Subproblem step size in Y dimension */
+    /** Width of data blocks processed consecutively
+     * to evaluate a subproblem of 'x' by 'y' size */
+    size_t bwidth;
+    size_t itemX;   /**< Size of the whole subproblem in X dimension
+                        evaluated by a computing item */
+    size_t itemY;   /**< Size of the whole subproblem in Y dimension
+                        evaluated by a computing item */
+} SubproblemDim;
+
+#endif /* GRANULATION_H_ */
diff --git a/src/include/kern_cache.h b/src/include/kern_cache.h
new file mode 100644
index 0000000..b6749c5
--- /dev/null
+++ b/src/include/kern_cache.h
@@ -0,0 +1,187 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * OpenCL kernel cache
+ */
+
+#ifndef KERN_CACHE_H_
+#define KERN_CACHE_H_
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <defbool.h>
+#include <list.h>
+#include <kerngen.h>
+#include <solver.h>
+#include <mutex.h>
+#include <trace_malloc.h>
+
+struct KernelCache;
+
+/* Unique kernel characteristics */
+typedef struct KernelKey {
+    cl_device_id device;
+    cl_context context;
+    unsigned int nrDims;
+    SubproblemDim subdims[MAX_SUBDIMS];
+} KernelKey;
+
+/*
+ * structure describing an optimal CL kernel for some
+ * memory pattern and subproblem dimensions
+ */
+typedef struct Kernel {
+    cl_program program;     // program the kernel belongs to
+    /* extra information specific for the application field */
+    void *extra;
+    size_t extraSize;
+    void (*dtor)(struct Kernel *kern);
+} Kernel;
+
+typedef int
+(*KernelExtraCmpFn)(const void *extra, const void *extraKey);
+
+
+/*
+ * Create kernel cache
+ *
+ * @nrSolvers:  total number of solvers whose kernels are stored in the cache
+ * @sizeLimit:  limit of the cache in bytes;
+ *              if set to 0 the cache size is
+ *              unlimited
+ *
+ * On success returns a pointer to the kernel cache object.
+ * On error returns NULL if it failed to allocate the needed resources.
+ */
+struct KernelCache
+*createKernelCache(
+    unsigned int nrSolvers,
+    size_t sizeLimit);
+
+void
+destroyKernelCache(struct KernelCache *kcache);
+
+/*
+ * Allocate kernel
+ *
+ * After allocation fill the structure with zero bytes
+ * and set the kernel's reference counter to 1.
+ *
+ * return pointer to a just created kernel,
+ * return NULL if there is not enough memory
+ * to allocate a kernel
+ */
+Kernel
+*allocKernel(void);
+
+/*
+ * Get reference to kernel not yet added to a cache
+ */
+void
+getKernel(Kernel *kern);
+
+/*
+ * Decrement reference counter of this kernel
+ *
+ * @kcache: the cache the kernel was inserted into;
+ *          may be NULL if the kernel has not yet been
+ *          added to a cache, in which case it is ignored
+ *
+ * When there are no more references to the kernel, it is automatically
+ * destroyed
+ */
+void
+putKernel(struct KernelCache *kcache, Kernel *kern);
+
+/*
+ * Add new generated kernel to cache
+ *
+ * @kcache: cache to add the kernel to
+ * @sid: solver ID to add the kernel for
+ * @kern: kernel to add
+ * @key: kernel characteristics
+ * @extraCmp: function comparing kernel extra information with an extra key
+ *
+ * On success returns 0.
+ * On error returns -1, in one of the following cases:
+ *      kernel size is larger than the maximum cache size,
+ *      or there is not enough memory to allocate internal structures,
+ *      or the passed solver ID is wrong,
+ *      or 'nrDims' is wrong
+ */
+int
+addKernelToCache(
+    struct KernelCache *kcache,
+    solver_id_t sid,
+    Kernel *kern,
+    const KernelKey *key,
+    KernelExtraCmpFn extraCmp);
+
+/*
+ * Find the kernel for the given OpenCL solver and
+ * subproblem dimensions, and increment its reference counter
+ *
+ * On success returns the kernel actually stored in the cache.
+ * On error returns NULL; it means the passed solver ID
+ * is wrong, or no kernel for the given solver and subproblem
+ * dimensions is stored in the cache
+ */
+Kernel
+*findKernel(
+    struct KernelCache *kcache,
+    solver_id_t sid,
+    const KernelKey *key,
+    const void *extraKey);
+
+/*
+ * Get available size in the kernel cache
+ */
+size_t
+availKernelCacheSize(struct KernelCache *kcache);
+
+/*
+ * Remove all kernels from the cache
+ */
+void
+cleanKernelCache(struct KernelCache *kcache);
+
+size_t
+fullKernelSize(struct Kernel *kern);
+
+
+#if defined(TRACE_MALLOC)
+
+void
+printKernelCacheSize(struct KernelCache *kcache);
+
+#else       /* TRACE_MALLOC */
+
+static __inline void
+printKernelCacheSize(struct KernelCache *kcache)
+{
+    /* do nothing */
+    (void)kcache;
+}
+
+#endif      /* !TRACE_MALLOC */
+
+#endif /* KERN_CACHE_H_ */
diff --git a/src/include/kernel_extra.h b/src/include/kernel_extra.h
new file mode 100644
index 0000000..81e1f5c
--- /dev/null
+++ b/src/include/kernel_extra.h
@@ -0,0 +1,167 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KERNEL_EXTRA_H_
+#define KERNEL_EXTRA_H_
+
+#include <cltypes.h>
+
+enum {
+    MAX_SOLVER_PRIVATE_SIZE = 16
+};
+
+//
+// Moving BUILD_OPTS_MAXLEN here. Originally in clblas-internal.h
+//      Including "clblas-internal.h"
+enum {
+    MEMPAT_PER_BLASFN = 8,
+    BUILD_OPTS_MAXLEN = 256
+};
+
+/**
+ * @internal
+ * @brief BLAS kernel type identifiers
+ *
+ * @ingroup BLAS_SOLVERIF_SPEC
+ */
+typedef enum CLBlasKernelType {
+    CLBLAS_COMPUTING_KERNEL,        /**< Main computing kernel */
+    CLBLAS_PREP_A_KERNEL,           /**< Kernel preparing matrix A */
+    CLBLAS_PREP_B_KERNEL,           /**< Kernel preparing matrix B */
+    MAX_CLBLAS_KERNELS_PER_STEP
+} CLBlasKernelType;
+
+/**
+ * @internal
+ * @defgroup BLAS_SOLVERIF_SPEC BLAS specifics
+ * @ingroup SOLVERIF
+ */
+
+/*@{*/
+
+/**
+ * @brief BLAS kernel flags
+ *
+ * These flags uniquely determine problem options kernels are generated for
+ */
+typedef enum KernelExtraFlags {
+    /** Matches to a problem without any options */
+    KEXTRA_NO_FLAGS = 0,
+    KEXTRA_TRANS_A = 0x01,      /**< Matrix A should be transposed */
+    /** Matrix A should be taken in the conjugate form */
+    KEXTRA_CONJUGATE_A = 0x02,
+    KEXTRA_TRANS_B = 0x04,      /**< matrix B should be transposed */
+    /** Matrix B should be taken in the conjugate form */
+    KEXTRA_CONJUGATE_B = 0x08,
+    KEXTRA_COLUMN_MAJOR = 0x10, /**< Order is column major */
+    /**
+     * Matrix A is upper triangular, it is lower triangular
+     * if this flag is not set
+     */
+    KEXTRA_UPPER_TRIANG = 0x20,
+    /**
+     * Matrix A is placed on the right, it is placed
+     * on the left if this flag is not set
+     */
+    KEXTRA_SIDE_RIGHT = 0x40,
+    /**
+     * Unit diagonal matrix
+     */
+    KEXTRA_UNIT_DIAGONAL = 0x80,
+    /** kernel should process tails of upper level blocks in M dimension */
+    KEXTRA_TAILS_M = 0x100,
+    /** kernel should process tails of upper level blocks in N dimension */
+    KEXTRA_TAILS_N = 0x200,
+    /** kernel should process tails of upper level blocks in K dimension */
+    KEXTRA_TAILS_K = 0x400,
+    /** Beta multiplier is zero */
+    KEXTRA_BETA_ZERO = 0x800,
+    /** Disable vectorization at block copying for matrix A */
+    KEXTRA_NO_COPY_VEC_A = 0x1000,
+    /** Disable vectorization at block copying for matrix B */
+    KEXTRA_NO_COPY_VEC_B = 0x2000,
+    /** Disable vectorization at block copying for matrix C */
+    KEXTRA_NO_COPY_VEC_C = 0x4000,
+    // SYRXK specific flags
+    /** Diagonal solution blocks are evaluated in a separate kernel */
+    KEXTRA_SYRK_SEPARATE_DIAGONAL = 0x8000,
+    /** Evaluate diagonal solution blocks for a SYRXK function */
+    KEXTRA_SYRK_EVALUATE_DIAGONAL = 0x10000,
+    /** 2k rank update */
+    KEXTRA_SYRK_2K_RANK = 0x20000,
+    // BLAS2 specific flags
+    /** Incx increment is one */
+    KEXTRA_INCX_ONE = 0x40000,
+    /** Incy increment is one */
+    KEXTRA_INCY_ONE = 0x80000,
+    // Generator specific flags
+    /** MAD function can be used */
+    // FIXME: throw this kludge away
+    KEXTRA_ENABLE_MAD = 0x100000,
+    // FIXME: It's a kludge; pass the DeviceIdent structure on to generators instead
+    KEXTRA_VENDOR_AMD = 0x200000,
+    /* Flags showing not zero starting offsets for kernels */
+    KEXTRA_STARTM_NOT_ZERO = 0x400000,
+    KEXTRA_STARTN_NOT_ZERO = 0x800000,
+    //KEXTRA_STARTK_NOT_ZERO = 0x2000000,
+    /** Matrix A offset in a memory object is not zero */
+    KEXTRA_A_OFF_NOT_ZERO = 0x1000000,
+    /** Matrix B or vector X offset in a memory object is not zero */
+    KEXTRA_BX_OFF_NOT_ZERO = 0x2000000,
+    /** Matrix C or vector Y offset in a memory object is not zero */
+    KEXTRA_CY_OFF_NOT_ZERO = 0x4000000,
+    /** kernel should process tails of lower level blocks in M dimension */
+    KEXTRA_TAILS_M_LOWER = 0x8000000,
+    /** kernel should process tails of lower level blocks in N dimension */
+    KEXTRA_TAILS_N_LOWER = 0x10000000,
+    /** kernel should process tails of lower level blocks in K dimension */
+    KEXTRA_TAILS_K_LOWER = 0x20000000
+} KernelExtraFlags;
+
+/**
+ * @internal
+ * @brief extra information CLBLAS kernel generator
+ * @ingroup BLAS_SOLVERIF_SPEC
+ */
+typedef struct CLBLASKernExtra {
+    DataType dtype;             /**< Data type */
+    KernelExtraFlags flags;     /**< Kernel flags identifying a problem */
+    CLBlasKernelType kernType;  /**< Kernel type */
+    // Fixme: Deprecate it; now it is just for backward compatibility
+    unsigned int vecLen;        /**< vector length to evaluate with */
+    /** vector length for matrix A elements to evaluate with */
+    unsigned int vecLenA;
+    /** vector length for matrix B elements to evaluate with */
+    unsigned int vecLenB;
+    /*
+     * FIXME: remove this kludge; vectorization for the result should be
+     *        autodetected
+     */
+    unsigned int vecLenC;
+    char solverPriv[MAX_SOLVER_PRIVATE_SIZE];
+    char buildOptions[BUILD_OPTS_MAXLEN]; // Build Flags used for the kernel call
+} CLBLASKernExtra;
+
+/*
+ * function to compare blas kernels extra information
+ */
+int
+clblasKernelExtraCmp(const void *extra, const void *extraKey);
+
+/*@}*/
+
+#endif /* KERNEL_EXTRA_H_ */
diff --git a/src/include/kerngen.h b/src/include/kerngen.h
new file mode 100644
index 0000000..dd44b9e
--- /dev/null
+++ b/src/include/kerngen.h
@@ -0,0 +1,685 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Kernel generator related common definitions
+ */
+
+#ifndef KERNGEN_H_
+#define KERNGEN_H_
+
+#include <sys/types.h>
+#include <errno.h>
+
+#if defined (_MSC_VER)
+#include <msvc.h>
+#endif
+
+#include <defbool.h>
+#include <list.h>
+#include <cltypes.h>
+#include <mutex.h>
+#include <granulation.h>
+#include <trace_malloc.h>
+
+/**
+ * @internal
+ * @defgroup KGEN_INFRA Kernel generator infrastructure
+ */
+/*@{*/
+
+#define SUBDIM_UNUSED (size_t)-1
+
+enum {
+    MAX_TABS = 16,
+    MAX_STATEMENT_PRIORITY = 63,
+    MAX_STATEMENT_LENGTH = 4096
+};
+
+enum {
+    // maximum subproblem dimensions
+    MAX_SUBDIMS = 3,
+    // maximum code nesting
+    MAX_NESTING = 10,
+    KSTRING_MAXLEN = 256,
+    // generated function name max len
+    FUNC_NAME_MAXLEN = KSTRING_MAXLEN
+};
+
+typedef struct{ /* a decomposition: subproblem dimensions plus granularity */
+	SubproblemDim	subdims[MAX_SUBDIMS]; /**< subproblem dimensions, MAX_SUBDIMS entries */
+	PGranularity	pgran; /**< data parallelism granularity */
+}DecompositionStruct;
+
+struct KgenContext;
+struct KgenGuard;
+struct StatementBatch;
+
+/**
+ * @internal
+ * @defgroup KGEN_TYPES Types
+ * @ingroup KGEN_INFRA
+ */
+/*@{*/
+
+/**
+ * @internal
+ * @brief Memory fence type
+ */
+typedef enum CLMemFence {
+    /** Fence for operations against the local memory */
+    CLK_LOCAL_MEM_FENCE,
+    /** Fence for operations against the global memory */
+    CLK_GLOBAL_MEM_FENCE
+} CLMemFence;
+
+// TODO: deprecate
+typedef enum UptrType {
+    UPTR_GLOBAL,
+    UPTR_LOCAL,
+    UPTR_PRIVATE
+} UptrType;
+
+/**
+ * @internal
+ * @brief Null-terminated string being a part of a kernel
+ */
+typedef struct Kstring {
+    /** Buffer storing the string */
+    char buf[KSTRING_MAXLEN];
+} Kstring;
+
+/**
+ * @internal
+ * @brief Type of custom generator for loop unrolling
+ */
+typedef int
+(*LoopUnrollGen)(struct KgenContext *ctx, void *priv);
+
+/*@}*/
+
+/**
+ * @internal
+ * @brief Unrolled loop control information
+ */
+typedef struct LoopCtl {
+    const char *ocName;     /**< outer loop counter name */
+    union {
+        const char *name;
+        unsigned long val;
+    } outBound;             /**< outer loop bound */
+    bool obConst;           /**< outer loop bound is constant flag */
+    unsigned long inBound;  /**< inner loop bound */
+} LoopCtl;
+
+/**
+ * @internal
+ * @brief Set of loop unrolling subgenerators
+ */
+typedef struct LoopUnrollers {
+    /** generate preparative code before unrolling */
+    LoopUnrollGen preUnroll;
+    /** generate single step for unrolled body in the vectorized way */
+    LoopUnrollGen genSingleVec;
+    /** generate single step for unrolled body in the non-vectorized way */
+    LoopUnrollGen genSingle;
+    /** generate code that should be inserted just after unrolled loop body */
+    LoopUnrollGen postUnroll;
+    /** return the vector length */
+    LoopUnrollGen getVecLen;
+} LoopUnrollers;
+
+/*@}*/
+
+static __inline void
+emptyKstring(Kstring *kstr)
+{
+    *kstr->buf = '\0';    /* terminator in the first byte: empty string */
+}
+
+static __inline bool
+isKstringEmpty(const Kstring *kstr)
+{
+    return !*kstr->buf;   /* empty when the first byte is the terminator */
+}
+
+/**
+ * @internal
+ * @defgroup KGEN_CORE Core API
+ * @ingroup KGEN_INFRA
+ */
+/*@{*/
+
+/**
+ * @internal
+ * @brief Create new generator context
+ *
+ * @param[out] srcBuf        Source buffer; if NULL, statements are not
+ *                           actually stored anywhere, only their overall
+ *                           size is calculated
+ * @param[in]  srcBufLen     Maximal length of the source which is being
+ *                           generated; ignored if an actual buffer was not
+ *                           specified
+ * @param[in]  fmt           Format the source. Code formatting assumes
+ *                           tabulation and watch line width
+ *
+ * @return New generator context on success. Returns NULL
+ *         if there is not enough memory to allocate internal structures
+ */
+struct KgenContext
+*createKgenContext(char *srcBuf, size_t srcBufLen, bool fmt);
+
+/**
+ * @internal
+ * @brief Destroy a kernel generator context
+ *
+ * @param[out] ctx           An existing generator context to be destroyed
+ */
+void
+destroyKgenContext(struct KgenContext *ctx);
+
+/**
+ * @internal
+ * @brief Reset a kernel generator context used before
+ *
+ * @param[out] ctx           A generator context to be reset
+ *
+ * Clear the source buffer and another information associated
+ * with this context
+ */
+void
+resetKgenContext(struct KgenContext *ctx);
+
+/**
+ * @internal
+ * @brief Synchronize formatting of 2 contexts
+ *
+ * @param[out] srcCtx        Source generator context (the one being adjusted)
+ * @param[in]  dstCtx        Destination (target) generator context (read only)
+ * @param[in]  nrTabs        Number of tabs to be inserted in the source context.
+ *                           It is relative to the current nesting level of the
+ *                           target context. It must not be less than zero, and
+ *                           the resulting number of tabs, evaluated as
+ *                           the target context's nesting level plus 'nrTabs',
+ *                           must not exceed 'MAX_TABS'
+ *
+ * The function is useful when code from one context needs to be inserted
+ * into another one without disturbing the formatting.
+ *
+ * @return 0 on success, -EINVAL if the 'nrTabs' parameter is out
+ *         of range
+ */
+int
+kgenSyncFormatting(
+    struct KgenContext *srcCtx,
+    const struct KgenContext *dstCtx,
+    int nrTabs);
+
+/**
+ * @internal
+ * @brief Add a function declaration
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  decl          The declaration to be added
+ *
+ * @return 0 on success; -1 if the source code exceeds the buffer,
+ *           or the level of code nesting is not zero, or the return
+ *           type is not specified, or there is no parenthesis opening
+ *           the argument list
+ */
+int
+kgenDeclareFunction(struct KgenContext *ctx, const char *decl);
+
+/**
+ * @internal
+ * @brief Begin function body
+ *
+ * @param[out] ctx           Generator context
+ *
+ * Adds the opening bracket and increments a nesting counter.
+ *
+ * @return 0 on success; -1 if the source code exceeds the buffer
+ */
+int
+kgenBeginFuncBody(struct KgenContext *ctx);
+
+/**
+ * @internal
+ * @brief End function body
+ *
+ * @param[out] ctx           Generator context
+ *
+ * Adds the closing bracket and decrements a nesting counter
+ *
+ * @return 0 on success; -1 if the source code exceeds the buffer,
+ * or code nesting is not 1
+ */
+int
+kgenEndFuncBody(struct KgenContext *ctx);
+
+/**
+ * @internal
+ * @brief Get the last declared function name for the context
+ *
+ * @param[out] buf           A buffer to store the function name
+ * @param[in] buflen         Size of the buffer
+ * @param[in] ctx            Generator context
+ *
+ * @return 0 on success, with the name stored in 'buf'; -1
+ *         if no functions were declared or the passed buffer is
+ *         insufficient
+ */
+int
+kgenGetLastFuncName(
+    char *buf,
+    size_t buflen,
+    const struct KgenContext *ctx);
+
+/**
+ * @internal
+ * @brief Begin new execution branch: conditional branch or loop
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  stmt          A statement containing a branch control code.
+ *                           Ignored if NULL.
+ *
+ * The opening bracket and trailing new line symbol are added
+ * automatically and should not be passed
+ *
+ * @return 0 on success; -1 if the overall source exceeds the set
+ *         limit or nesting exceeds the maximum allowed one
+ */
+int
+kgenBeginBranch(struct KgenContext *ctx, const char *stmt);
+
+/**
+ * @internal
+ * @brief End the current code branch
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  stmt          A statement containing a branch control code
+ *
+ * The closing bracket as well as the trailing ';' and '\n' are added
+ * automatically and should not be passed.
+ * The statement passed in 'stmt' is appended after the closing bracket.
+ *
+ * @return 0 on success; -1 if the overall source exceeds the set limit,
+ *         or there is no opened branch
+ */
+int
+kgenEndBranch(struct KgenContext *ctx, const char *stmt);
+
+/**
+ * @internal
+ * @brief Add a statement to generated source
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  stmt          A statement to be added
+ *
+ * If formatting is enabled and the statement is multiline, all the lines are
+ * formatted automatically. It is strongly discouraged to use this function
+ * to add statements containing variable or function declarations, or branch
+ * bounds. The appropriate functions should be used for those to avoid
+ * unexpected side effects.
+ *
+ * @return 0 on success; -1 if the overall source exceeds the set limit
+ */
+int
+kgenAddStmt(struct KgenContext *ctx, const char *stmt);
+
+int
+kgenPrintf(struct KgenContext *ctx, const char *fmt,...);
+
+struct StatementBatch
+*createStmtBatch(void);
+
+int
+kgenAddStmtToBatch(
+    struct StatementBatch *batch,
+    int priority,
+    const char *stmt);
+
+int
+kgenBatchPrintf(
+    struct StatementBatch *batch,
+    int priority,
+    const char *fmt,...);
+
+int
+flushStmtBatch(struct KgenContext *ctx, struct StatementBatch *batch);
+
+void
+destroyStmtBatch(struct StatementBatch *batch);
+
+/**
+ * @internal
+ * @brief Add a blank line to generated source
+ *
+ * @param[out] ctx           Generator context
+ *
+ * @return 0 on success; -1 if the overall source exceeds
+ *           the set limit
+ */
+int
+kgenAddBlankLine(struct KgenContext *ctx);
+
+/**
+ * @internal
+ * @brief Get resulting source size
+ *
+ * @param[out] ctx           Generator context
+ *
+ * @return size of the overall source that was added to the
+ *         generator context, including the trailing null
+ *         byte
+ */
+size_t
+kgenSourceSize(struct KgenContext *ctx);
+
+/*@}*/
+
+/**
+ * @internal
+ * @defgroup KGEN_BASIC Basic generating functions
+ * @ingroup KGEN_INFRA
+ */
+/*@{*/
+
+/**
+ * @internal
+ * @brief Add barrier
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  fence         Fence type
+ *
+ * @return 0 on success, and -EOVERFLOW on buffer overflowing
+ */
+int
+kgenAddBarrier(struct KgenContext *ctx, CLMemFence fence);
+
+/**
+ * @internal
+ * @brief Add memory fence
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  fence         Fence type
+ *
+ * @return 0 on success, and -EOVERFLOW on buffer overflowing
+ */
+int
+kgenAddMemFence(struct KgenContext *ctx, CLMemFence fence);
+
+/**
+ * @internal
+ * @brief Add local ID declaration and evaluating expression
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  lidName       Local id variable name
+ * @param[in]  pgran         Data parallelism granularity
+ *
+ * The resulting expression depends on the work group dimension and size
+ * of the first one.
+ *
+ * @return 0 on success, and -EOVERFLOW on buffer overflowing
+ */
+int
+kgenDeclareLocalID(
+    struct KgenContext *ctx,
+    const char *lidName,
+    const PGranularity *pgran);
+
+/**
+ * @internal
+ * @brief Add work group ID declaration and evaluating expression
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  gidName       Group id variable name
+ * @param[in]  pgran         Data parallelism granularity
+ *
+ * The resulting expression depends on the work group dimension and size
+ * of the first one.
+ *
+ * @return 0 on success, and -EOVERFLOW on buffer overflowing
+ */
+int
+kgenDeclareGroupID(
+    struct KgenContext *ctx,
+    const char *gidName,
+    const PGranularity *pgran);
+
+/*
+ * TODO: deprecate when casting is eliminated
+ *
+ * declare unified pointers
+ *
+ * @withDouble: pointers to double based types are needed
+ *
+ * On success returns 0, on buffer overflowing returns -EOVERFLOW
+ */
+int
+kgenDeclareUptrs(struct KgenContext *ctx, bool withDouble);
+
+/*@}*/
+
+/**
+ * @internal
+ * @defgroup KGEN_HELPERS Generating helpers
+ * @ingroup KGEN_INFRA
+ */
+/*@{*/
+
+/**
+ * @internal
+ * @brief Assistant for loop body unrolling
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  loopCtl       Unrolled loop control information
+ * @param[in]  dtype         Data type to unroll the loop body for
+ * @param[in]  unrollers     Set of subgenerators;
+ *                           If 'preUnroll', 'postUnroll' or 'genSingleVec'
+ *                           is set to NULL, it is ignored. Vectorized unrolling
+ *                           is not used for 'COMPLEX_DOUBLE' type
+ * @param[out] priv          Private data for generators
+ *
+ * The unrolled loop can be either single or double. In the case
+ * of the double loop only the inner loop is unrolled, and the outer
+ * loop is generated in the standard way using the passed loop
+ * counter name and its bound. For the single loop the 'ocName' field of
+ * the 'loopCtl' structure should be NULL.
+ *
+ * @return 0 on success. On error returns negated error code:\n
+ *\n
+ *      -EOVERFLOW: code buffer overflowed\n
+ *      -EINVAL: invalid parameter is passed
+ *               (unsupported data type, or 'genSingle' generator
+ *               is not specified)
+ */
+int
+kgenLoopUnroll(
+    struct KgenContext *ctx,
+    LoopCtl *loopCtl,
+    DataType dtype,
+    const LoopUnrollers *unrollers,
+    void *priv);
+
+/**
+ * @internal
+ * @brief Create code generation guard
+ *
+ * @param[out] ctx           Generator context
+ * @param[in]  genCallback   Generator callback which is invoked if a function
+ *                           matching a pattern is not found
+ * @param[in]  patSize       Pattern size
+ *
+ * The guard prevents generating several functions matching the same
+ * pattern and, as a result, having the same name.
+ *
+ * @return a guard object on success; NULL if there is
+ *         not enough memory to allocate internal structures
+ */
+struct KgenGuard
+*createKgenGuard(
+    struct KgenContext *ctx,
+    int (*genCallback)(struct KgenContext *ctx, const void *pattern),
+    size_t patSize);
+
+/**
+ * @internal
+ * @brief Reinitialize generator guard
+ *
+ * @param[out] guard         An existing generation guard
+ * @param[out] ctx           Generator context
+ * @param[in]  genCallback   Generator callback which is invoked if a function
+ *                           matching a pattern is not found
+ * @param[in]  patSize       Pattern size
+ */
+void
+reinitKgenGuard(
+    struct KgenGuard *guard,
+    struct KgenContext *ctx,
+    int (*genCallback)(struct KgenContext *ctx, const void *pattern),
+    size_t patSize);
+
+/**
+ * @internal
+ * @brief Find an already generated function or generate it
+ *
+ * @param[out] guard         An existing generation guard
+ * @param[in]  pattern       Pattern the function being looked for should match
+ * @param[out] name          Buffer to store a name of the function
+ * @param[in]  nameLen       Name buffer length
+ *
+ * At first it tries to find an already generated function matching the passed
+ * pattern. If the guard doesn't find the function, it invokes the generator
+ * callback
+ *
+ * NOTE: names of generated functions should not exceed 'FUNC_NAME_MAXLEN'
+ *       constant.
+ *
+ * @return 0 on success, otherwise returns a negated error code:\n
+ *      -ENOMEM: not enough memory to allocate internal structures\n
+ *      -EOVERFLOW: source buffer overflowing
+ */
+int
+findGenerateFunction(
+    struct KgenGuard *guard,
+    const void *pattern,
+    char *name,
+    size_t nameLen);
+
+/**
+ * @internal
+ * @brief Destroy code generation guard
+ *
+ * @param[out] guard         A guard instance to be destroyed
+ */
+void
+destroyKgenGuard(struct KgenGuard *guard);
+
+/*@}*/
+
+/**
+ * @internal
+ * @defgroup KGEN_AUX_FUNCS Auxiliary functions
+ * @ingroup KGEN_INFRA
+ */
+/*@{*/
+
+void
+kstrcpy(Kstring *kstr, const char *str);
+
+void
+ksprintf(Kstring *kstr, const char *fmt,...);
+
+void
+kstrcatf(Kstring *kstr, const char *fmt,...);
+
+// unified pointer type name
+const char
+*uptrTypeName(UptrType type);
+
+/**
+ * @internal
+ * @brief get a function prefix dependent on the BLAS data type
+ *
+ * @param[in]  type          Data type
+ *
+ * A literal returned by the function is assumed to be used as the prefix
+ * of some generated function to put the accent on the BLAS data type it
+ * operates with.
+ *
+ * @return 0 if an unknown type is passed
+ */
+char
+dtypeToPrefix(DataType type);
+
+/**
+ * @internal
+ * @brief convert a BLAS data type to the respective built-in OpenCL type
+ *
+ * @param[in]  dtype         Data type
+ *
+ * @return NULL if an unknown type is passed
+ */
+const char
+*dtypeBuiltinType(DataType dtype);
+
+/**
+ * @internal
+ * @brief Return unified pointer field corresponding to the data type
+ *
+ * @param[in]  dtype         Data type
+ *
+ * @return NULL if an unknown type is passed
+ */
+const char
+*dtypeUPtrField(DataType dtype);
+
+/**
+ * @internal
+ * @brief Return "one" value string depending on the data type
+ *
+ * @param[in]  dtype         Data type
+ *
+ * @return NULL if an unknown type is passed
+ */
+const char
+*strOne(DataType dtype);
+
+/**
+ * @internal
+ * @brief Get vector type name
+ *
+ * @param[in]  dtype         Data type
+ * @param[in]  vecLen        Vector length for the type. Must be set to 1 if
+ *                           the type is scalar.
+ * @param[out] typeName      Location to store pointer to a constant string
+ *                           with the type name
+ * @param[out] typePtrName   Location to store unified pointer field
+ *                           corresponding to the vector consisting of elements
+ *                           of \b dtype \b type
+ */
+void
+getVectorTypeName(
+    DataType dtype,
+    unsigned int vecLen,
+    const char **typeName,
+    const char **typePtrName);
+
+/*@}*/
+
+#endif /* KERNGEN_H_ */
diff --git a/src/include/list.h b/src/include/list.h
new file mode 100644
index 0000000..38ca7c6
--- /dev/null
+++ b/src/include/list.h
@@ -0,0 +1,116 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Work with circular double linked lists
+ */
+
+#ifndef LIST_H_
+#define LIST_H_
+
+#include <defbool.h>
+
+#if defined (_WIN64)
+typedef unsigned long long prt_size_t;
+#else
+typedef unsigned long prt_size_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define offset_of(field, type)   /* byte offset of 'field' in 'type' */ \
+    ((prt_size_t)(&((type*)0)->field))
+/* Fully parenthesized so that container_of(...)->member parses as intended */
+#define container_of(node, field, type)                        \
+    ((type*)((prt_size_t)(node) - offset_of(field, type)))
+
+typedef struct ListNode {
+    struct ListNode *prev;
+    struct ListNode *next;
+} ListNode;
+
+typedef ListNode ListHead;
+typedef void (*ListAction)(ListNode *node);
+typedef void (*ListPrivAction)(ListNode *node, void *priv);
+
+/*
+ *  Type of function comparing list node contents with a key.
+ *  On equality such a function must return 0
+ */
+typedef int (*ListCmpFn)(const ListNode *node, const void *key);
+
+static __inline
+bool isListEmpty(const ListHead *list)
+{
+    return (list->next == list);    /* an empty head links to itself */
+}
+
+static __inline ListNode
+*listNodeFirst(const ListHead *head)
+{
+    return head->next;    /* first node; 'head' itself if the list is empty */
+}
+
+static __inline ListNode
+*listNodeLast(const ListHead *head)
+{
+    return head->prev;    /* last node; 'head' itself if the list is empty */
+}
+
+static __inline void
+listInitHead(ListHead *head)
+{
+    head->next = head;    /* empty circular list: both links point at the head */
+    head->prev = head;
+}
+
+void
+listAddToTail(ListHead *head, ListNode *node);
+
+void
+listAddToHead(ListHead *head, ListNode *node);
+
+void listDel(ListNode *node);
+
+ListNode
+*listDelFromTail(ListHead *head);
+
+void
+listDoForEach(ListHead *head, ListAction act);
+
+void
+listDoForEachSafe(ListHead *head, ListAction act);
+
+void
+listDoForEachPriv(const ListHead *head, ListPrivAction act, void *actPriv);
+
+void
+listDoForEachPrivSafe(const ListHead *head, ListPrivAction act, void *actPriv);
+
+ListNode
+*listNodeSearch(const ListHead *head, const void *key, ListCmpFn cmp);
+
+size_t
+listLength(const ListHead *head);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIST_H_ */
diff --git a/src/include/mempat.h b/src/include/mempat.h
new file mode 100644
index 0000000..70f0703
--- /dev/null
+++ b/src/include/mempat.h
@@ -0,0 +1,86 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Memory usage pattern related definitions
+ */
+
+#ifndef MEMPAT_H_
+#define MEMPAT_H_
+
+#include <solver.h>
+
+enum {
+    MAX_MEMORY_PATTERNS = 16
+};
+
+/**
+ * @internal
+ * @brief Memory level identifiers
+ *
+ * @ingroup SOLVERIF
+ */
+typedef enum CLMemLevel {
+    CLMEM_LEVEL_LDS = 0x01,        /**< Local data storage */
+    CLMEM_LEVEL_L1 = 0x02,         /**< L1 cache */
+    CLMEM_LEVEL_L2 = 0x04          /**< L2 cache */
+} CLMemLevel;
+
+/**
+ * @internal
+ * @brief Memory type identifiers
+ *
+ * @ingroup SOLVERIF
+ */
+typedef enum CLMemType {
+    CLMEM_GLOBAL_MEMORY,
+    CLMEM_LOCAL_MEMORY,
+    CLMEM_IMAGE,
+    // FIXME: it's for backward compatibility, remove after blkmul deprecation
+    CLMEM_BUFFER = CLMEM_LOCAL_MEMORY
+} CLMemType;
+
+// memory levels set
+typedef unsigned int meml_set_t;
+
+/*
+ * FIXME: deprecate cuLevel and thLevel
+ */
+
+/**
+ * @internal
+ * @brief Solver memory pattern description structure
+ *
+ * The structure describes memory usage features and is used
+ * by the frontend when choosing a solving strategy and decomposition
+ * block sizes
+ *
+ * @ingroup SOLVERIF
+ */
+typedef struct MemoryPattern {
+    const char *name;           /**< Pattern's name */
+    unsigned int nrLevels;      /**< Decomposition levels number */
+    /** Level a problem is decomposed among compute units at */
+    int cuLevel;
+    /** Level a problem is decomposed among threads within single compute unit */
+    int thLevel;
+    SolverOps *sops;            /**< Solver operations */
+    /** extra information specific for the application field */
+    void *extra;
+} MemoryPattern;
+
+#endif /* MEMPAT_H_ */
diff --git a/src/include/msvc.h b/src/include/msvc.h
new file mode 100644
index 0000000..55e2ebf
--- /dev/null
+++ b/src/include/msvc.h
@@ -0,0 +1,34 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Declarations not supported in visual studio
+ * by default
+ */
+
+#ifndef MSVC_H_
+#define MSVC_H_
+
+#ifndef EOVERFLOW
+#define EOVERFLOW 1000
+#endif  /* EOVERFLOW */
+
+#define snprintf _snprintf
+
+typedef long ssize_t;
+
+#endif /* MSVC_H_ */
diff --git a/src/include/mutex.h b/src/include/mutex.h
new file mode 100644
index 0000000..5e920fe
--- /dev/null
+++ b/src/include/mutex.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef MUTEX_H_
+#define MUTEX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* mutex_t;
+
+mutex_t* mutexInit(void);
+int mutexDestroy(mutex_t *mutex);
+int mutexLock(mutex_t *mutex);
+int mutexUnlock(mutex_t *mutex);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* MUTEX_H_ */
diff --git a/src/include/solver.h b/src/include/solver.h
new file mode 100644
index 0000000..411519e
--- /dev/null
+++ b/src/include/solver.h
@@ -0,0 +1,196 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SOLVER_H_
+#define SOLVER_H_
+
+#include <defbool.h>
+
+#include <cltypes.h>
+#include <kerngen.h>
+#include <clkern.h>
+#include <clBLAS.h>
+#include <kernel_extra.h>
+
+struct Kernel;
+
+// OpenCL solver ID
+typedef int solver_id_t;
+
+/**
+ * @internal
+ * @defgroup SOLVERIF Solver interface
+ *
+ * This interface binds the library frontend to the library backend
+ */
+/*@{*/
+
+/**
+ * @internal
+ * @brief Solver flags
+ */
+typedef enum SolverFlags {
+    /** supports 1D work space */
+    SF_WSPACE_1D = 0x01,
+    /** supports 2D work space */
+    SF_WSPACE_2D = 0x02,
+    /** input data blocks at the top level must be square */
+    SF_TOP_INPUT_SQUARE_BLOCKS = 0x04
+} SolverFlags;
+
+typedef enum PatternPerformance{
+    PPERF_NOT_SUPPORTED = -1,
+    PPERF_POOR = 0,
+    PPERF_AVERAGE,
+    PPERF_GOOD,
+    PPERF_BEST
+} PatternPerformance;
+
+typedef enum CheckCalcPGran{
+    PGRAN_CHECK = 0,
+    PGRAN_CALC
+} CheckCalcPGran;
+
+/**
+ * @internal
+ * @brief type of function generating kernel source for an
+ *         OpenCL based solver
+ *
+ * @param[out] buf         Pointer to a buffer to store a generated kernel to
+ * @param[in] buflen       Length of the buffer
+ * @param[in] subdims      Subproblem dimensions to generate an optimal kernel
+ * @param[in] pgran        Data parallelism granularity
+ * @param[in] extra        Generator extra information depending on the
+ *                          application fields
+ *
+ * If the pointer to the buffer is NULL, the function should just calculate
+ * needed size of the buffer to fit the code in.
+ *
+ * @return size of the generated kernel source on success; negated error code
+ *         otherwise
+ *   - -ENOMEM: not enough memory to allocate internal structures
+ *   - -EOVERFLOW: generated source exceeds the buffer size
+ *   - -EINVAL: invalid argument is passed
+ */
+typedef ssize_t
+(*SolverKgen)(
+   char *buf,
+   size_t buflen,
+   const SubproblemDim *subdims,
+   const PGranularity *pgran,
+   void *extra);
+
+/**
+ * @internal
+ * @brief Solver operations
+ *
+ * The 'args' parameter for 'calcPrepWorkGroups',
+ * and the second parameter for the 'assignKargs' methods plays the role of pointer
+ * to a kernel arguments structure depending on the application field.
+ */
+typedef struct SolverOps {
+    /** Kernel generator */
+    SolverKgen genKernel;
+
+    /** Assign kernel arguments; the first argument is kernel argument batch
+     *  passed immediately to a kernel */
+    void (*assignKargs)(KernelArg*, const void* args, const void *extra);
+
+    /** Check if available LDS size is enough to fit all needed data at such
+     *  granulation; 'kernelArgs' - kernel arguments depending on the
+     *  application fields */
+    bool (*isFitToLDS)(
+        SubproblemDim *dims,
+        DataType,
+        cl_ulong ldsSize,
+        const void *args);
+
+    /** Get the pattern's performance estimation for specified flags,
+     * arguments and granulation.
+     * It is used for selecting the most suitable pattern for the current problem */
+    int (*getPatternPerf)(
+        unsigned int kflags,
+        const void *args);
+
+    /**
+     * Inner decomposition axis matching to the fastest moving OpenCL
+     * work dimension. Used only for those patterns which use 2 dimensional
+     * decomposition
+     */
+    DecompositionAxis (*innerDecompositionAxis)(const void *args);
+
+    /** Calculate number of needed global threads to execute a kernel */
+    void (*calcThreads)(
+        size_t threads[2],
+        const SubproblemDim *subdims,
+        const PGranularity *pgran,
+        const void *args,
+        const void *extra);
+
+    /** Set number of lines of the same top level block stored into the image
+     *  together and the direction of blocks storing. A solver that uses images
+     *  and stores data to images by blocks must provide the method */
+    void (*imgPackMode)(
+        const void *extra,
+        const SubproblemDim *subdims,
+        int dataID,
+        unsigned int *rate,
+        clblasOrder *order);
+
+    /** Get solver flags */
+    SolverFlags (*getFlags)(void);
+
+    /** Correct problem arguments and extra kernel parameters
+     *  depending on solver specifics. Basically, a solver should not
+     *  change any arguments that come from the API level to avoid any
+     *  confusing points */
+    void (*fixupArgs)(void *args, SubproblemDim* pSubDims, void *extra);
+
+    /** Function, returning default decomposition for the pattern */
+    int ( *getDefaultDecomp)(
+        PGranularity *pgran,
+        SubproblemDim *subdims,
+        unsigned int subdimsNum,
+        void *pArgs);
+
+    /** Perform validation of decomposition.
+      * If "check" flag set to true: validate specified decomposition and
+      * check, if specified granulation is valid for it.
+      * If "check" flag set to false: calculate granulation,
+      * fitting the specified decomposition, if possible */
+    bool (*checkCalcDecomp)(
+        PGranularity *pgran,
+        SubproblemDim *subdims,
+        unsigned int subdimsNum,
+        DataType dtype,
+        int check);
+
+
+	/*
+	 SetBuildOptions
+	*/
+	void (*setBuildOptions)( char *buildOptsStr, const void *args);
+
+	/*
+  	 * selectVectorization
+	*/
+    KernelExtraFlags (*selectVectorization)( void *kargs, unsigned int vlen);
+} SolverOps;
+
+/*@}*/
+
+#endif /* SOLVER_H_ */
diff --git a/src/include/trace_malloc.h b/src/include/trace_malloc.h
new file mode 100644
index 0000000..3dfa315
--- /dev/null
+++ b/src/include/trace_malloc.h
@@ -0,0 +1,75 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Define simple functionality to track memory leaks in order to separate
+ * library leaks from leaks in the other components and to take info in
+ * a human friendly format
+ */
+
+#ifndef TRACE_MALLOC_H_
+#define TRACE_MALLOC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(TRACE_MALLOC)
+
+/*
+ * With TRACE_MALLOC defined, the standard allocation calls are redirected
+ * to the debug* functions declared below; the allocating variants also pass
+ * the call site (__FILE__, __LINE__). Every macro argument is parenthesized
+ * so that an expression argument, e.g. calloc(n + 1, size), is multiplied
+ * as a whole instead of being torn apart by operator precedence.
+ */
+#define malloc(size)            debugMalloc((size), __FILE__, __LINE__)
+#define calloc(nmemb, size)     debugCalloc((size) * (nmemb), __FILE__, __LINE__)
+#define realloc(ptr, size)      debugRealloc((ptr), (size), __FILE__, __LINE__)
+#define free(ptr)               debugFree((ptr))
+
+void initMallocTrace(void);
+void *debugMalloc(size_t size, const char *file, int line);
+void *debugCalloc(size_t size, const char *file, int line);
+void *debugRealloc(void *ptr, size_t size, const char *file, int line);
+void debugFree(void *ptr);
+void printMallocStatistics(void);
+void printMemLeaksInfo(void);
+void releaseMallocTrace(void);
+
+#else       /* TRACE_MALLOC */
+
+static __inline void initMallocTrace(void)
+{
+    /* do nothing */
+}
+
+static __inline void printMallocStatistics(void)
+{
+    /* do nothing */
+}
+
+static __inline void printMemLeaksInfo(void)
+{
+    /* do nothing */
+}
+
+static __inline void releaseMallocTrace(void)
+{
+    /* do nothing */
+}
+
+#endif      /* !TRACE_MALLOC */
+
+#ifdef __cplusplus
+}      /* extern "C" { */
+#endif
+
+#endif /* TRACE_MALLOC_H_ */
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
new file mode 100644
index 0000000..5bc8e2a
--- /dev/null
+++ b/src/library/CMakeLists.txt
@@ -0,0 +1,329 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+set(SRC_BLAS
+    blas/init.c
+    blas/impl.c
+    blas/scimage.c
+    blas/xgemv.c
+    blas/xsymv.c
+    blas/xgemm.c
+    blas/xtrmm.c
+    blas/xtrsm.c
+    blas/xsyrk.c
+    blas/xsyr2k.c
+    blas/xtrmv.c
+    blas/xtrsv.c
+    blas/xsymm.c
+	blas/xgemm2.c
+    blas/xger.c
+	blas/xsyr.c
+	blas/xsyr2.c
+	blas/xher.c
+	blas/xher2.c
+	blas/xhemv.c
+	blas/xhemm.c
+	blas/xherk.c
+	blas/xhpmv.c
+	blas/xspmv.c
+	blas/xgbmv.c
+	blas/xtbmv.c
+	blas/xshbmv.c
+	blas/xtbsv.c
+	blas/xher2k.c
+	blas/xswap.c
+	blas/xscal.c
+	blas/xcopy.c
+	blas/xaxpy.c
+	blas/xdot.c
+	blas/xrotg.c
+	blas/xrotmg.c
+	blas/xrot.c
+	blas/xrotm.c
+    blas/ixamax.c
+	blas/xnrm2.c
+    blas/xasum.c
+)
+
+set(SRC_BLAS_HEADERS
+    blas/include/blas_funcs.h
+    blas/include/matrix_dims.h
+    blas/include/matrix_props.h
+    blas/include/blas_mempat.h
+    blas/include/clblas-internal.h
+    blas/include/solution_seq.h
+    blas/include/events.h
+)
+
+set(SRC_BLAS_GENERIC
+    blas/generic/common.c
+    blas/generic/blas_funcs.c
+    blas/generic/events.c
+    blas/generic/matrix_props.c
+    blas/generic/matrix_dims.c
+    blas/generic/kdump.c
+    blas/generic/solution_assert.c
+    blas/generic/solution_seq.c
+    blas/generic/solution_seq_make.c
+    blas/generic/problem_iter.c
+    blas/generic/kernel_extra.c)
+
+set(SRC_BLAS_GENS
+    blas/gens/gen_init.c
+    blas/gens/blas_kgen.c
+	blas/gens/blas_subgroup.c
+    blas/gens/gen_helper.c
+    blas/gens/tilemul.c
+    blas/gens/fetch.c
+    blas/gens/tile.c
+    blas/gens/tile_iter.c
+    blas/gens/decomposition.c
+    blas/gens/gemv.c
+    blas/gens/symv.c
+    blas/gens/gemm.c
+    blas/gens/trmm.c
+    blas/gens/trsm.c
+    blas/gens/syrxk.c
+    blas/gens/trxm_common.c
+    blas/gens/trsm_kgen.c
+    blas/gens/xxmv_common.c
+    blas/gens/legacy/blas_kgen_legacy.c
+    blas/gens/legacy/gen_helper_legacy.c
+    blas/gens/legacy/trxm_common_legacy.c
+    blas/gens/legacy/trsm_kgen_legacy.c
+    blas/gens/legacy/blkmul.c
+    blas/gens/legacy/gemm_lds.c
+    blas/gens/legacy/gemm_img.c
+    blas/gens/legacy/trmm_lds.c
+    blas/gens/legacy/trmm_img.c
+    blas/gens/legacy/trsm_lds.c
+    blas/gens/legacy/trsm_img.c
+    blas/gens/legacy/trsm_cached_lds.c
+
+	blas/gens/trmv_reg.cpp
+	blas/gens/ger_lds.cpp
+	blas/gens/trsv_trtri.cpp
+	blas/gens/trsv_gemv.cpp
+	blas/gens/kprintf.cpp
+	blas/gens/syr_lds.cpp
+	blas/gens/her_lds.cpp
+	blas/gens/syr2_lds.cpp
+	blas/gens/her2_lds.cpp
+	blas/gens/symm_cached.cpp
+	blas/gens/gemm_cached.cpp
+	blas/gens/gemm_tail_cached.cpp
+	blas/gens/gbmv.cpp
+	blas/gens/tuned_numbers.c
+	blas/gens/swap_reg.cpp
+    blas/gens/scal_reg.cpp
+    blas/gens/copy_reg.cpp
+    blas/gens/axpy_reg.cpp
+    blas/gens/dot.cpp
+    blas/gens/reduction.cpp
+    blas/gens/rotg_reg.cpp
+    blas/gens/rotmg_reg.cpp
+    blas/gens/rotm_reg.cpp
+    blas/gens/iamax.cpp
+    blas/gens/nrm2.cpp
+    blas/gens/asum.cpp
+)
+
+set (SRC_CL_TEMPLATES
+    gemm.cl
+    gemm_helper.cl
+    gbmv.cl
+    ger.cl
+    her.cl
+    symm_helper.cl
+    syr2_her2.cl
+    syr_her.cl
+    trsv.cl
+    her2.cl
+    symm.cl
+    syr2.cl
+    syr.cl
+    trmv.cl
+    trsv_gemv.cl
+    swap.cl
+    scal.cl
+    copy.cl
+    axpy.cl
+    dot.cl
+    reduction.cl
+    rotg.cl
+    rotmg.cl
+    rotm.cl
+    iamax.cl
+    nrm2.cl
+    asum.cl
+)
+
+set(SRC_BLAS_GENERIC_HEADERS
+    blas/generic/solution_assert.h
+    blas/generic/problem_iter.h
+)
+
+set(SRC_BLAS_GENS_HEADERS
+    blas/gens/fetch.h
+    blas/gens/blas_kgen.h
+	blas/gens/blas_subgroup.h
+    blas/gens/gen_helper.h
+    blas/gens/init.h
+    blas/gens/trxm_common.h
+    blas/gens/trsm_kgen.h
+    blas/gens/xxmv_common.h
+    blas/gens/tile.h
+    blas/gens/tile_iter.h
+    blas/gens/tuned_numbers.h
+)
+
+set(SRC_COMMON
+    common/list.c
+    common/clkern.c
+    common/kern_cache.c
+    common/kerngen_core.c
+    common/kgen_basic.c
+    common/kgen_loop_helper.c
+    common/kgen_guard.c
+    common/misc.c
+    common/devinfo.c
+    common/devinfo-cache.c
+    common/mutex.c
+    common/trace_malloc.c
+)
+
+set(SRC_COMMON_GENS
+    common/gens/dblock_kgen.c
+)
+
+set(SRC_TOOLS
+    tools/tune/toolslib.c
+    tools/tune/fileio.c
+    tools/tune/dimension.c
+    tools/tune/storage_init.c
+    tools/tune/storage_io.c
+    tools/tune/storage_data.c
+)
+
+set(CLBLAS_SOURCES
+    ${SRC_COMMON} ${SRC_COMMON_GENS} ${SRC_BLAS} ${SRC_BLAS_GENERIC}
+    ${SRC_BLAS_GENS} ${SRC_TOOLS} ../clBLAS.def
+)
+set(GLOBAL_HEADERS
+    ${clBLAS_SOURCE_DIR}/clBLAS.h
+    ${clBLAS_SOURCE_DIR}/clBLAS-complex.h 
+    ${clBLAS_SOURCE_DIR}/include/clkern.h
+    ${clBLAS_SOURCE_DIR}/include/cltypes.h
+    ${clBLAS_SOURCE_DIR}/include/dblock_kgen.h
+    ${clBLAS_SOURCE_DIR}/include/defbool.h
+    ${clBLAS_SOURCE_DIR}/include/devinfo.h
+    ${clBLAS_SOURCE_DIR}/include/dis_warning.h
+    ${clBLAS_SOURCE_DIR}/include/kern_cache.h
+    ${clBLAS_SOURCE_DIR}/include/kernel_extra.h
+    ${clBLAS_SOURCE_DIR}/include/kerngen.h
+    ${clBLAS_SOURCE_DIR}/include/list.h
+    ${clBLAS_SOURCE_DIR}/include/mempat.h
+    ${clBLAS_SOURCE_DIR}/include/msvc.h
+    ${clBLAS_SOURCE_DIR}/include/mutex.h
+    ${clBLAS_SOURCE_DIR}/include/solver.h
+)
+
+source_group(common FILES ${SRC_COMMON})
+source_group(common\\gens FILES ${SRC_COMMON_GENS})
+source_group(blas FILES ${SRC_BLAS})
+source_group(blas\\include FILES ${SRC_BLAS_HEADERS})
+source_group(blas\\generic FILES ${SRC_BLAS_GENERIC})
+source_group(blas\\gens FILES ${SRC_BLAS_GENS}
+    ${SRC_BLAS_GENS_HEADERS})
+
+include_directories(${OPENCL_INCLUDE_DIRS}
+    ${clBLAS_SOURCE_DIR}
+    ${clBLAS_SOURCE_DIR}/include
+    ${clBLAS_SOURCE_DIR}/library/blas/include
+    ${clBLAS_SOURCE_DIR}/library/tools/tune
+    ${clBLAS_BINARY_DIR}/include
+)
+
+option( BLAS_DUMP_CLBLAS_KERNELS "Force the library to dump OpenCL kernels to disk" OFF )
+if( BLAS_DUMP_CLBLAS_KERNELS )
+    add_definitions( -DDUMP_CLBLAS_KERNELS )
+endif()
+
+option( BLAS_KEEP_KERNEL_SOURCES "Prevent the library from stripping source from kernels" OFF )
+if( BLAS_KEEP_KERNEL_SOURCES )
+    add_definitions( -DKEEP_CLBLAS_KERNEL_SOURCES )
+endif()
+
+option( BLAS_TRACE_MALLOC "Simple functionality to track memory leaks" OFF )
+if( BLAS_TRACE_MALLOC )
+    add_definitions( -DTRACE_MALLOC )
+endif()
+
+option( BLAS_PRINT_BUILD_ERRORS "Enable printing of OpenCL compiler errors on stdout" ON )
+if( BLAS_PRINT_BUILD_ERRORS )
+    add_definitions( -DPRINT_BUILD_ERRORS )
+endif()
+
+# Build the tplgen tool as an external project taken from
+# library/tools/tplgen (its install step is disabled), then expose a
+# GENERATE_CLT target that runs it over the files listed in SRC_CL_TEMPLATES
+# from inside library/blas/gens/clTemplates.
+include(ExternalProject)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+    ExternalProject_Add(
+        tplgen
+        URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
+        INSTALL_COMMAND ""
+    )
+    add_custom_target( GENERATE_CLT
+        COMMAND ${CMAKE_BINARY_DIR}/library/tplgen-prefix/src/tplgen-build/tplgen -o ../../include/ ${SRC_CL_TEMPLATES}
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates
+    )
+else()
+    # Other compilers: configure through configure.bat, build the Debug
+    # configuration and run the resulting Debug\tplgen.exe.
+    ExternalProject_Add(
+        tplgen
+        URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
+        CONFIGURE_COMMAND "${CMAKE_BINARY_DIR}\\library\\tplgen-prefix\\src\\tplgen\\configure.bat"
+        BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Debug
+        INSTALL_COMMAND ""
+    )
+    add_custom_target( GENERATE_CLT
+        COMMAND ${CMAKE_BINARY_DIR}\\library\\tplgen-prefix\\src\\tplgen-build\\Debug\\tplgen.exe -o ..\\..\\include ${SRC_CL_TEMPLATES}
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}\\library\\blas\\gens\\clTemplates
+    )
+endif()
+
+# The generator must exist before the templates can be produced.
+add_dependencies(GENERATE_CLT tplgen)
+add_library(clBLAS SHARED ${CLBLAS_SOURCES} ${GLOBAL_HEADERS} ${SRC_BLAS_HEADERS} ${SRC_BLAS_GENS_HEADERS})
+add_dependencies(clBLAS GENERATE_CLT)
+set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION})
+set_target_properties(clBLAS PROPERTIES SOVERSION ${clBLAS_SOVERSION})
+target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+
+if( TARGET_PLATFORM EQUAL 64 )
+	# CPack configuration; include the executable into the package
+	install( TARGETS clBLAS
+			RUNTIME DESTINATION bin64
+			LIBRARY DESTINATION lib64
+			ARCHIVE DESTINATION lib64/import
+			)
+else()
+	# CPack configuration; include the executable into the package
+	install( TARGETS clBLAS
+			RUNTIME DESTINATION bin32
+			LIBRARY DESTINATION lib32
+			ARCHIVE DESTINATION lib32/import
+			)
+endif()
diff --git a/src/library/blas/generic/blas_funcs.c b/src/library/blas/generic/blas_funcs.c
new file mode 100644
index 0000000..34d00f5
--- /dev/null
+++ b/src/library/blas/generic/blas_funcs.c
@@ -0,0 +1,96 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <blas_funcs.h>
+
+/*
+ * Map a function identifier to its BLAS level.
+ *
+ * Returns 1 or 2 for the identifiers listed explicitly below and 3 for
+ * every other identifier.
+ */
+int
+funcBlasLevel(BlasFunctionID funcID)
+{
+    int level;
+
+    switch (funcID) {
+    /* level 1 functions */
+    case CLBLAS_SWAP:
+    case CLBLAS_SCAL:
+    case CLBLAS_COPY:
+    case CLBLAS_AXPY:
+    case CLBLAS_DOT:
+    case CLBLAS_REDUCTION_EPILOGUE:
+    case CLBLAS_ROTG:
+    case CLBLAS_ROTMG:
+    case CLBLAS_ROT:
+    case CLBLAS_ROTM:
+    case CLBLAS_iAMAX:
+    case CLBLAS_NRM2:
+    case CLBLAS_ASUM:
+        level = 1;
+        break;
+
+    /* level 2 functions */
+    case CLBLAS_GEMV:
+    case CLBLAS_SYMV:
+    case CLBLAS_TRMV:
+    case CLBLAS_TRSV:
+    case CLBLAS_TRSV_GEMV:
+    case CLBLAS_HEMV:
+    case CLBLAS_SYR:
+    case CLBLAS_SYR2:
+    case CLBLAS_GER:
+    case CLBLAS_HER:
+    case CLBLAS_HER2:
+    case CLBLAS_TPMV:
+    case CLBLAS_SPMV:
+    case CLBLAS_HPMV:
+    case CLBLAS_TPSV:
+    case CLBLAS_SPR:
+    case CLBLAS_SPR2:
+    case CLBLAS_HPR:
+    case CLBLAS_HPR2:
+    case CLBLAS_GBMV:
+    case CLBLAS_TBMV:
+    case CLBLAS_SBMV:
+    case CLBLAS_HBMV:
+    case CLBLAS_TBSV:
+        level = 2;
+        break;
+
+    /* everything else is reported as level 3 */
+    default:
+        level = 3;
+        break;
+    }
+
+    return level;
+}
+
+/*
+ * Report whether a function is treated as having a beta scalar: this is
+ * the case for exactly those identifiers for which funcHasTriangMatrix()
+ * returns false.
+ */
+bool
+funcHasBeta(BlasFunctionID funcID)
+{
+    if (funcHasTriangMatrix(funcID)) {
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Report whether a function identifier is one of those listed here as
+ * operating on a triangular matrix (TRMM, TRSM, TRMV, HEMV, TRSV).
+ */
+bool
+funcHasTriangMatrix(BlasFunctionID funcID)
+{
+    switch (funcID) {
+    /* all of the following cases share one answer */
+    case CLBLAS_TRMM:
+    case CLBLAS_TRSM:
+    case CLBLAS_TRMV:
+    case CLBLAS_HEMV:
+    case CLBLAS_TRSV:
+        return true;
+    default:
+        return false;
+    }
+}
diff --git a/src/library/blas/generic/common.c b/src/library/blas/generic/common.c
new file mode 100644
index 0000000..9e26887
--- /dev/null
+++ b/src/library/blas/generic/common.c
@@ -0,0 +1,877 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <stdlib.h>
+
+#include <clBLAS.h>
+#include <clkern.h>
+#include <cltypes.h>
+#include <stdio.h>
+
+#include "clblas-internal.h"
+
+#if defined(DUMP_CLBLAS_KERNELS) && !defined(KEEP_CLBLAS_KERNEL_SOURCES)
+#define KEEP_CLBLAS_KERNEL_SOURCES
+#endif
+
+int clblasInitialized = 0;
+CLBlasSolvers clblasSolvers[BLAS_FUNCTIONS_NUMBER];
+struct KernelCache *clblasKernelCache = NULL;
+
+enum {
+    BUILD_LOG_SIZE = 65536
+};
+
+/*
+ * Write 'code' through 'error' unless the caller passed NULL, i.e. is not
+ * interested in the error code.
+ */
+static __inline void
+storeErrorCode(cl_int *error, cl_int code)
+{
+    if (error == NULL) {
+        return;
+    }
+
+    *error = code;
+}
+
+#ifndef PRINT_BUILD_ERRORS
+    #define PRINT_BUILD_ERRORS
+#endif
+
+#ifdef PRINT_BUILD_ERRORS
+
+/*
+ * Allocate a buffer of BUILD_LOG_SIZE bytes for the OpenCL build log and
+ * make it an empty string. Returns NULL when the allocation fails.
+ */
+static char
+*allocBuildLog(void)
+{
+    char *buf = malloc(BUILD_LOG_SIZE);
+
+    if (buf != NULL) {
+        *buf = '\0';
+    }
+
+    return buf;
+}
+
+/*
+ * Release a buffer obtained from allocBuildLog().
+ */
+static void
+freeBuildLog(char *buildLog)
+{
+    free(buildLog);
+}
+
+/*
+ * Print a report about a failed kernel build to stderr.
+ *
+ * The report contains the device name, the error code, the name of the
+ * memory pattern whose solver uses 'kgen' as its kernel generator, the
+ * kernel type, the subproblem dimensions, the parallelism granularity, the
+ * kernel extra flags, the generated source and, if available, the build log.
+ *
+ * error    - error code returned by the build
+ * device   - device the kernel was built for
+ * kgen     - kernel generator that produced 'source'
+ * dims     - subproblem dimensions, one entry per memory pattern level
+ * pgran    - data parallelism granularity
+ * kextra   - kernel extra information
+ * source   - generated kernel source
+ * buildLog - build log, or NULL if it is unavailable
+ *
+ * If no registered memory pattern uses 'kgen', or the kernel type has no
+ * textual name, "unknown" is printed instead and, in the former case, the
+ * subproblem dimensions are left empty since the number of levels is not
+ * known.
+ */
+static void
+printBuildError(
+    cl_int error,
+    cl_device_id device,
+    SolverKgen kgen,
+    const SubproblemDim *dims,
+    const PGranularity *pgran,
+    const CLBLASKernExtra *kextra,
+    const char *source,
+    const char *buildLog)
+{
+    char name[128];
+    char dimStr[1024];
+    char pgranStr[1024];
+    char *p;
+    MemoryPattern *mempat = NULL;
+    unsigned int i, j;
+    const char *s;
+
+    name[0] = '\0';
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);
+
+    // lookup memory pattern; mempat stays NULL unless a pattern really matches
+    for (i = 0; (mempat == NULL) && (i < BLAS_FUNCTIONS_NUMBER); i++) {
+        for (j = 0; j < clblasSolvers[i].nrPatterns; j++) {
+            if (kgen == clblasSolvers[i].memPatterns[j].sops->genKernel) {
+                mempat = &clblasSolvers[i].memPatterns[j];
+                break;
+            }
+        }
+    }
+
+    // kernelTypeString() returns NULL for types it does not know
+    s = kernelTypeString(kextra->kernType);
+    if (s == NULL) {
+        s = "unknown";
+    }
+
+    // sprintf Subproblem dimensions
+    dimStr[0] = '\0';
+    if (mempat != NULL) {
+        p = dimStr;
+        for (i = 0; i < mempat->nrLevels; i++) {
+            p = sprintfGranulation(p, dims, i);
+            strcat(p, "; ");
+            p += strlen(p);
+        }
+    }
+
+    // sprintf data parallelism granularity
+    sprintf(pgranStr, "pgran->wgDim = %d, pgran->wgSize[0] = %u, "
+            "pgran->wgSize[1] = %u, pgran->wfSize = %u",
+            pgran->wgDim, pgran->wgSize[0], pgran->wgSize[1],
+            pgran->wfSize);
+
+    fprintf(stderr, "\n========================================================\n\n");
+    fprintf(stderr, "AN INTERNAL KERNEL BUILD ERROR OCCURRED!\n");
+    fprintf(stderr, "device name = %s\n", name);
+    fprintf(stderr, "error = %d\n", error);
+    fprintf(stderr, "memory pattern = %s, %s kernel generator\n",
+            (mempat != NULL) ? mempat->name : "unknown", s);
+    fprintf(stderr, "Subproblem dimensions: %s\n", dimStr);
+    fprintf(stderr, "Parallelism granularity: %s\n", pgranStr);
+    fprintf(stderr, "Kernel extra flags: %u\n", kextra->flags);
+    fprintf(stderr, "Source:\n\n%s\n\n", source);
+    fprintf(stderr, "--------------------------------------------------------\n\n");
+    if (buildLog) {
+        fprintf(stderr, "Build log:\n\n%s\n", buildLog);
+    }
+    else {
+        fprintf(stderr, "Build log is unavailable\n");
+    }
+    fprintf(stderr, "========================================================\n\n");
+}
+
+#else               /* PRINT_BUILD_ERRORS */
+
+static __inline char*
+allocBuildLog(void)
+{
+    /* stub, do nothing */
+    return NULL;
+}
+
+#define freeBuildLog(log)                       /* stub, do nothing */
+#define printBuildError(error, device, kgen, \
+    dims, pgran, kextra, source, buildLog)      /* stub, do nothing */
+
+#endif              /* !PRINT_BUILD_ERRORS */
+
+/*
+ * Kernel destructor callback: release the kernel's 'extra' data, if any,
+ * and reset the pointer.
+ */
+static void
+extraDtor(struct Kernel *kernel)
+{
+    if (kernel->extra == NULL) {
+        return;
+    }
+
+    free(kernel->extra);
+    kernel->extra = NULL;
+}
+
+/*
+ * Append the textual form of one subproblem dimension to a string.
+ *
+ * buf     - position to write at; when 'first' is false it must point to
+ *           a NUL-terminated string, which gets a ", " separator appended
+ * dim     - dimension value; SUBDIM_UNUSED is printed symbolically
+ * dimName - name of the dimension field
+ * level   - index of the decomposition level the dimension belongs to
+ * first   - true if this is the first dimension printed for the level
+ *
+ * Returns a pointer to the terminating NUL of the resulting string. The
+ * caller is responsible for the buffer being large enough.
+ */
+static char
+*sprintfDim(
+    char *buf,
+    size_t dim,
+    const char *dimName,
+    int level,
+    bool first)
+{
+    if (!first) {
+        strcat(buf, ", ");
+        buf += strlen(buf);
+    }
+    if (dim == SUBDIM_UNUSED) {
+        sprintf(buf, "dims[%d].%s = SUBDIM_UNUSED", level, dimName);
+    }
+    else {
+        /*
+         * size_t is not 'unsigned long' on every platform (e.g. 64-bit
+         * Windows), so convert explicitly to match the %lu specifier
+         */
+        sprintf(buf, "dims[%d].%s = %lu", level, dimName,
+                (unsigned long)dim);
+    }
+
+    buf += strlen(buf);
+
+    return buf;
+}
+
+/*
+ * Return a human readable description of a kernel type, or NULL if the
+ * type is not one of the known ones.
+ */
+const char VISIBILITY_HIDDEN
+*kernelTypeString(CLBlasKernelType ktype)
+{
+    const char *label = NULL;
+
+    switch (ktype) {
+    case CLBLAS_COMPUTING_KERNEL:
+        label = "computing";
+        break;
+    case CLBLAS_PREP_A_KERNEL:
+        label = "preparative for matrix A";
+        break;
+    case CLBLAS_PREP_B_KERNEL:
+        label = "preparative for matrix B";
+        break;
+    default:
+        /* unknown type, keep NULL */
+        break;
+    }
+
+    return label;
+}
+
+/*
+ * Store a scalar value in a kernel argument: the argument's type size is
+ * set to the size of 'dtype' and that many bytes are copied from 'value'
+ * into the argument's data area.
+ */
+void VISIBILITY_HIDDEN
+assignScalarKarg(KernelArg *arg, const void *value, DataType dtype)
+{
+    void *dst = arg->arg.data;
+
+    arg->typeSize = dtypeSize(dtype);
+    memcpy(dst, value, arg->typeSize);
+}
+
+/*
+ * Compute the global work size for a problem of M x N.
+ *
+ * globalThreads - output, two entries
+ * wgDim         - itemX / itemY give the amount of work per work group
+ *                 along N / M; either may be SUBDIM_UNUSED
+ * pgran         - granularity supplying the number of dimensions and the
+ *                 work group sizes; may be NULL
+ * M, N          - problem sizes
+ *
+ * The number of work groups is the problem size divided by the work per
+ * group, rounded up. If both item sizes are used, the groups are laid out
+ * as one product (one dimension, or pgran == NULL) or as Y groups in
+ * entry 0 and X groups in entry 1. Finally, when pgran is given, the
+ * group counts are multiplied by the work group sizes.
+ */
+void VISIBILITY_HIDDEN
+calcGlobalThreads(
+    size_t globalThreads[2],
+    const SubproblemDim *wgDim,
+    const PGranularity *pgran,
+    size_t M,
+    size_t N)
+{
+    const bool useX = (wgDim->itemX != SUBDIM_UNUSED);
+    const bool useY = (wgDim->itemY != SUBDIM_UNUSED);
+
+    globalThreads[1] = 1;
+
+    if (useX && useY) {
+        const size_t perGroupX = wgDim->itemX;
+        const size_t perGroupY = wgDim->itemY;
+        const size_t groupsX = N / perGroupX + ((N % perGroupX) ? 1 : 0);
+        const size_t groupsY = M / perGroupY + ((M % perGroupY) ? 1 : 0);
+        int dims;
+
+        if (pgran == NULL) {
+            dims = 1;
+        }
+        else {
+            dims = pgran->wgDim;
+        }
+
+        if (dims == 1) {
+            globalThreads[0] = groupsX * groupsY;
+        }
+        else {
+            globalThreads[0] = groupsY;
+            globalThreads[1] = groupsX;
+        }
+    }
+    else {
+        /* only one direction is decomposed; X takes precedence */
+        const size_t total = useX ? N : M;
+        const size_t perGroup = useX ? wgDim->itemX : wgDim->itemY;
+
+        globalThreads[0] = total / perGroup + ((total % perGroup) ? 1 : 0);
+    }
+
+    if (pgran == NULL) {
+        return;
+    }
+
+    globalThreads[0] *= pgran->wgSize[0];
+    globalThreads[1] *= pgran->wgSize[1];
+}
+
+/*
+ * Query the context of a kernel with clGetKernelInfo(). On success the
+ * context is stored through 'context' if that pointer is not NULL; on
+ * failure '*context' is not touched. Returns the status of the query.
+ */
+cl_int VISIBILITY_HIDDEN
+getKernelContext(cl_kernel kernel, cl_context *context)
+{
+    cl_context owner;
+    cl_int status;
+
+    status = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT,
+                             sizeof(owner), &owner, NULL);
+    if ((status == CL_SUCCESS) && (context != NULL)) {
+        *context = owner;
+    }
+
+    return status;
+}
+
+/*
+ * Query the context of a command queue with clGetCommandQueueInfo(). On
+ * success the context is stored through 'context' if that pointer is not
+ * NULL; on failure '*context' is not touched. Returns the query status.
+ */
+cl_int VISIBILITY_HIDDEN
+getQueueContext(cl_command_queue queue, cl_context *context)
+{
+    cl_context owner;
+    cl_int status;
+
+    status = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT,
+                                   sizeof(owner), &owner, NULL);
+    if ((status == CL_SUCCESS) && (context != NULL)) {
+        *context = owner;
+    }
+
+    return status;
+}
+
+/*
+ * Query the device of a command queue with clGetCommandQueueInfo(). On
+ * success the device is stored through 'device' if that pointer is not
+ * NULL; on failure '*device' is not touched. Returns the query status.
+ */
+cl_int VISIBILITY_HIDDEN
+getQueueDevice(cl_command_queue queue, cl_device_id *device)
+{
+    cl_device_id queueDev;
+    cl_int status;
+
+    status = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE,
+                                   sizeof(queueDev), &queueDev, NULL);
+    if ((status == CL_SUCCESS) && (device != NULL)) {
+        *device = queueDev;
+    }
+
+    return status;
+}
+
+/*
+ * Query the properties of a command queue with clGetCommandQueueInfo().
+ * On success they are stored through 'props' if that pointer is not NULL;
+ * on failure '*props' is not touched. Returns the query status.
+ */
+cl_int VISIBILITY_HIDDEN
+getQueueProperties(
+    cl_command_queue queue,
+    cl_command_queue_properties *props)
+{
+    cl_command_queue_properties queueProps;
+    cl_int status;
+
+    status = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
+                                   sizeof(queueProps), &queueProps, NULL);
+    if ((status == CL_SUCCESS) && (props != NULL)) {
+        *props = queueProps;
+    }
+
+    return status;
+}
+
+/*
+ * Create a kernel object from a program binary.
+ *
+ * buffer     - pointer to the pointer to the binary image
+ * sizeBuffer - size of the binary image
+ * key        - supplies the context and device to create the program for
+ * extra      - kernel extra information; a copy is attached to the kernel
+ * error      - optional location receiving the error code on failure
+ *
+ * Returns the new kernel, or NULL on failure. On failure the partially
+ * constructed kernel is handed to putKernel() and, if 'error' is not NULL,
+ * it receives either the program creation status or CL_OUT_OF_HOST_MEMORY
+ * when a host allocation failed. '*error' is not written on success.
+ */
+Kernel VISIBILITY_HIDDEN
+*loadKernel( const unsigned char** buffer,
+             size_t sizeBuffer,
+             KernelKey *key,
+             const CLBLASKernExtra *extra,
+             cl_int *error)
+
+{
+    cl_int status = CL_SUCCESS;
+    Kernel* kernel;
+
+    kernel = allocKernel();
+    if (kernel == NULL) {
+        storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
+        return NULL;
+    }
+
+    kernel->program = createClProgramWithBinary(key->context,
+                                                key->device,
+                                                (unsigned char*)*buffer,
+                                                sizeBuffer,
+                                                &status);
+    if (status != CL_SUCCESS) {
+        putKernel(NULL, kernel);
+        storeErrorCode(error, status);
+        return NULL;
+    }
+
+    kernel->extraSize = sizeof(CLBLASKernExtra);
+    kernel->extra = calloc(1, kernel->extraSize);
+    if (kernel->extra == NULL) {
+        /* do not dereference a failed allocation; drop the kernel instead */
+        putKernel(NULL, kernel);
+        storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
+        return NULL;
+    }
+    *(CLBLASKernExtra*)(kernel->extra) = *extra;
+    kernel->dtor = extraDtor;
+
+    return kernel;
+}
+
+#if !defined(DUMP_CLBLAS_KERNELS)
+
+/*
+ * Drop the program's source so that a cached program consumes as little
+ * memory as possible: the program is re-created from its own binary and,
+ * if that succeeds, the original program is released and '*program' is
+ * replaced with the new one.
+ *
+ * Returns CL_SUCCESS on success. On failure '*program' is left unchanged
+ * and an error code is returned (CL_OUT_OF_HOST_MEMORY if the binary could
+ * not be obtained).
+ */
+static cl_int
+dropProgramSource(cl_program *program, cl_context ctx, cl_device_id devID)
+{
+    size_t size;
+    unsigned char *bin;
+    cl_program p = *program;
+    cl_int err = CL_SUCCESS;
+
+    size = getProgramBinarySize(p);
+    bin = getProgramBinary(p);
+    if (bin == NULL) {
+        /* NOTE(review): assumes getProgramBinary() reports failure as NULL */
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    /*
+     * Don't release the original program until a new one is created
+     * in order to retain its own reference to the context if it is
+     * released by user
+     */
+    p = createClProgramWithBinary(ctx, devID, bin, size, &err);
+    if (err == CL_SUCCESS) {
+        clReleaseProgram(*program);
+        *program = p;
+    }
+
+    free(bin);
+
+    return err;
+}
+
+#endif /* !DUMP_CLBLAS_KERNELS */
+
+/*
+ * Generate the source of a kernel, build it and wrap it in a Kernel object.
+ *
+ * device, context - device and context to build the program for
+ * kernelGenerator - generator callback; it is first called with a NULL
+ *                   buffer to learn the source size and then again to fill
+ *                   a buffer of that size
+ * dims, pgran     - decomposition and granularity passed to the generator
+ * extra           - kernel extra information passed to the generator; a
+ *                   copy of it is attached to the resulting kernel
+ * buildOpts       - build options passed to the program build
+ * error           - optional location receiving CL_SUCCESS or the error
+ *
+ * Returns the kernel, or NULL on failure. Generator and host allocation
+ * failures are reported as CL_OUT_OF_HOST_MEMORY; build failures are
+ * reported with the build error code after the error report is printed.
+ * Unless KEEP_CLBLAS_KERNEL_SOURCES is defined, the program source is
+ * dropped after a successful build.
+ */
+Kernel
+*makeKernel(
+    cl_device_id device,
+    cl_context context,
+    SolverKgen kernelGenerator,
+    const SubproblemDim *dims,
+    const PGranularity *pgran,
+    const CLBLASKernExtra *extra,
+    const char *buildOpts,
+    cl_int *error)
+{
+
+    cl_int err;
+    char *source;
+    ssize_t size;
+    Kernel *kernel;
+    char *log;
+
+	#ifdef DEBUG_2
+	printf("Make kernel called\n");
+	printf("x : %d, y : %d, itemX: %d, itemY: %d\n",  dims->x, dims->y, dims->itemX, dims->itemY);
+	printf("PG : wgSize[0] : %d, wgSize[1] : %d, wfSize: %d\n",  pgran->wgSize[0], pgran->wgSize[1], pgran->wfSize);
+	#endif
+
+    /* first pass: ask the generator how large the source is */
+    size = kernelGenerator(NULL, 0, dims, pgran, (void*)extra);
+    if (size < 0) {
+        storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
+        return NULL;
+    }
+    source = calloc(1, size);
+    if (source == NULL) {
+        storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
+        return NULL;
+    }
+    /* second pass: generate the source; the size must be reproduced */
+    if (kernelGenerator(source, size, dims, pgran, (void*)extra) != size) {
+        free(source);
+        storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
+        return NULL;
+    }
+
+    kernel = allocKernel();
+    if (kernel == NULL) {
+        free(source);
+        storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
+        return NULL;
+    }
+
+    /* may be NULL; the error report copes with a missing log */
+    log = allocBuildLog();
+
+	//#define DEBUG_2
+	#ifdef DEBUG_2
+	printf("Build Options used %s \n", buildOpts);
+	printf("Source kernel used %s \n", source);
+	#endif
+	#undef DEBUG_2
+
+    kernel->program = buildClProgram(source, buildOpts, context, device,
+                                     log, BUILD_LOG_SIZE, &err);
+    if (err != CL_SUCCESS) {
+        printBuildError(err, device, kernelGenerator, dims,
+                        pgran, extra, source, log);
+        freeBuildLog(log);
+        putKernel(NULL, kernel);
+        free(source);
+        storeErrorCode(error, err);
+        return NULL;
+    }
+	else
+	{
+		// #define DEBUG_2
+		#ifdef DEBUG_2
+		printf("Kernel compilation succeeded\n");
+		#endif
+		#undef DEBUG_2
+	}
+
+    freeBuildLog(log);
+    free(source);
+
+#if !defined(KEEP_CLBLAS_KERNEL_SOURCES)
+    /* the build succeeded at this point, so the source can be dropped */
+    err = dropProgramSource(&kernel->program, context, device);
+    if (err != CL_SUCCESS) {
+        putKernel(NULL, kernel);
+        storeErrorCode(error, err);
+        return NULL;
+    }
+#endif  /* !KEEP_CLBLAS_KERNEL_SOURCES */
+
+    kernel->extraSize = sizeof(CLBLASKernExtra);
+    kernel->extra = calloc(1, kernel->extraSize);
+    if (kernel->extra == NULL) {
+        /* do not dereference a failed allocation; drop the kernel instead */
+        putKernel(NULL, kernel);
+        storeErrorCode(error, CL_OUT_OF_HOST_MEMORY);
+        return NULL;
+    }
+    *(CLBLASKernExtra*)(kernel->extra) = *extra;
+    kernel->dtor = extraDtor;
+
+    storeErrorCode(error, CL_SUCCESS);
+
+    return kernel;
+
+}
+
+/*
+ * Fill 'opts' with the build options for a device and memory pattern:
+ * "-g " unless NDEBUG is defined, followed by "-cl-opt-disable" when the
+ * device vendor is NVIDIA and the memory pattern is the 2-staged cached
+ * global memory based block trsm one.
+ */
+void
+setupBuildOpts(
+    char opts[BUILD_OPTS_MAXLEN],
+    cl_device_id devID,
+    MemoryPattern *mempat)
+{
+    static const char trsmPatternName[] =
+        "2-staged cached global memory based "
+        "block trsm";
+    TargetDevice target;
+
+    opts[0] = '\0';
+
+    target.id = devID;
+    identifyDevice(&target);
+
+#if !defined NDEBUG
+    strcat(opts, "-g ");
+#endif  /* !NDEBUG */
+
+    if ((target.ident.vendor == VENDOR_NVIDIA) &&
+        (strcmp(mempat->name, trsmPatternName) == 0)) {
+
+        strcat(opts, "-cl-opt-disable");
+    }
+}
+
+/*
+ * Print the dimensions of one decomposition level (itemY, itemX, y, x and
+ * bwidth, in this order) to 'buf', followed by "; ". Returns a pointer to
+ * the terminating NUL of the produced string.
+ */
+char VISIBILITY_HIDDEN
+*sprintfGranulation(char *buf, const SubproblemDim *dim, int level)
+{
+    const SubproblemDim *cur = &dim[level];
+    char *end;
+
+    end = sprintfDim(buf, cur->itemY, "itemY", level, true);
+    end = sprintfDim(end, cur->itemX, "itemX", level, false);
+    end = sprintfDim(end, cur->y, "y", level, false);
+    end = sprintfDim(end, cur->x, "x", level, false);
+    end = sprintfDim(end, cur->bwidth, "bwidth", level, false);
+    strcat(end, "; ");
+
+    return end + strlen(end);
+}
+
+/*
+ * Validate the dimensions, leading dimension and buffer size of a matrix.
+ *
+ * dtype  - element type, determines the element size
+ * order  - row or column major order
+ * transA - transposition of the matrix
+ * M, N   - matrix dimensions; zero is rejected with clblasInvalidDim
+ * A      - buffer object holding the matrix
+ * offA   - offset of the matrix in the buffer, in elements
+ * lda    - leading dimension; zero denotes a packed matrix
+ * err    - selects which family of error codes (matrix A, B or C) is
+ *          returned; any other value yields clblasNotImplemented
+ *
+ * Returns clblasSuccess, or an invalid leading dimension / invalid matrix /
+ * insufficient memory error for the selected matrix.
+ */
+clblasStatus VISIBILITY_HIDDEN
+checkMatrixSizes(
+    DataType dtype,
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_mem A,
+    size_t offA,
+    size_t lda,         // lda is passed as zero for packed matrices
+    ErrorCodeSet err )
+{
+    size_t memSize, matrSize, tsize, memUsed;
+    size_t unusedTail = 0;
+    bool tra;
+
+    if ((M == 0) || (N == 0)) {
+        return clblasInvalidDim;
+    }
+
+    tsize = dtypeSize(dtype);
+    tra = (order == clblasRowMajor && transA != clblasNoTrans) ||
+          (order == clblasColumnMajor && transA == clblasNoTrans);
+
+    if( lda > 0 )              // For Non-packed matrices
+    {
+        if (tra) {
+            if (lda < M) {
+                switch( err )
+                {
+                case A_MAT_ERRSET:
+                    return clblasInvalidLeadDimA;
+                case B_MAT_ERRSET:
+                    return clblasInvalidLeadDimB;
+                case C_MAT_ERRSET:
+                    return clblasInvalidLeadDimC;
+                default:
+                    return clblasNotImplemented;
+                }
+            }
+            matrSize = ((N - 1) * lda + M) * tsize;
+            // lda may be smaller than N here; an unsigned wrap-around would
+            // produce a huge tail and silently disable the size check below
+            unusedTail = ( lda > N ) ? ( lda - N ) * tsize : 0;
+        }
+        else {
+            if (lda < N) {
+                switch( err )
+                {
+                case A_MAT_ERRSET:
+                    return clblasInvalidLeadDimA;
+                case B_MAT_ERRSET:
+                    return clblasInvalidLeadDimB;
+                case C_MAT_ERRSET:
+                    return clblasInvalidLeadDimC;
+                default:
+                    return clblasNotImplemented;
+                }
+            }
+            matrSize = ((M - 1) * lda + N) * tsize;
+            // same guard against unsigned wrap-around when lda < M
+            unusedTail = ( lda > M ) ? ( lda - M ) * tsize : 0;
+        }
+    }
+    else {                     // For the case of packed matrices
+         matrSize = ((M * (N+1)) / 2) * tsize;
+    }
+
+    offA *= tsize;
+
+    if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
+                                CL_SUCCESS) {
+        switch( err )
+        {
+        case A_MAT_ERRSET:
+            return clblasInvalidMatA;
+        case B_MAT_ERRSET:
+            return clblasInvalidMatB;
+        case C_MAT_ERRSET:
+            return clblasInvalidMatC;
+        default:
+            return clblasNotImplemented;
+        }
+    }
+
+    //  It is possible to allocate a buffer, and set up lda & ldb such that it looks like it will access outside of the allocated buffer, but if
+    //  M & N are kept small enough, no out of bounds access will occur.  Compensate for the offset values and the unused tail memory caused by lda & ldb.
+    //  Ex: BuffSize=6 floats, M=1, N=2, lda=ldb=3, offA = 0, offB = 2 :  |A[0,0]|unused|B[0,0]|A[0,1]|unused|B[0,1]|
+    memUsed = (( offA + matrSize ) > unusedTail) ? offA + matrSize - unusedTail: 0;
+    // the second condition catches overflow of offA + matrSize
+    if (( memUsed > memSize ) || (offA + matrSize < offA)) {
+        switch( err )
+        {
+        case A_MAT_ERRSET:
+            return clblasInsufficientMemMatA;
+        case B_MAT_ERRSET:
+            return clblasInsufficientMemMatB;
+        case C_MAT_ERRSET:
+            return clblasInsufficientMemMatC;
+        default:
+            return clblasNotImplemented;
+        }
+    }
+
+    return clblasSuccess;
+}
+
+
+/*
+ * Validate the leading dimension and the buffer size for a banded matrix.
+ *
+ * The band has K = KL + KU + 1 elements per stored line; the number of stored
+ * lines is N or M depending on the order/transposition combination.
+ *
+ * Returns clblasInvalidDim when M or N is zero, an "invalid leading
+ * dimension" code when lda < K, an "invalid matrix" code when the size of A
+ * cannot be queried, an "insufficient memory" code when the buffer is too
+ * small, and clblasSuccess otherwise. 'err' selects whether the A, B or C
+ * flavour of each code is reported; any other value yields
+ * clblasNotImplemented.
+ */
+clblasStatus VISIBILITY_HIDDEN
+checkBandedMatrixSizes(
+    DataType dtype,
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_mem A,
+    size_t offA,
+    size_t lda,
+    ErrorCodeSet err )
+{
+    size_t memSize, matrSize, tsize, K, memUsed, lines;
+    size_t unusedTail = 0;
+    bool tra;
+
+    if ((M == 0) || (N == 0)) {
+        return clblasInvalidDim;
+    }
+
+    tsize = dtypeSize(dtype);
+    K = KL + KU + 1;
+    tra = (order == clblasRowMajor && transA != clblasNoTrans) ||
+          (order == clblasColumnMajor && transA == clblasNoTrans);
+
+    if (lda < K) {
+        switch( err )
+        {
+        case A_MAT_ERRSET:
+            return clblasInvalidLeadDimA;
+        case B_MAT_ERRSET:
+            return clblasInvalidLeadDimB;
+        case C_MAT_ERRSET:
+            return clblasInvalidLeadDimC;
+        default:
+            return clblasNotImplemented;
+        }
+    }
+
+    lines = tra ? N : M;
+    matrSize = ((lines - 1) * lda + K) * tsize;
+
+    /*
+     * lda is only guaranteed to be >= K, so it may well be smaller than the
+     * number of lines. Subtracting unconditionally would wrap around to a
+     * huge value and silently disable the size check below.
+     */
+    if (lda > lines) {
+        unusedTail = ( lda - lines ) * tsize;
+    }
+
+    offA *= tsize;
+
+    if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
+                                CL_SUCCESS) {
+        switch( err )
+        {
+        case A_MAT_ERRSET:
+            return clblasInvalidMatA;
+        case B_MAT_ERRSET:
+            return clblasInvalidMatB;
+        case C_MAT_ERRSET:
+            return clblasInvalidMatC;
+        default:
+            return clblasNotImplemented;
+        }
+    }
+
+    //  It is possible to allocate a buffer, and set up lda & ldb such that it looks like it will access outside of the allocated buffer, but if
+    //  M & N are kept small, no out of bounds access will occur.  Compensate for the offset values and the unused tail memory caused by lda & ldb.
+    //  Ex: BuffSize=6 floats, M=1, N=2, lda=ldb=3, offA = 0, offB = 2 :  |A[0,0]|unused|B[0,0]|A[0,1]|unused|B[0,1]|
+    memUsed = (( offA + matrSize ) > unusedTail) ? offA + matrSize - unusedTail: 0;
+
+    // The second condition rejects a wrapped-around offA + matrSize sum
+    if (( memUsed > memSize ) || (offA + matrSize < offA)) {
+        switch( err )
+        {
+        case A_MAT_ERRSET:
+            return clblasInsufficientMemMatA;
+        case B_MAT_ERRSET:
+            return clblasInsufficientMemMatB;
+        case C_MAT_ERRSET:
+            return clblasInsufficientMemMatC;
+        default:
+            return clblasNotImplemented;
+        }
+    }
+
+    return clblasSuccess;
+}
+
+/*
+ * Check that buffer 'x' can hold an N element vector that starts 'offx'
+ * elements into the buffer and is stepped through with increment 'incx'.
+ *
+ * 'err' selects whether the X or the Y flavour of an error code is returned;
+ * any other value yields clblasNotImplemented.
+ */
+clblasStatus VISIBILITY_HIDDEN
+checkVectorSizes(
+    DataType dtype,
+    size_t N,
+    cl_mem x,
+    size_t offx,
+    int incx,
+    ErrorCodeSet err )
+{
+    const bool forX = (err == X_VEC_ERRSET);
+    const bool forY = (err == Y_VEC_ERRSET);
+    size_t bufBytes, elemBytes, spanBytes, startByte;
+
+    if (N == 0) {
+        return clblasInvalidDim;
+    }
+
+    if (incx == 0) {
+        return forX ? clblasInvalidIncX :
+               forY ? clblasInvalidIncY : clblasNotImplemented;
+    }
+
+    if (clGetMemObjectInfo(x, CL_MEM_SIZE, sizeof(bufBytes), &bufBytes, NULL) !=
+                                CL_SUCCESS) {
+        return forX ? clblasInvalidVecX :
+               forY ? clblasInvalidVecY : clblasNotImplemented;
+    }
+
+    elemBytes = dtypeSize(dtype);
+    // bytes from the first to the last addressed element, inclusive
+    spanBytes = ((N - 1) * abs(incx) + 1) * elemBytes;
+    startByte = offx * elemBytes;
+
+    // the second comparison catches a wrapped-around sum
+    if ((startByte + spanBytes > bufBytes) ||
+        (startByte + spanBytes < startByte)) {
+        return forX ? clblasInsufficientMemVecX :
+               forY ? clblasInsufficientMemVecY : clblasNotImplemented;
+    }
+
+    return clblasSuccess;
+}
+
+/*
+ * Tell whether the type of 'obj' could be queried and turned out to be
+ * something other than a plain buffer. A failed query yields false.
+ */
+static bool
+cmoQueriedNonBuffer(cl_mem obj)
+{
+    cl_mem_object_type kind = 0;
+
+    if (clGetMemObjectInfo(obj, CL_MEM_TYPE, sizeof(kind), &kind, NULL) !=
+                                CL_SUCCESS) {
+        return false;
+    }
+
+    return (kind != CL_MEM_OBJECT_BUFFER);
+}
+
+/* Map an error code set to the matching "invalid object" status. */
+static clblasStatus
+cmoInvalidStatus(ErrorCodeSet err)
+{
+    switch( err )
+    {
+    case A_MAT_ERRSET:
+        return clblasInvalidMatA;
+    case B_MAT_ERRSET:
+        return clblasInvalidMatB;
+    case C_MAT_ERRSET:
+        return clblasInvalidMatC;
+    case X_VEC_ERRSET:
+        return clblasInvalidVecX;
+    case Y_VEC_ERRSET:
+        return clblasInvalidVecY;
+    default:
+        return clblasNotImplemented;
+    }
+}
+
+/*
+ * Verify that A, B and (only when checkC is set) C are buffer objects.
+ *
+ * The objects are examined in the order A, B, C and the first one found to
+ * be a non-buffer determines the returned status, chosen through the
+ * corresponding errA/errB/errC set. An object whose type cannot be queried
+ * is not reported by this function.
+ */
+clblasStatus
+checkMemObjects(
+    cl_mem A,
+    cl_mem B,
+    cl_mem C,
+    bool checkC,
+    ErrorCodeSet errA,
+    ErrorCodeSet errB,
+    ErrorCodeSet errC )
+{
+    if (cmoQueriedNonBuffer(A)) {
+        return cmoInvalidStatus(errA);
+    }
+
+    if (cmoQueriedNonBuffer(B)) {
+        return cmoInvalidStatus(errB);
+    }
+
+    if (checkC && cmoQueriedNonBuffer(C)) {
+        return cmoInvalidStatus(errC);
+    }
+
+    return clblasSuccess;
+}
diff --git a/src/library/blas/generic/events.c b/src/library/blas/generic/events.c
new file mode 100644
index 0000000..6a71e3a
--- /dev/null
+++ b/src/library/blas/generic/events.c
@@ -0,0 +1,75 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>
+#include <clBLAS.h>
+
+#include <mutex.h>
+#include <events.h>
+
+static const size_t ALLOCATION_STEP = 100;  /* slots added each time the array below is grown */
+
+static mutex_t *lock = NULL;                /* created by decomposeEventsSetup(); held while the state below is touched */
+static cl_event *decomposeEvents = NULL;    /* heap array of event slots, grown with realloc() */
+static size_t numDecomposeEvents = 0;       /* number of slots handed out so far */
+static size_t maxDecomposeEvents = 0;       /* number of slots currently allocated */
+
+void
+decomposeEventsSetup(void)  /* creates the mutex locked by the other decomposeEvents* functions */
+{
+    lock = mutexInit();
+}
+
+/*
+ * Release the event slot array, reset the bookkeeping counters and destroy
+ * the mutex created by decomposeEventsSetup().
+ */
+void
+decomposeEventsTeardown(void)
+{
+    mutexLock(lock);
+
+    /* free(NULL) is a no-op, so no guard is needed */
+    free(decomposeEvents);
+    decomposeEvents = NULL;
+    numDecomposeEvents = 0;
+    maxDecomposeEvents = 0;
+
+    /* a mutex must not be destroyed while it is still held */
+    mutexUnlock(lock);
+
+    mutexDestroy(lock);
+    lock = NULL;
+}
+
+/*
+ * Hand out the next unused event slot, growing the slot array by
+ * ALLOCATION_STEP entries when it is full.
+ *
+ * Returns NULL when the array cannot be grown.
+ *
+ * NOTE(review): growing goes through realloc(), which may move the array;
+ * pointers returned by earlier calls would then no longer be valid. Confirm
+ * how callers hold on to the returned slots.
+ */
+cl_event*
+decomposeEventsAlloc(void)
+{
+    cl_event *slot = NULL;
+
+    mutexLock(lock);
+
+    if (numDecomposeEvents == maxDecomposeEvents) {
+        const size_t grownCount = maxDecomposeEvents + ALLOCATION_STEP;
+        cl_event *grown = realloc(decomposeEvents,
+                                  grownCount * sizeof(cl_event));
+
+        if (grown != NULL) {
+            decomposeEvents = grown;
+            maxDecomposeEvents = grownCount;
+        }
+    }
+
+    /* still full here means that growing has just failed */
+    if (numDecomposeEvents < maxDecomposeEvents) {
+        slot = &decomposeEvents[numDecomposeEvents];
+        numDecomposeEvents++;
+    }
+
+    mutexUnlock(lock);
+    return slot;
+}
diff --git a/src/library/blas/generic/kdump.c b/src/library/blas/generic/kdump.c
new file mode 100644
index 0000000..5345fc7
--- /dev/null
+++ b/src/library/blas/generic/kdump.c
@@ -0,0 +1,188 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <malloc.h>
+
+#include <cltypes.h>
+#include <clblas-internal.h>
+
+#include "solution_seq.h"
+
+#ifdef DUMP_CLBLAS_KERNELS
+
+enum {
+    SRC_BUFSIZE = 512244
+};
+
+/*
+ * Store the routine name for 'funcID' in 'name', preceded by a one character
+ * data type tag ('s' for TYPE_FLOAT, dtypeToPrefix() for everything else).
+ *
+ * Function IDs without a dedicated entry are named "UNKNOWN" so that the
+ * result is always NUL-terminated; the caller appends to it with strcat().
+ */
+static void
+getFuncName(char *name, BlasFunctionID funcID, DataType dtype)
+{
+    const char *base;
+
+    switch (funcID) {
+    case CLBLAS_GEMV:
+        base = "GEMV";
+        break;
+    case CLBLAS_SYMV:
+        base = "SYMV";
+        break;
+    case CLBLAS_GEMM:
+        base = "GEMM";
+        break;
+    case CLBLAS_TRMM:
+        base = "TRMM";
+        break;
+    case CLBLAS_TRSM:
+        base = "TRSM";
+        break;
+    case CLBLAS_SYRK:
+        base = "SYRK";
+        break;
+    case CLBLAS_SYR2K:
+        base = "SYR2K";
+        break;
+    default:
+        base = "UNKNOWN";
+        break;
+    }
+
+    if (dtype == TYPE_FLOAT) {
+        name[0] = 's';
+    }
+    else {
+        name[0] = dtypeToPrefix(dtype);
+    }
+    strcpy(name + 1, base);
+}
+
+/*
+ * Append a transposition tag to 'buf': nothing for clblasNoTrans, "t" for
+ * clblasTrans and "tc" for any other value of 'flag'.
+ */
+static void
+addTranspSuffix(char *buf, clblasTranspose flag)
+{
+    const char *s;
+
+    if (flag == clblasNoTrans) {
+        return;
+    }
+
+    /* compare the argument, not the enumerator itself */
+    s = (flag == clblasTrans) ? "t" : "tc";
+    strcat(buf, s);
+}
+
+/*
+ * Build the dump file name for a solution step in 'name':
+ *
+ *   ./<func>_<row|col>_<transA tag><transB tag | _uplo_side>_<M>_<N>[_<K>].kdump
+ *
+ * The uplo/side part replaces the transB tag for TRMM and TRSM, which also
+ * omit the K component. 'name' must be large enough for the whole string;
+ * no bounds checking is done here.
+ */
+static void
+fileNameFromSolution(
+    char *name,
+    BlasFunctionID funcID,
+    const SolutionStep *step)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)&step->args;
+    bool isTriangFn;
+
+    isTriangFn = (funcID == CLBLAS_TRMM || funcID == CLBLAS_TRSM);
+
+    strcpy(name, "./");
+    name += strlen(name);
+    getFuncName(name, funcID, kargs->dtype);
+    strcat(name, (kargs->order == clblasRowMajor) ? "_row_" : "_col_");
+    addTranspSuffix(name, kargs->transA);
+    if (isTriangFn) {
+        strcat(name, (kargs->uplo == clblasUpper) ? "_upper" : "_lower");
+        strcat(name, (kargs->side == clblasRight) ? "_right" : "_left");
+    }
+    else {
+        addTranspSuffix(name, kargs->transB);
+    }
+
+    /* the casts make the arguments match %lu on every data model */
+    name += strlen(name);
+    sprintf(name, "_%lu_%lu", (unsigned long)kargs->M,
+            (unsigned long)kargs->N);
+    if (!isTriangFn) {
+        name += strlen(name);
+        sprintf(name, "_%lu", (unsigned long)kargs->K);
+    }
+    strcat(name, ".kdump");
+}
+
+/*
+ * Append a description of a solution step and the source of one of its
+ * kernels to the dump file whose name is derived from the step.
+ *
+ * The record holds the problem offsets, the memory pattern name, the kernel
+ * type, the work group size, the granulation of every level and finally the
+ * kernel source. Nothing is written when the file cannot be opened; when the
+ * source cannot be retrieved a "not available" line is written instead.
+ */
+void
+dumpKernel(
+    const SolutionStep *step,
+    CLBlasKernelType ktype)
+{
+    FILE *file;
+    char tmp[1024];
+    MemoryPattern *pattern;
+    const CLBlasKargs *kargs = (const CLBlasKargs*)&step->args;
+    char *srcBuf;
+    unsigned int i;
+
+    fileNameFromSolution(tmp, step->funcID, step);
+    file = fopen((const char*)tmp, "a+");
+    if (file == NULL) {
+        return;
+    }
+    pattern = &clblasSolvers[step->funcID].memPatterns[step->patternID];
+
+    // now, dump the info; the casts make the arguments match %lu
+    fprintf(file, "offset M = %lu, offset N = %lu, offset A = %lu,"
+                  "offset BX = %lu, offset CY = %lu\n",
+            (unsigned long)kargs->offsetM, (unsigned long)kargs->offsetN,
+            (unsigned long)kargs->offA, (unsigned long)kargs->offBX,
+            (unsigned long)kargs->offCY);
+
+    fprintf(file, "Memory pattern = %s\n", pattern->name);
+    fprintf(file, "Kernel type = %s\n", kernelTypeString(ktype));
+
+    // data parallelism granularity
+    if (step->pgran.wgDim == 1) {
+        fprintf(file, "work group size = %u\n", step->pgran.wgSize[0]);
+    }
+    else {
+        fprintf(file, "work group size = %u x %u\n", step->pgran.wgSize[0],
+                step->pgran.wgSize[1]);
+    }
+
+    fputs("Problem granulation\n", file);
+    for (i = 0; i < pattern->nrLevels; i++) {
+        fprintf(file, "[%u]: ", i);
+        sprintfGranulation(tmp, step->subdims, i);
+        fputs(tmp, file);
+        fputs("\n", file);
+    }
+
+    // print the source only if the query has actually filled the buffer
+    srcBuf = malloc(SRC_BUFSIZE);
+    if ((srcBuf != NULL) &&
+        (clGetProgramInfo(step->kernels[ktype]->program, CL_PROGRAM_SOURCE,
+                          SRC_BUFSIZE, srcBuf, NULL) == CL_SUCCESS)) {
+        fputs("Kernel source:\n\n", file);
+        fputs(srcBuf, file);
+    }
+    else {
+        fputs("Kernel source: not available\n", file);
+    }
+    free(srcBuf);
+
+    fputs("--------------------------------------------------------------"
+          "------------------------------------------------------------\n",
+          file);
+
+    fclose(file);
+}
+
+#endif      /* DUMP_CLBLAS_KERNELS */
diff --git a/src/library/blas/generic/kernel_extra.c b/src/library/blas/generic/kernel_extra.c
new file mode 100644
index 0000000..37a809f
--- /dev/null
+++ b/src/library/blas/generic/kernel_extra.c
@@ -0,0 +1,27 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+
+#include "kernel_extra.h"
+
+/*
+ * Byte-wise comparison of two CLBLASKernExtra objects; returns the memcmp()
+ * result, i.e. 0 when all sizeof(CLBLASKernExtra) bytes are equal.
+ *
+ * NOTE(review): any padding bytes take part in the comparison as well -
+ * presumably both objects are zero-filled before use; confirm at callers.
+ */
+int
+clblasKernelExtraCmp(const void *extra, const void *extraKey)
+{
+    const int diff = memcmp(extra, extraKey, sizeof(CLBLASKernExtra));
+
+    return diff;
+}
+
diff --git a/src/library/blas/generic/matrix_dims.c b/src/library/blas/generic/matrix_dims.c
new file mode 100644
index 0000000..f340c03
--- /dev/null
+++ b/src/library/blas/generic/matrix_dims.c
@@ -0,0 +1,186 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <matrix_dims.h>
+
+/* True only for TRMM or TRSM invoked with the right side. */
+static __inline bool
+isRightSide(BlasFunctionID funcID, clblasSide side)
+{
+    if (funcID != CLBLAS_TRMM && funcID != CLBLAS_TRSM) {
+        return false;
+    }
+
+    return (side == clblasRight);
+}
+
+/* Exchange itemX with itemY and x with y in place. */
+void VISIBILITY_HIDDEN
+swapDimXY(SubproblemDim *dim)
+{
+    const size_t oldItemX = dim->itemX;
+    const size_t oldX = dim->x;
+
+    dim->itemX = dim->itemY;
+    dim->itemY = oldItemX;
+
+    dim->x = dim->y;
+    dim->y = oldX;
+}
+
+/*
+ * Pitch of a matrix block, in elements of 'dtype'.
+ *
+ * The unpadded row length is 'bwidth' for matrices A and B; for matrix C it
+ * is 'x' on the left side and 'y' otherwise; any other role gives 0. The
+ * length is then passed through fl4RowWidth() and converted back to a number
+ * of elements.
+ */
+size_t VISIBILITY_HIDDEN
+matrBlockPitch(
+    const SubproblemDim *dim,
+    MatrixRole mrole,
+    DataType dtype,
+    clblasSide side)
+{
+    const size_t elemBytes = dtypeSize(dtype);
+    const size_t floatsPerElem = elemBytes / sizeof(cl_float);
+    size_t width;
+
+    if (mrole == MATRIX_A || mrole == MATRIX_B) {
+        width = dim->bwidth;
+    }
+    else if (mrole == MATRIX_C) {
+        width = (side == clblasLeft) ? dim->x : dim->y;
+    }
+    else {
+        width = 0;
+    }
+
+    width = fl4RowWidth(width, elemBytes) * FLOAT4_VECLEN / floatsPerElem;
+
+    return width;
+}
+
+/* Block size in elements: block height multiplied by block pitch. */
+cl_ulong VISIBILITY_HIDDEN
+matrBlockSize(
+    SubproblemDim *dim,
+    MatrixRole mrole,
+    DataType dtype,
+    clblasSide side)
+{
+    const size_t pitch = matrBlockPitch(dim, mrole, dtype, side);
+    const size_t height = matrBlockHeight(dim, mrole, side);
+
+    /* widen before multiplying */
+    return (cl_ulong)height * pitch;
+}
+
+/*
+ * Height of a matrix block: 'y' for matrix A, 'x' for matrix B, and for
+ * matrix C 'y' on the left side or 'x' otherwise. Any other role gives 0.
+ */
+size_t VISIBILITY_HIDDEN
+matrBlockHeight(
+    SubproblemDim *dim,
+    MatrixRole mrole,
+    clblasSide side)
+{
+    if (mrole == MATRIX_A) {
+        return dim->y;
+    }
+    if (mrole == MATRIX_B) {
+        return dim->x;
+    }
+    if (mrole == MATRIX_C) {
+        return (side == clblasLeft) ? dim->y : dim->x;
+    }
+
+    return 0;
+}
+
+/*
+ * Fill the x, y and bwidth components of 'probDim' from kernel arguments.
+ *
+ * With 'offset' set the offsetM/offsetN values are used and bwidth is 0;
+ * otherwise the M/N sizes are used and bwidth is K (or, for GEMV, the
+ * resulting x). SYMV takes both x and y from N (y from offsetN and x = 0 in
+ * offset mode). x and y are swapped for right-sided TRMM/TRSM and for
+ * transposed GEMV.
+ */
+void VISIBILITY_HIDDEN
+kargsToProbDims(
+    SubproblemDim *probDim,
+    BlasFunctionID funcID,
+    const CLBlasKargs *kargs,
+    bool offset)
+{
+    if (funcID == CLBLAS_SYMV) {
+        probDim->y = (offset) ? kargs->offsetN : kargs->N;
+        probDim->x = (offset) ? 0 : kargs->N;
+        probDim->bwidth = (offset) ? 0 : kargs->K;
+        return;
+    }
+
+    probDim->y = (offset) ? kargs->offsetM : kargs->M;
+    probDim->x = (offset) ? kargs->offsetN : kargs->N;
+
+    if (isRightSide(funcID, kargs->side)) {
+        swapDimXY(probDim);
+    }
+    if ((funcID == CLBLAS_GEMV) && (kargs->transA != clblasNoTrans)) {
+        swapDimXY(probDim);
+    }
+
+    if (offset) {
+        probDim->bwidth = 0;
+    }
+    else {
+        // GEMV uses the (possibly swapped) x instead of K
+        probDim->bwidth = (funcID == CLBLAS_GEMV) ? probDim->x : kargs->K;
+    }
+}
+
+/*
+ * Store problem dimensions back into kernel arguments.
+ *
+ * With 'offset' set, y and x go to offsetM and offsetN; otherwise they go to
+ * M and N and bwidth goes to K. x and y are swapped first for right-sided
+ * TRMM/TRSM and for transposed GEMV; 'probDim' itself is left untouched.
+ */
+void VISIBILITY_HIDDEN
+probDimsToKargs(
+    CLBlasKargs *kargs,
+    BlasFunctionID funcID,
+    SubproblemDim *probDim,
+    bool offset)
+{
+    SubproblemDim local = *probDim;
+
+    if (isRightSide(funcID, kargs->side)) {
+        swapDimXY(&local);
+    }
+    if ((funcID == CLBLAS_GEMV) && (kargs->transA != clblasNoTrans)) {
+        swapDimXY(&local);
+    }
+
+    if (offset) {
+        kargs->offsetM = local.y;
+        kargs->offsetN = local.x;
+    }
+    else {
+        kargs->K = probDim->bwidth;
+        kargs->M = local.y;
+        kargs->N = local.x;
+    }
+}
+
diff --git a/src/library/blas/generic/matrix_props.c b/src/library/blas/generic/matrix_props.c
new file mode 100644
index 0000000..65094ed
--- /dev/null
+++ b/src/library/blas/generic/matrix_props.c
@@ -0,0 +1,184 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Implementation of functions for determining matrix properties
+ */
+
+#include "matrix_props.h"
+
+/*
+ * GEMM flavour of the transposition query for the given matrix role.
+ *
+ * For A the answer is the TRANS_A flag flipped by the column-major flag, for
+ * B it is the TRANS_B flag flipped by the absence of the column-major flag,
+ * and for C it is the column-major flag itself. Other roles give false.
+ */
+static bool
+gemmIsTrans(KernelExtraFlags flags, MatrixRole mrole)
+{
+    const bool colMajor = ((flags & KEXTRA_COLUMN_MAJOR) != 0);
+
+    if (mrole == MATRIX_A) {
+        return (((flags & KEXTRA_TRANS_A) != 0) != colMajor);
+    }
+    if (mrole == MATRIX_B) {
+        // xor with the negated order flag is the same as equality
+        return (((flags & KEXTRA_TRANS_B) != 0) == colMajor);
+    }
+    if (mrole == MATRIX_C) {
+        return colMajor;
+    }
+
+    return false;
+}
+
+/*
+ * TRMM/TRSM flavour of the transposition query for the given matrix role.
+ *
+ * A per-role base value is flipped by the right-side flag:
+ *   A: TRANS_A flag xor column-major flag
+ *   B: column-major flag not set
+ *   C: column-major flag set (always the opposite of B)
+ * Any other role has the base value false.
+ */
+static bool
+trxmIsTrans(KernelExtraFlags flags, MatrixRole mrole)
+{
+    const bool colMajor = ((flags & KEXTRA_COLUMN_MAJOR) != 0);
+    const bool rightSide = ((flags & KEXTRA_SIDE_RIGHT) != 0);
+    bool base;
+
+    switch (mrole) {
+    case MATRIX_A:
+        base = (((flags & KEXTRA_TRANS_A) != 0) != colMajor);
+        break;
+    case MATRIX_B:
+        base = !colMajor;
+        break;
+    case MATRIX_C:
+        base = colMajor;
+        break;
+    default:
+        base = false;
+        break;
+    }
+
+    return (base != rightSide);
+}
+
+/*
+ * SYRK/SYR2K flavour of the transposition query: for A and B exactly one of
+ * the TRANS_A and column-major flags must be set; for C the column-major
+ * flag decides alone. Other roles give false.
+ */
+static bool
+syrkIsTrans(KernelExtraFlags flags, MatrixRole mrole)
+{
+    const bool colMajor = ((flags & KEXTRA_COLUMN_MAJOR) != 0);
+
+    if (mrole == MATRIX_A || mrole == MATRIX_B) {
+        const bool transA = ((flags & KEXTRA_TRANS_A) != 0);
+
+        return (transA != colMajor);
+    }
+    if (mrole == MATRIX_C) {
+        return colMajor;
+    }
+
+    return false;
+}
+
+/*
+ * Transposition query used for GEMV and SYMV: true for matrix A when
+ * exactly one of the TRANS_A and column-major flags is set, false for every
+ * other role.
+ */
+static bool
+l2IsTrans(KernelExtraFlags flags, MatrixRole mrole)
+{
+    bool transA, colMajor;
+
+    if (mrole != MATRIX_A) {
+        return false;
+    }
+
+    transA = ((flags & KEXTRA_TRANS_A) != 0);
+    colMajor = ((flags & KEXTRA_COLUMN_MAJOR) != 0);
+
+    return (transA != colMajor);
+}
+
+/*
+ * Report whether the conjugation flag belonging to the given matrix is set.
+ * Only A and B have such a flag; every other role gives false.
+ */
+bool
+isMatrixConj(KernelExtraFlags flags, MatrixRole mrole)
+{
+    if (mrole == MATRIX_A) {
+        return ((flags & KEXTRA_CONJUGATE_A) != 0);
+    }
+    if (mrole == MATRIX_B) {
+        return ((flags & KEXTRA_CONJUGATE_B) != 0);
+    }
+
+    return false;
+}
+
+/*
+ * Dispatch the column-major access query to the helper matching 'funcID'.
+ *
+ * TRMV, TRSV and TRSV_GEMV always answer true; function IDs that are not
+ * listed answer false.
+ */
+bool
+isMatrixAccessColMaj(
+    BlasFunctionID funcID,
+    KernelExtraFlags flags,
+    MatrixRole mrole)
+{
+    bool ret = false;
+
+    switch (funcID) {
+    case CLBLAS_SYMM:
+    case CLBLAS_GEMM:
+    case CLBLAS_GEMM2:
+        ret = gemmIsTrans(flags, mrole);
+        break;
+    case CLBLAS_TRMM:
+    case CLBLAS_TRSM:
+        ret = trxmIsTrans(flags, mrole);
+        break;
+    case CLBLAS_SYRK:
+    case CLBLAS_SYR2K:
+        ret = syrkIsTrans(flags, mrole);
+        break;
+    case CLBLAS_TRMV:
+    case CLBLAS_TRSV:
+    case CLBLAS_TRSV_GEMV:
+        ret = true;
+        break;
+    case CLBLAS_GEMV:
+    case CLBLAS_SYMV:
+        ret = l2IsTrans(flags, mrole);
+        // explicit break: do not rely on falling into 'default'
+        break;
+    default:
+        break;
+    }
+
+    return ret;
+}
diff --git a/src/library/blas/generic/problem_iter.c b/src/library/blas/generic/problem_iter.c
new file mode 100644
index 0000000..9792648
--- /dev/null
+++ b/src/library/blas/generic/problem_iter.c
@@ -0,0 +1,121 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// Problem iterator to scatter solving, for passing over matrix A
+
+#include <assert.h>
+#include <sys/types.h>
+#include <clblas_stddef.h>
+
+#include "matrix_dims.h"
+#include "problem_iter.h"
+
+/*
+ * Prepare 'iter' for walking over the matrix with role 'mrole'.
+ *
+ * The total size and global pitch are derived from the whole problem
+ * described by 'kargs', the block pitch and height from 'topDim'. The
+ * iterator is left at its start position (see iteratorReset()).
+ */
+void VISIBILITY_HIDDEN
+initProblemIterator(
+    ProblemIterator *iter,
+    BlasFunctionID funcID,
+    MatrixRole mrole,
+    CLBlasKargs *kargs,
+    size_t maxPanels,
+    size_t maxBlocks,
+    SubproblemDim *topDim)
+{
+    SubproblemDim wholeProblem;
+
+    // plain copies of the inputs
+    iter->funcID = funcID;
+    iter->mrole = mrole;
+    iter->maxPanels = maxPanels;
+    iter->maxBlocks = maxBlocks;
+    iter->uplo = kargs->uplo;
+    iter->side = kargs->side;
+    iter->dtype = kargs->dtype;
+
+    // extents of the whole problem
+    kargsToProbDims(&wholeProblem, funcID, kargs, false);
+    iter->size = matrBlockHeight(&wholeProblem, mrole, kargs->side);
+    iter->globPitch = matrBlockPitch(&wholeProblem, mrole, kargs->dtype,
+                                     kargs->side);
+
+    // extents of a single top level block
+    iter->bpitch = matrBlockPitch(topDim, mrole, kargs->dtype, kargs->side);
+    iter->bheight = matrBlockHeight(topDim, mrole, kargs->side);
+
+    iteratorReset(iter);
+}
+
+/*
+ * Put the iterator at its start: the total size for a backward iterator,
+ * 0 for a forward one. The previous position is set to the same value.
+ */
+void VISIBILITY_HIDDEN
+iteratorReset(ProblemIterator *iter)
+{
+    const size_t start = isIterBackward(iter) ? iter->size : 0;
+
+    iter->pos = start;
+    iter->prevPos = start;
+}
+
+/*
+ * Tell whether the iterator walks from the end towards position 0.
+ *
+ * GEMM always iterates forward. For other functions the left/lower and
+ * right/upper combinations iterate backward, and TRSM inverts that answer.
+ */
+bool VISIBILITY_HIDDEN
+isIterBackward(ProblemIterator *iter)
+{
+    bool backward;
+
+    if (iter->funcID == CLBLAS_GEMM) {
+        return false;
+    }
+
+    backward = (iter->side == clblasLeft && iter->uplo == clblasLower) ||
+               (iter->side == clblasRight && iter->uplo == clblasUpper);
+
+    return (iter->funcID == CLBLAS_TRSM) ? !backward : backward;
+}
+
+/*
+ * Advance the iterator by one step; returns 1 when the end position has
+ * been reached and 0 otherwise.
+ *
+ * A zero limit (maxBlocks for TRSM, maxPanels for everything else) jumps
+ * straight to the end without updating the previous position. Otherwise the
+ * step is maxPanels * bheight, clipped to the remaining distance. For TRSM
+ * the step stays 0 in this function, so the position is left where it is.
+ */
+int VISIBILITY_HIDDEN
+iterateProblem(ProblemIterator *iter)
+{
+    const bool backward = isIterBackward(iter);
+    const bool isTrsm = (iter->funcID == CLBLAS_TRSM);
+    const size_t limit = (isTrsm) ? iter->maxBlocks : iter->maxPanels;
+    size_t step = 0;
+
+    if (limit == 0) {
+        iter->pos = (backward) ? 0 : iter->size;
+        return 1;
+    }
+
+    iter->prevPos = iter->pos;
+
+    if (!isTrsm) {
+        step = iter->maxPanels * iter->bheight;
+        assert(step != 0);
+    }
+
+    if (backward) {
+        step = szmin(iter->pos, step);
+        iter->pos -= step;
+        return (iter->pos == 0);
+    }
+
+    step = szmin(step, iter->size - iter->pos);
+    iter->pos += step;
+    return (iter->pos == iter->size);
+}
+
+/* Distance between the current and the previous iterator position. */
+size_t VISIBILITY_HIDDEN
+iterLastOffset(ProblemIterator *iter)
+{
+    if (iter->pos > iter->prevPos) {
+        return iter->pos - iter->prevPos;
+    }
+
+    return iter->prevPos - iter->pos;
+}
diff --git a/src/library/blas/generic/problem_iter.h b/src/library/blas/generic/problem_iter.h
new file mode 100644
index 0000000..835a1dd
--- /dev/null
+++ b/src/library/blas/generic/problem_iter.h
@@ -0,0 +1,76 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef PROBLEM_ITERATOR_H_
+#define PROBLEM_ITERATOR_H_
+
+#include <kerngen.h>
+
+#include "clblas-internal.h"
+#include "blas_funcs.h"
+
+// Problem iterator to scatter solving, for passing over matrix A
+
+typedef struct ProblemIterator {
+    MatrixRole mrole;       // role of the matrix being passed over
+    size_t pos;             // current position; moved by iterateProblem()
+    size_t prevPos;         // position before the last iterateProblem() step
+    size_t size;            // end position: block height of the whole problem
+    size_t globPitch;       // block pitch computed for the whole problem
+    BlasFunctionID funcID;  // function the iterator was initialized for
+    clblasUplo uplo;        // copied from the kernel arguments
+    clblasSide side;        // copied from the kernel arguments
+    DataType dtype;         // copied from the kernel arguments
+    size_t maxPanels;       // step limit for everything except TRSM; 0 gives a single iteration
+    size_t maxBlocks;       // step limit for TRSM; 0 gives a single iteration
+    size_t bpitch;          // block pitch computed for the top level dimensions
+    size_t bheight;         // block height computed for the top level dimensions
+} ProblemIterator;
+
+/*
+ * @maxBlocks: maximal number of blocks to iterate with;
+ *             There is as little as 1 iteration if it is
+ *             set to 0.
+ */
+void VISIBILITY_HIDDEN
+initProblemIterator(
+    ProblemIterator *iter,
+    BlasFunctionID funcID,
+    MatrixRole mrole,
+    CLBlasKargs *kargs,
+    size_t maxPanels,
+    size_t maxBlocks,
+    SubproblemDim *topDim);
+
+void VISIBILITY_HIDDEN
+iteratorReset(ProblemIterator *iter);
+
+bool VISIBILITY_HIDDEN
+isIterBackward(ProblemIterator *iter);
+
+/*
+ * Iterate in some dimension based on the maximal blocks info;
+ * iteration over the 'SDIM_BWIDTH' component is prohibited.
+ * Returns 1 once the end position has been reached.
+ */
+int VISIBILITY_HIDDEN
+iterateProblem(ProblemIterator *iter);
+
+size_t VISIBILITY_HIDDEN
+iterLastOffset(ProblemIterator *iter);
+
+#endif /* PROBLEM_ITERATOR_H_ */
diff --git a/src/library/blas/generic/solution_assert.c b/src/library/blas/generic/solution_assert.c
new file mode 100644
index 0000000..1c94ef9
--- /dev/null
+++ b/src/library/blas/generic/solution_assert.c
@@ -0,0 +1,195 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "solution_assert.h"
+
+// assert that 'a' is not less than 'b' and is a whole multiple of 'b'
+#define ASSERT_GREQ_AND_DIV(a, b) assert(((a) >= (b)) && ((a) % (b) == 0))
+
+// solution area: the M/N offsets and sizes of one processed substep,
+// kept in a list of done substeps through 'node'
+typedef struct SolArea {
+    size_t offsetM;     // copied from the substep's 'offsetM' kernel argument
+    size_t M;           // copied from the substep's 'M' kernel argument
+    size_t offsetN;     // copied from the substep's 'offsetN' kernel argument
+    size_t N;           // copied from the substep's 'N' kernel argument
+    ListNode node;
+} SolArea;
+
+#ifdef ASSERT_GRANULATION
+
+// check that the found subproblem dimensions are not wrong
+void VISIBILITY_HIDDEN
+assertGranulation(
+    SubproblemDim *dims,
+    unsigned int nrDims,
+    PGranularity *pgran,
+    unsigned int thLevel)
+{
+    unsigned int i;
+    size_t gsize;
+
+    /*
+     * Subproblem dimensions on all levels must meet the following
+     * requirements:
+     *
+     * 1) An item work piece is not less than a processing step
+     * 2) An item work piece is evenly divisible by the processing step
+     * 3) Work pieces and processing steps don't grow when going down to
+     *    the bottom level
+     * 4) When passing to the thread level, the subproblem must be strictly
+     *    divisible among all the threads
+     *
+     * For level 0 the item checks are skipped when the respective item
+     * size is SUBDIM_UNUSED.
+     */
+
+    // total number of work items in a work group
+    gsize = pgran->wgSize[0] * pgran->wgSize[1];
+
+    for (i = 0; i < nrDims; i++) {
+        if (i || dims[i].itemX != SUBDIM_UNUSED) {
+            ASSERT_GREQ_AND_DIV(dims[i].itemX, dims[i].x);
+        }
+        if (i || dims[i].itemY != SUBDIM_UNUSED) {
+            ASSERT_GREQ_AND_DIV(dims[i].itemY, dims[i].y);
+        }
+        if (i) {
+            ASSERT_GREQ_AND_DIV(dims[i - 1].x, dims[i].itemX);
+            ASSERT_GREQ_AND_DIV(dims[i - 1].y, dims[i].itemY);
+            ASSERT_GREQ_AND_DIV(dims[i - 1].bwidth, dims[i].bwidth);
+        }
+    }
+
+    // NOTE(review): dims[thLevel - 1] is read here, so 'thLevel' is
+    // expected to be at least 1 - confirm against the callers
+    assert((dims[thLevel].itemX * dims[thLevel].itemY) * gsize ==
+           dims[thLevel - 1].x * dims[thLevel - 1].y);
+}
+
+#endif  // ASSERT_GRANULATION
+
+#ifdef ASSERT_IMAGE_STEPS
+
+// assert that the range [off1, off1 + size1) both starts and ends
+// inside the range [off2, off2 + size2)
+static __inline void
+assertEnclosed(size_t off1, size_t size1, size_t off2, size_t size2)
+{
+    const size_t end1 = off1 + size1;
+    const size_t end2 = off2 + size2;
+    bool enc;
+
+    enc = (off1 >= off2) && (off1 < end2);
+    enc = enc && (end1 > off2) && (end1 <= end2);
+    assert(enc);
+}
+
+/*
+ * Check whether the range [off1, off1 + size1) overlaps the range
+ * [off2, off2 + size2).
+ *
+ * The ranges are reported as intersected if the first one starts inside
+ * the second one, ends inside the second one, or strictly covers it.
+ * The last case used to be missed, so a range fully containing another
+ * one was treated as not intersecting with it.
+ */
+static __inline bool
+isIntersected(size_t off1, size_t size1, size_t off2, size_t size2)
+{
+    const size_t end1 = off1 + size1;
+    const size_t end2 = off2 + size2;
+
+    return ((off1 >= off2 && off1 < end2) ||
+            (end1 > off2 && end1 <= end2) ||
+            (off1 < off2 && end1 > end2));
+}
+
+// list callback: free the SolArea which the given list node belongs to
+static void
+freeSolAreaNode(ListNode *node)
+{
+    free(container_of(node, node, SolArea));
+}
+
+/*
+ * List callback: accumulate a processed area into the area passed via
+ * 'priv'. Along each of the M and N dimensions the size is added and the
+ * smaller offset is kept, but only if the processed area does not
+ * intersect the accumulated one along that dimension.
+ */
+static void
+accProcessed(ListNode *node, void *priv)
+{
+    SolArea *acc = (SolArea*)priv;
+    SolArea *done = container_of(node, node, SolArea);
+
+    if (!isIntersected(done->offsetM, done->M, acc->offsetM, acc->M)) {
+        acc->M += done->M;
+        acc->offsetM = (done->offsetM < acc->offsetM) ? done->offsetM :
+                                                        acc->offsetM;
+    }
+
+    if (!isIntersected(done->offsetN, done->N, acc->offsetN, acc->N)) {
+        acc->N += done->N;
+        acc->offsetN = (done->offsetN < acc->offsetN) ? done->offsetN :
+                                                        acc->offsetN;
+    }
+}
+
+/*
+ * List search comparator: 'b' points to kernel arguments of a substep.
+ * Returns 0 if the substep intersects the stored area along both the
+ * M and N dimensions, and 1 otherwise.
+ */
+static int
+solAreaCmp(ListNode *a, const void *b)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)b;
+    SolArea *area = container_of(a, node, SolArea);
+
+    if (!isIntersected(kargs->offsetM, kargs->M,
+                       area->offsetM, area->M)) {
+        return 1;
+    }
+
+    return isIntersected(kargs->offsetN, kargs->N,
+                         area->offsetN, area->N) ? 0 : 1;
+}
+
+/*
+ * Assert that a substep lies within the whole step and does not overlap
+ * any substep registered before, then register its area in the
+ * 'doneSubsteps' list. If the memory for the record cannot be allocated,
+ * a message is printed and the substep is not registered.
+ */
+void VISIBILITY_HIDDEN
+assertImageSubstep(
+    SolutionStep *wholeStep,
+    SolutionStep *substep,
+    ListHead *doneSubsteps)
+{
+    CLBlasKargs *sub = &substep->args;
+    CLBlasKargs *whole = &wholeStep->args;
+    ListNode *node;
+    SolArea *area;
+
+    assertEnclosed(sub->offsetM, sub->M, whole->offsetM, whole->M);
+    assertEnclosed(sub->offsetN, sub->N, whole->offsetN, whole->N);
+
+    // no already processed area may intersect the new one
+    node = listNodeSearch(doneSubsteps, (const void*)sub, solAreaCmp);
+    assert(!node);
+
+    area = malloc(sizeof(SolArea));
+    if (area == NULL) {
+        fprintf(stderr, "[%s, line %d]: Failed to allocate memory for image "
+                        "step assertion!\n", __FILE__, __LINE__);
+        return;
+    }
+
+    area->offsetM = sub->offsetM;
+    area->M = sub->M;
+    area->offsetN = sub->offsetN;
+    area->N = sub->N;
+    listAddToTail(doneSubsteps, &area->node);
+}
+
+/*
+ * Assert that the substeps registered in 'doneSubsteps' cover exactly
+ * the area of the whole step: the accumulated offsets and sizes along
+ * both M and N must match the ones of the whole step.
+ */
+void VISIBILITY_HIDDEN
+assertImageStep(SolutionStep *wholeStep, ListHead *doneSubsteps)
+{
+    SolArea area;
+
+    // start from an empty area; the offsets are lowered while accumulating
+    area.offsetM = SIZE_MAX;
+    area.M = 0;
+    area.offsetN = SIZE_MAX;
+    area.N = 0;
+    listDoForEachPriv(doneSubsteps, accProcessed, &area);
+
+    // the offset along N was previously never checked: 'offsetM' was
+    // compared twice instead
+    assert((area.offsetM == wholeStep->args.offsetM) &&
+           (area.M == wholeStep->args.M) &&
+           (area.offsetN == wholeStep->args.offsetN) &&
+           (area.N == wholeStep->args.N));
+}
+
+// free all the area records stored in 'doneSubsteps' and
+// reinitialize the list head
+void VISIBILITY_HIDDEN
+releaseImageAssertion(ListHead *doneSubsteps)
+{
+    listDoForEachSafe(doneSubsteps, freeSolAreaNode);
+    listInitHead(doneSubsteps);
+}
+
+#endif   /* ASSERT_IMAGE_STEPS */
+
diff --git a/src/library/blas/generic/solution_assert.h b/src/library/blas/generic/solution_assert.h
new file mode 100644
index 0000000..af76b4c
--- /dev/null
+++ b/src/library/blas/generic/solution_assert.h
@@ -0,0 +1,63 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SOLUTION_ASSERT_H_
+#define SOLUTION_ASSERT_H_
+
+#include "solution_seq.h"
+
+/*
+ * Granulation checking is compiled in only if ASSERT_GRANULATION is
+ * defined; otherwise the call expands to nothing.
+ */
+#ifdef ASSERT_GRANULATION
+
+void
+assertGranulation(
+    SubproblemDim *dims,
+    unsigned int nrDims,
+    PGranularity *pgran,
+    unsigned int thLevel);
+
+#else   // ASSERT_GRANULATION
+
+// stub, do nothing
+#define assertGranulation(dims, nrDims, pgran, thLevel)
+
+#endif  // !ASSERT_GRANULATION
+
+/*
+ * Image step checking is compiled in only if ASSERT_IMAGE_STEPS is
+ * defined; otherwise all the calls expand to nothing.
+ */
+#ifdef ASSERT_IMAGE_STEPS
+
+void
+assertImageSubstep(
+    SolutionStep *wholeStep,
+    SolutionStep *substep,
+    ListHead *doneSubsteps);
+
+void
+assertImageStep(SolutionStep *wholeStep, ListHead *doneSubsteps);
+
+void
+releaseImageAssertion(ListHead *doneSubsteps);
+
+#else   /* ASSERT_IMAGE_STEPS */
+
+// stubs, do nothing
+
+#define assertImageSubstep(wholeStep, substep, doneSubsteps)
+#define assertImageStep(wholeStep, doneSubsteps)
+#define releaseImageAssertion(doneSubsteps)
+
+#endif  /* !ASSERT_IMAGE_STEPS */
+
+#endif /* SOLUTION_ASSERT_H_ */
diff --git a/src/library/blas/generic/solution_seq.c b/src/library/blas/generic/solution_seq.c
new file mode 100644
index 0000000..cc52c42
--- /dev/null
+++ b/src/library/blas/generic/solution_seq.c
@@ -0,0 +1,465 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <clblas_stddef.h>
+
+#include "matrix_dims.h"
+#include "problem_iter.h"
+#include "solution_assert.h"
+#include "solution_seq.h"
+
+bool VISIBILITY_HIDDEN isMatrixInImage(MemoryPattern *pattern, MatrixRole mrole);
+void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step);
+
+static cl_int
+enqueueKernel(
+    SolutionStep *step,
+    const Kernel *kernel,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+static void
+splitSolutionStep(
+    SolutionStep *rem,
+    SolutionStep *cut,
+    SDimComponent component,
+    size_t chunk,
+    bool backward);
+
+static cl_int
+executeImageStep(
+    SolutionStep *step,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+/*
+ * Release every step of the solution sequence with freeSolutionStep()
+ * and reinitialize the list head.
+ */
+void
+freeSolutionSeq(ListHead *seq)
+{
+    listDoForEachSafe(seq, freeSolutionStep);
+    listInitHead(seq);
+}
+
+/*
+ * Enqueue the kernels of all the steps in the sequence, in list order.
+ * Steps without a command queue are skipped. Steps having a scratch image
+ * are run via executeImageStep(), the others enqueue their computing
+ * kernel directly. Stops at the first error and returns it.
+ */
+cl_int
+executeSolutionSeq(const ListHead *seq)
+{
+    cl_int status = CL_SUCCESS;
+    ListNode *cur = listNodeFirst(seq);
+
+    /* Enqueue computing kernels */
+    while ((cur != seq) && (status == CL_SUCCESS)) {
+        SolutionStep *step = container_of(cur, node, SolutionStep);
+
+        if (step->cmdQueue != NULL) {
+            if (step->args.scimage[0]) {
+                status = executeImageStep(step, step->numEventsInWaitList,
+                                          step->eventWaitList, step->event);
+            }
+            else {
+#ifdef DEBUG_2
+                printf("enqueueKernel from executreSolutionSeq...\n");
+#endif
+
+                status = enqueueKernel(step,
+                                       step->kernels[CLBLAS_COMPUTING_KERNEL],
+                                       step->numEventsInWaitList,
+                                       step->eventWaitList,
+                                       step->event);
+            }
+        }
+
+        cur = cur->next;
+    }
+
+    return status;
+}
+
+/* private functions */
+
+/*
+ * List callback: put back all the kernels held by the step to the kernel
+ * cache, release its scratch images and free the step itself.
+ */
+void VISIBILITY_HIDDEN
+freeSolutionStep(ListNode *node)
+{
+    SolutionStep *step = container_of(node, node, SolutionStep);
+    int idx;
+
+    for (idx = 0; idx < MAX_CLBLAS_KERNELS_PER_STEP; idx++) {
+        if (step->kernels[idx] == NULL) {
+            continue;
+        }
+        putKernel(clblasKernelCache, step->kernels[idx]);
+    }
+
+    releaseStepImgs(step);
+    free(step);
+}
+
+/*
+ * Enqueue one kernel of a solution step.
+ *
+ * Evaluates the global thread sizes (either with the pattern's own
+ * 'calcThreads' callback or with calcGlobalThreads()), stores the number
+ * of spawned work groups into the step's granularity, fills the kernel
+ * arguments with the pattern's 'assignKargs' callback, creates the kernel
+ * object from the program and launches it on the step's command queue.
+ *
+ * Returns the error code of the kernel creation or of the launch.
+ */
+static cl_int
+enqueueKernel(
+    SolutionStep *step,
+    const Kernel *kernel,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event)
+{
+    cl_int err;
+    KernelDesc kernelDesc;
+    KernelErrorInfo errInfo;
+    MemoryPattern *pattern;
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)kernel->extra;
+    SubproblemDim subdims[MAX_SUBDIMS];
+
+    step->args.kernType = kextra->kernType;
+    pattern = &clblasSolvers[step->funcID].memPatterns[step->patternID];
+    kernelDesc.workDim = step->pgran.wgDim;
+
+    // work with a local copy since the dimensions may be swapped below
+    memcpy(subdims, step->subdims, sizeof(step->subdims));
+
+    if(NULL==pattern->sops->calcThreads)
+    {
+        SubproblemDim globDim;
+        const PGranularity *pgran;
+
+        pgran = (pattern->nrLevels == 1) ? NULL : &step->pgran;
+        kargsToProbDims(&globDim, step->funcID, &step->args, false);
+
+        // fixup dimensions in respect with desired work dispatch order
+        if ((step->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) {
+            if (pattern->sops->innerDecompositionAxis(&step->args) ==
+                DECOMP_AXIS_X) {
+
+                /*
+                 * these dimensions will not be used anywhere else, so we
+                 * can just swap them
+                 */
+                swapDimXY(&subdims[0]);
+                swapDimXY(&subdims[1]);
+                swapDimXY(&globDim);
+            }
+        }
+
+        calcGlobalThreads(kernelDesc.globalThreads, subdims,
+                          pgran, globDim.y, globDim.x);
+    }
+    else
+    {
+		#ifdef DEBUG_2
+		printf("calcThreads is defined\n");
+		#endif
+
+		pattern->sops->calcThreads(	kernelDesc.globalThreads,
+									subdims,
+									&step->pgran,
+									&step->args,
+									kextra);
+    }
+
+    //
+    // Store the numWGSpawned for this kernel
+    // This size can be used by sequence-steps down the line
+    // e.g. Reduction of intermediate results of each work group
+    //
+    step->pgran.numWGSpawned[0] = kernelDesc.globalThreads[0] / step->pgran.wgSize[0];
+    step->pgran.numWGSpawned[1] = kernelDesc.globalThreads[1] / step->pgran.wgSize[1];
+
+    kernelDesc.localThreads[0] = step->pgran.wgSize[0];
+    kernelDesc.localThreads[1] = step->pgran.wgSize[1];
+    kernelDesc.workDim = step->pgran.wgDim;
+    kernelDesc.waitListSize = numEventsInWaitList;
+    kernelDesc.eventWaitList = eventWaitList;
+    kernelDesc.nowait = 1;
+    kernelDesc.event = event;
+    kernelDesc.needExecTime = 0;
+
+    memset(kernelDesc.args, 0, sizeof(KernelArg) * MAX_KERNEL_ARGS);
+    pattern->sops->assignKargs(kernelDesc.args, (const void*)&(step->args),
+                               kextra);
+
+    errInfo.wrongArg = 0;
+    errInfo.phase = 0;
+
+    /*
+     * TODO: log launchClKernel errors
+     */
+    dumpKernel(step, kextra->kernType);
+
+    // the kernel object lives only for the duration of this launch call
+    err = clCreateKernelsInProgram(kernel->program, 1, &kernelDesc.kernel,
+                                   NULL);
+    if (err == CL_SUCCESS) {
+        err = launchClKernel(&kernelDesc, step->cmdQueue, &errInfo);
+        clReleaseKernel(kernelDesc.kernel);
+    }
+
+    return err;
+}
+
+/*
+ * Check whether the memory pattern keeps the matrix of the given role
+ * in an image. Only MATRIX_A and MATRIX_B can be reported as stored in
+ * an image; 'false' is returned for any other role or if the pattern has
+ * no extra information.
+ */
+bool VISIBILITY_HIDDEN
+isMatrixInImage(
+    MemoryPattern *pattern,
+    MatrixRole mrole)
+{
+    const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra;
+
+    if (extra == NULL) {
+        return false;
+    }
+    if (mrole == MATRIX_A) {
+        return (extra->mobjA == CLMEM_IMAGE);
+    }
+    if (mrole == MATRIX_B) {
+        return (extra->mobjB == CLMEM_IMAGE);
+    }
+
+    return false;
+}
+
+/*
+ * Put back the scratch images held by the step, stopping at the first
+ * empty slot. The device is queried from the step's command queue only
+ * when there is at least one image to release.
+ */
+void VISIBILITY_HIDDEN
+releaseStepImgs(SolutionStep *step)
+{
+    cl_mem *imgs = step->args.scimage;
+    cl_device_id devID = NULL;
+    int idx = 0;
+
+    while ((idx < 2) && (imgs[idx] != NULL)) {
+        if (devID == NULL) {
+            getQueueDevice(step->cmdQueue, &devID);
+        }
+        putSCImage(devID, imgs[idx]);
+        imgs[idx] = NULL; //to avoid double release
+        idx++;
+    }
+}
+
+/*
+ * Execute a solution step using scratch images.
+ *
+ * The whole problem is covered with two nested problem iterators, an
+ * outer one and an inner one, each passing over one of the matrices
+ * A and B. For every outer chunk the respective preparative kernel is
+ * enqueued if there is one; for every inner chunk the other preparative
+ * kernel (if any) and then the computing kernel are enqueued.
+ *
+ * Returns CL_SUCCESS or the error of the first failed kernel enqueue.
+ */
+static cl_int
+executeImageStep(
+    SolutionStep *step,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event)
+{
+    SolutionStep outerStep, innerStep, execStep;
+    cl_int err = CL_SUCCESS;
+    int currImg = 0;
+    size_t imgWidth, imgHeight;
+    size_t ha, hb;
+    size_t maxPanels[MATRIX_ROLES_NUMBER], maxBlocks[MATRIX_ROLES_NUMBER];
+    size_t off;
+    SubproblemDim wholeDim;
+    MatrixRole mrole;
+    CLBlasKargs *kargs = &step->args;
+    cl_mem *imgs = kargs->scimage;
+    MemoryPattern *mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID];
+    ProblemIterator innerIter, outerIter;
+    int oend = 0, iend;
+    SDimComponent comp[2];
+    bool backward;
+    ListHead doneSteps;
+    CLBlasKernelType ktype;
+
+    kargsToProbDims(&wholeDim, step->funcID, kargs, false);
+    memset(maxPanels, 0, sizeof(maxPanels));
+    // each array is cleared with its own size ('maxPanels' size was used
+    // for both before)
+    memset(maxBlocks, 0, sizeof(maxBlocks));
+
+    memcpy(&outerStep, step, sizeof(SolutionStep));
+    memcpy(&execStep, step, sizeof(SolutionStep));
+    listInitHead(&doneSteps);
+
+    /*
+     * Cover the whole problem with dimensions at which the matrix blocks
+     * fit into the images.
+     */
+
+    for (mrole = MATRIX_A; mrole < MATRIX_C; mrole++) {
+        if (!isMatrixInImage(mempat, mrole)) {
+            continue;
+        }
+
+        // NOTE(review): the results of clGetImageInfo() are not checked
+        clGetImageInfo(imgs[currImg], CL_IMAGE_WIDTH, sizeof(imgWidth),
+                       &imgWidth, NULL);
+        clGetImageInfo(imgs[currImg], CL_IMAGE_HEIGHT, sizeof(imgHeight),
+                       &imgHeight, NULL);
+
+        if (step->funcID == CLBLAS_TRSM) {
+            maxPanels[mrole] = 0;
+            maxBlocks[mrole] = 0;
+        } else {
+            maxPanels[mrole] = imgHeight / matrBlockHeight(step->subdims, mrole,
+                                                           clblasLeft);
+        }
+        currImg++;
+    }
+
+    /*
+     * for the GEMM function we can take either of the matrices as the
+     * outer one, it depends on their sizes and image sizes
+     */
+    if (step->funcID == CLBLAS_GEMM) {
+        size_t dx, dy;
+
+        // FIXME: check which of them use really an image
+
+        ha = matrBlockHeight(&wholeDim, MATRIX_A, clblasLeft);
+        hb = matrBlockHeight(&wholeDim, MATRIX_B, clblasLeft);
+
+        dx = maxPanels[MATRIX_B] * matrBlockHeight(step->subdims, MATRIX_B,
+                                                   clblasLeft);
+        dy = maxPanels[MATRIX_A] * matrBlockHeight(step->subdims, MATRIX_A,
+                                                   clblasLeft);
+
+        /*
+         * NOTE(review): 'dx' or 'dy' stays 0 if the respective matrix is
+         * not stored in an image, and 'ha' may be 0 as well; the divisions
+         * below are not protected against that (see the FIXME above).
+         */
+        // hb + (hb*ha)/dx < ha + (ha*hb)/dy
+        if ((hb / ha) < (1 + hb / dy) / (1 + ha / dx)) {
+            mrole = MATRIX_B;
+        }
+        else {
+            mrole = MATRIX_A;
+        }
+    }
+    else {
+        mrole = MATRIX_B;
+    }
+    /*
+     * Let's cover the whole image based step.
+     * Pattern iterator is used for traversing
+     */
+    initProblemIterator(&outerIter, step->funcID, mrole, kargs,
+                        maxPanels[mrole], maxBlocks[mrole], step->subdims);
+    // the inner iterator passes over the other matrix
+    if (mrole == MATRIX_B) {
+        comp[0] = SDIM_X;
+        comp[1] = SDIM_Y;
+        mrole = MATRIX_A;
+    }
+    else {
+        comp[0] = SDIM_Y;
+        comp[1] = SDIM_X;
+        mrole = MATRIX_B;
+    }
+    initProblemIterator(&innerIter, step->funcID, mrole,
+                        kargs, maxPanels[mrole], maxBlocks[mrole],
+                        step->subdims);
+    backward = isIterBackward(&innerIter);
+
+    /*
+     * Difference in overflowing checking in the outer and inner loops
+     * is due to
+     * (NOTE(review): this comment was left unfinished in the original)
+     */
+    do {
+        iteratorReset(&innerIter);
+        iend = 0;
+        oend = iterateProblem(&outerIter);
+        off = iterLastOffset(&outerIter);
+
+        // cut the next outer chunk from the remainder of the whole step
+        splitSolutionStep(&outerStep, &execStep, comp[0],
+                                  off, false);
+        if (execStep.funcID == CLBLAS_GEMM) {
+            fixupGemmOffsets(&execStep.args, execStep.extraFlags, 0);
+        }
+
+        memcpy(&innerStep, &execStep, sizeof(SolutionStep));
+
+        ktype = (comp[0] == SDIM_Y) ? CLBLAS_PREP_A_KERNEL :
+                                      CLBLAS_PREP_B_KERNEL;
+
+        if (execStep.kernels[ktype] != NULL) {
+            err = enqueueKernel(&execStep, execStep.kernels[ktype],
+                                numEventsInWaitList, eventWaitList, event);
+            if (err != CL_SUCCESS) {
+                 break;
+            }
+        }
+
+        do {
+            iend = iterateProblem(&innerIter);
+            off = iterLastOffset(&innerIter);
+            // cut the next inner chunk from the current outer chunk
+            splitSolutionStep(&innerStep, &execStep,
+                              comp[1], off, backward);
+            if (execStep.funcID == CLBLAS_GEMM) {
+                fixupGemmOffsets(&execStep.args, execStep.extraFlags, 0);
+            }
+
+            assertImageSubstep(step, &execStep, &doneSteps);
+
+            ktype = (comp[1] == SDIM_Y) ? CLBLAS_PREP_A_KERNEL :
+                                          CLBLAS_PREP_B_KERNEL;
+            if (execStep.kernels[ktype] != NULL) {
+                err = enqueueKernel(&execStep, execStep.kernels[ktype],
+                                    numEventsInWaitList, eventWaitList, event);
+            }
+            if (err == CL_SUCCESS) {
+                err = enqueueKernel(&execStep,
+                                    execStep.kernels[CLBLAS_COMPUTING_KERNEL],
+                                    numEventsInWaitList, eventWaitList,
+                                    event);
+            }
+        } while (!iend && (err == CL_SUCCESS));
+    } while (!oend && (err == CL_SUCCESS));
+
+    if (err == CL_SUCCESS) {
+        assertImageStep(step, &doneSteps);
+    }
+    releaseImageAssertion(&doneSteps);
+
+    return err;
+}
+
+/*
+ * Cut a chunk of the given size from a solution step along the X or
+ * Y component ('component' equal to SDIM_Y selects Y, anything else
+ * selects X).
+ *
+ * 'cut' receives a copy of 'rem' whose size along the component is set
+ * to 'chunk', while the size of 'rem' is decreased by 'chunk'. When
+ * cutting forward the chunk is taken at the start of the remainder and
+ * the remainder offset is advanced; when cutting backward the chunk is
+ * taken at the end of the remainder.
+ */
+static void
+splitSolutionStep(
+    SolutionStep *rem,
+    SolutionStep *cut,
+    SDimComponent component,
+    size_t chunk,
+    bool backward)
+{
+    SubproblemDim remDimOff, remDim;
+    SubproblemDim cutDimOff, cutDim;
+    const bool alongY = (component == SDIM_Y);
+
+    kargsToProbDims(&remDimOff, rem->funcID, &rem->args, true);
+    kargsToProbDims(&remDim, rem->funcID, &rem->args, false);
+    cutDim = remDim;
+    cutDimOff = remDimOff;
+
+    memcpy(cut, rem, sizeof(SolutionStep));
+
+    // place the chunk; the not yet decreased remainder size is used here
+    if (backward) {
+        if (alongY) {
+            cutDimOff.y += remDim.y - chunk;
+        }
+        else {
+            cutDimOff.x += remDim.x - chunk;
+        }
+    }
+    else if (alongY) {
+        remDimOff.y += chunk;
+    }
+    else {
+        remDimOff.x += chunk;
+    }
+
+    // move 'chunk' units from the remainder to the cut part
+    if (alongY) {
+        cutDim.y = chunk;
+        remDim.y -= chunk;
+    }
+    else {
+        cutDim.x = chunk;
+        remDim.x -= chunk;
+    }
+
+    probDimsToKargs(&rem->args, rem->funcID, &remDimOff, true);
+    probDimsToKargs(&rem->args, rem->funcID, &remDim, false);
+    probDimsToKargs(&cut->args, cut->funcID, &cutDimOff, true);
+    probDimsToKargs(&cut->args, cut->funcID, &cutDim, false);
+}
diff --git a/src/library/blas/generic/solution_seq_make.c b/src/library/blas/generic/solution_seq_make.c
new file mode 100644
index 0000000..0eee1fd
--- /dev/null
+++ b/src/library/blas/generic/solution_seq_make.c
@@ -0,0 +1,2364 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include <clblas_stddef.h>
+#include <clblas-internal.h>
+#include <toolslib.h>
+#include <events.h>
+
+#include "matrix_dims.h"
+#include "solution_assert.h"
+#include "solution_seq.h"
+
+// Threshold scaled by the data type size: 2560 for a type having the
+// size of cl_float, proportionally smaller for wider types.
+// NOTE(review): presumably a problem size above which a step gets
+// decomposed - confirm against the users of the macro
+#define DECOMPOSITION_THRESHOLD(type) (2560 * sizeof(cl_float) / dtypeSize(type))
+
+/* From solution_seq.c */
+bool VISIBILITY_HIDDEN isMatrixInImage(MemoryPattern *pattern, MatrixRole mrole);
+void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step);
+
+// check the matrix against the L2 and L1 memory levels
+#define isMatrixCached(pattern, mrole)                              \
+    checkMatrixMemLevelSet(pattern, mrole, (CLMEM_LEVEL_L2 | CLMEM_LEVEL_L1))
+
+// true if the LDS level check succeeds for either matrix A or matrix B
+#define isLdsUsed(pattern)                                          \
+    (checkMatrixMemLevelSet(pattern, MATRIX_A, CLMEM_LEVEL_LDS) ||  \
+     checkMatrixMemLevelSet(pattern, MATRIX_B, CLMEM_LEVEL_LDS))
+
+// NOTE(review): presumably default work group sizes along each dimension
+// for buffer based and cached buffer based patterns - confirm
+enum {
+    DEFAULT_BUFS_LSIZE_0 = 8,
+    DEFAULT_BUFS_LSIZE_1 = 8,
+    DEFAULT_CACHED_BUFS_LSIZE_0 = 8,
+    DEFAULT_CACHED_BUFS_LSIZE_1 = 8
+};
+
+static cl_uint getQueueMaxImages(cl_command_queue queue);
+
+static bool checkMatrixMemLevelSet(MemoryPattern *pattern, MatrixRole mrole,
+    meml_set_t mask);
+
+static void stripeDivision(BlasFunctionID funcID, const CLBlasKargs *args,
+    ListHead *seq, cl_uint totalCUs);
+static void rectDivision(BlasFunctionID funcID, const CLBlasKargs *args,
+    ListHead *seq, cl_uint totalCUs);
+static void triMatrixStripeDivision(BlasFunctionID funcID,
+    const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs);
+
+static cl_bool findBestPattern(SolutionStep *step);
+
+static void getDefaultStepGranulation(SolutionStep *step);
+static bool avoidLoadFromStorage(SolutionStep *step);
+
+static bool getStepResources(SolutionStep *step);
+static void getSuitableImageSizes(size_t *minWidth, size_t *minHeight,
+    size_t *bestHeight, MatrixRole mrole, CLBlasKargs *kargs, unsigned int vecLen,
+    SubproblemDim *subdims);
+
+static ListNode* decomposeTRXMStep(SolutionStep *step);
+static ListNode* decomposeSYRKStep(SolutionStep *step);
+static ListNode* decomposeSYR2KStep(SolutionStep *step);
+
+/*
+ * Find a vector length which both the leading dimension and the tile
+ * width are divisible by.
+ *
+ * @ld: leading dimension (or an offset) to be checked
+ * @tsize: size of the data type in bytes
+ * @twidth: tile width
+ * @funcLevel: BLAS level of the function; the starting vector length
+ *             is doubled for level 3
+ *
+ * The starting length is the number of elements fitting into cl_float4
+ * (twice as many for level 3). It is halved until it does not exceed the
+ * tile width and divides both 'ld' and 'twidth'.
+ *
+ * Returns the found vector length, which is never less than 1. A zero
+ * 'tsize' or 'twidth', or a type larger than cl_float4, gives 1 instead
+ * of leading to a division by zero.
+ */
+unsigned int
+appropriateVecLen(size_t ld, unsigned int tsize, size_t twidth, int funcLevel)
+{
+    unsigned int vlen = 0;
+
+    if (tsize != 0) {
+        vlen = sizeof(cl_float4) / tsize;
+    }
+    if (funcLevel == 3) {
+        vlen *= 2;
+    }
+
+    // never go below 1, so the modulo operations are always defined
+    while ((vlen > 1) &&
+           ((vlen > twidth) || (ld % vlen) || (twidth % vlen))) {
+        vlen /= 2;
+    }
+
+    return (vlen != 0) ? vlen : 1;
+}
+
+/*
+ * Select an appropriate vectorization to perform computation with.
+ * It's done based upon the problem sizes and device type. The device type
+ * is taken into account as well since not all devices allow unaligned
+ * access to vector data.
+ *
+ * The leading dimensions are checked first and the starting offsets
+ * second. On success the vector lengths and the flags of 'kextra' are
+ * updated and CL_SUCCESS is returned; if the device type cannot be
+ * queried, the error is returned and 'kextra' is left untouched.
+ */
+
+cl_int
+selectVectorization(
+    const SolutionStep *step,
+    CLBLASKernExtra *kextra)
+{
+    const TargetDevice *device = &step->device;
+    cl_device_type devType;
+    cl_int err;
+    size_t tw;
+    bool tra;
+    size_t checkedSizes[3];
+    int i, j;
+    const CLBlasKargs *kargs = &step->args;
+    KernelExtraFlags kflags = kextra->flags;
+    KernelExtraFlags vecFlags[3] = { KEXTRA_NO_COPY_VEC_A, KEXTRA_NO_COPY_VEC_B,
+                                     KEXTRA_NO_COPY_VEC_C };
+    unsigned int vlen;
+    unsigned int tsize;
+    MemoryPattern *mempat;
+    const SubproblemDim *dim = &step->subdims[1];
+    int funcLevel;
+
+    mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID];
+    err = clGetDeviceInfo(device->id, CL_DEVICE_TYPE, sizeof(devType),
+                          &devType, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    if (isLdsUsed(mempat)) {
+        kextra->vecLenC = kextra->vecLen = sizeof(cl_float4) /
+                                                  dtypeSize(step->args.dtype);
+        kextra->vecLenA = kextra->vecLenB = kextra->vecLen;
+    }
+    else {
+        /*
+         * NOTE(review): 'vecLenC' is not reset here, so for patterns not
+         * using LDS its incoming value limits the result below - confirm
+         * the callers pass it zeroed.
+         */
+        kextra->vecLenA = kextra->vecLenB = 0;
+    }
+
+    // select vectorization based upon leading dimensions and starting offsets
+    for (i = 0; i < 2; i++) {
+        if (!i) {
+            // check by leading dimensions
+            checkedSizes[0] = kargs->lda.matrix;
+            if (funcBlasLevel(step->funcID) == 2) {
+                checkedSizes[1] = checkedSizes[2] = 0;
+            }
+            else {
+                checkedSizes[1] = kargs->ldb.matrix;
+                checkedSizes[2] = kargs->ldc.matrix;
+            }
+        }
+        else {
+            // check by offsets
+            checkedSizes[0] = kargs->offA;
+            checkedSizes[1] = kargs->offBX;
+            checkedSizes[2] = kargs->offCY;
+        }
+
+        if (funcHasTriangMatrix(step->funcID)) {
+            checkedSizes[2] = checkedSizes[1];
+        }
+
+        vlen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
+
+        /*
+         * Disable vectorization at load from the global memory to LDS
+         * if matrix width is not aligned on the boundary of the float4
+         */
+        for (j = 0; j < 3; j++) {
+            if (checkedSizes[j] % vlen) {
+                kflags |= vecFlags[j];
+            }
+        }
+
+        if ((step->funcID == CLBLAS_TRMV) || (step->funcID == CLBLAS_HEMV))
+        {
+            if ( ( ((kflags & KEXTRA_UPPER_TRIANG)==0) && (kflags & KEXTRA_COLUMN_MAJOR) ) ||
+                 ( ((kflags & KEXTRA_UPPER_TRIANG)) && ((kflags & KEXTRA_COLUMN_MAJOR) == 0)) )
+
+            {
+                if( (kargs->N) % vlen)
+                {
+                    kflags |= KEXTRA_NO_COPY_VEC_A;
+                }
+            }
+        }
+
+        // let the pattern add its own vectorization related flags
+        if(mempat->sops->selectVectorization != NULL)
+        {
+            kflags |= mempat->sops->selectVectorization((void *)kargs, vlen);
+        }
+
+        if ((step->funcID == CLBLAS_TRSV) || (step->funcID == CLBLAS_TRSV_GEMV))
+        {
+            //
+            // TRTRI, GEMV Part - Only Scalar loads
+            // PENDING:
+            // Analyze Case by Case and selectively enable/disable
+            //
+            kflags |= KEXTRA_NO_COPY_VEC_A;
+            kflags |= KEXTRA_NO_COPY_VEC_B;
+        }
+
+        //
+        // Routines that Use LDS should be above this IF statement
+        //
+        if (isLdsUsed(mempat)) {
+            continue;
+        }
+
+        //
+        // Routines that dont use LDS have to be below the isLdsUsed() code
+        //
+        if (step->funcID == CLBLAS_GEMM2)
+        {
+            if ((step->subdims[0].y > step->args.M) || (step->subdims[0].x > step->args.N))
+            {
+                kextra->vecLen = 1;
+            } else {
+                kextra->vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
+            }
+            kextra->vecLenA = kextra->vecLen;
+            kextra->vecLenB = kextra->vecLen;
+            kextra->vecLenC = kextra->vecLen;
+            continue;
+        }
+
+        if (step->funcID == CLBLAS_GEMM_TAIL)
+        {
+            kextra->vecLen =  1;
+            kextra->vecLenA = 1;
+            kextra->vecLenB = 1;
+            kextra->vecLenC = 1;
+            continue;
+        }
+        funcLevel = funcBlasLevel(step->funcID);
+
+        /*
+         * If the step's pattern uses LDS, it is responsible for alignment.
+         * Otherwise it's needed to provide appropriate vector length
+         */
+        tsize = dtypeSize(step->args.dtype);
+        tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_A);
+        tw = (tra) ? dim->y : dim->bwidth;
+        vlen = appropriateVecLen(checkedSizes[0], tsize, tw, funcLevel);
+        kextra->vecLenA = (kextra->vecLenA) ? umin(kextra->vecLenA, vlen) :
+                                              vlen;
+
+        tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_B);
+        tw = ((funcLevel == 2) || !tra) ? dim->bwidth : dim->x;
+        vlen = appropriateVecLen(checkedSizes[1], tsize, tw, funcLevel);
+        kextra->vecLenB = (kextra->vecLenB) ? umin(kextra->vecLenB, vlen) :
+                                              vlen;
+
+        tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_C );
+        tw = ((funcLevel == 2) || tra) ? dim->y : dim->x;
+        vlen = appropriateVecLen( checkedSizes[2],
+            tsize,
+            tw,
+            funcLevel );
+        kextra->vecLenC = kextra->vecLenC ? umin(vlen,kextra->vecLenC) :
+                                            vlen;
+
+        // the common vector length is the least of the three
+        kextra->vecLen = umin(kextra->vecLenA, kextra->vecLenB);
+        kextra->vecLen = umin(kextra->vecLenC, kextra->vecLen);
+    }
+
+    kextra->flags = kflags;
+
+    return CL_SUCCESS;
+}
+
+/*
+ * Fold the 'offsetM' and 'offsetN' fields into the respective extra
+ * offsets 'offA', 'offBX' and 'offCY', taking into account the offset
+ * along K, and then zero 'offsetM' and 'offsetN'.
+ */
+void VISIBILITY_HIDDEN
+fixupGemmOffsets(CLBlasKargs *kargs, KernelExtraFlags kflags, size_t offsetK)
+{
+    const bool colMajA = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A);
+    const bool colMajB = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B);
+    const bool colMajC = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_C);
+
+    kargs->offA += colMajA ?
+        (offsetK * kargs->lda.matrix + kargs->offsetM) :
+        (kargs->offsetM * kargs->lda.matrix + offsetK);
+
+    kargs->offBX += colMajB ?
+        (offsetK * kargs->ldb.matrix + kargs->offsetN) :
+        (kargs->offsetN * kargs->ldb.matrix + offsetK);
+
+    kargs->offCY += colMajC ?
+        (kargs->offsetN * kargs->ldc.matrix + kargs->offsetM) :
+        (kargs->offsetM * kargs->ldc.matrix + kargs->offsetN);
+
+    kargs->offsetM = kargs->offsetN = 0;
+}
+
+/*
+ * Dispatch a step to the function-specific decomposition routine.
+ *
+ * TRMM/TRSM, SYRK and SYR2K steps are handed to their dedicated
+ * decomposers; for every other function the step's own list node is
+ * returned unchanged.
+ */
+ListNode
+*decomposeProblemStep(SolutionStep *step)
+{
+    switch (step->funcID) {
+    case CLBLAS_TRMM:
+    case CLBLAS_TRSM:
+        return decomposeTRXMStep(step);
+    case CLBLAS_SYRK:
+        return decomposeSYRKStep(step);
+    case CLBLAS_SYR2K:
+        return decomposeSYR2KStep(step);
+    default:
+        break;
+    }
+
+    return &step->node;
+}
+
+/*
+ * Build the sequence of solution steps needed to run a BLAS function on the
+ * given command queues.
+ *
+ * One step is created per usable queue, the problem is split between the
+ * steps, steps are decomposed where the function requires it, and finally a
+ * kernel is found (in the cache or storage) or generated for every step.
+ *
+ * funcID               BLAS function to solve
+ * args                 kernel arguments describing the whole problem
+ * numCommandQueues     number of entries in 'commandQueues'
+ * commandQueues        queues to distribute the work over
+ * numEventsInWaitList  number of entries in 'eventWaitList'
+ * eventWaitList        events every step has to wait for
+ * events               optional array, one output event slot per queue
+ * seq                  list head receiving the resulting steps
+ *
+ * Returns CL_SUCCESS, or:
+ *   CL_INVALID_VALUE          - no command queues were passed
+ *   CL_OUT_OF_HOST_MEMORY     - a step could not be allocated
+ *   CL_INVALID_DEVICE         - no compute units available and every queue
+ *                               lacks native double support
+ *   CL_INVALID_COMMAND_QUEUE  - no compute units available otherwise
+ *   CL_OUT_OF_RESOURCES       - no pattern fits the step's resources
+ *   or an error propagated from a helper.
+ * On failure the sequence is released with freeSolutionSeq().
+ */
+cl_int
+makeSolutionSeq(
+    BlasFunctionID funcID,
+    const CLBlasKargs *args,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events,
+    ListHead *seq)
+{
+    cl_int err;
+    cl_uint j, totalCUs, numDevicesWithoutDoubles;
+    bool hasDouble;
+    SolutionStep *step;
+    CLBLASKernExtra extra;
+    ListNode *i;
+    MemoryPattern *pattern;
+    solver_id_t sid;
+    KernelKey key;
+    /* only the computing kernel is needed unconditionally */
+    bool need[MAX_CLBLAS_KERNELS_PER_STEP] = {true};
+    CLBlasKernelType ktype;
+    Kernel *kernel;
+    bool loadData = false;
+    unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP];
+    size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP];
+    char bopts[BUILD_OPTS_MAXLEN]; // Moving bopts up. See the comments before findKernel()
+    int ik;
+    // first subdimension index in the subproblem dims array
+    int firstDimIdx;
+
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return CL_INVALID_VALUE;
+    }
+
+    memset(buffer, 0, sizeof(buffer));
+    listInitHead(seq);
+
+    /* Create one step per queue whose device can handle the data type */
+
+    totalCUs = 0;
+    numDevicesWithoutDoubles = 0;
+    for (j = 0; j < numCommandQueues; j++) {
+        cl_device_id devID;
+
+        err = getQueueDevice(commandQueues[j], &devID);
+        if (err != CL_SUCCESS) {
+            continue;
+        }
+        if (isDoubleBasedType(args->dtype)) {
+            hasDouble = deviceHasNativeDouble(devID, &err);
+            if (err != CL_SUCCESS) {
+                continue;
+            }
+            if (!hasDouble) {
+                numDevicesWithoutDoubles++;
+                continue;
+            }
+        }
+
+        step = calloc(1, sizeof(SolutionStep));
+        if (step == NULL) {
+            freeSolutionSeq(seq);
+            return CL_OUT_OF_HOST_MEMORY;
+        }
+
+        step->funcID = funcID;
+        step->args = *args;
+        step->args.addrBits = deviceAddressBits(devID, &err);
+        step->cmdQueue = commandQueues[j];
+        step->numEventsInWaitList = numEventsInWaitList;
+        step->eventWaitList = eventWaitList;
+        step->event = NULL;
+        if (events != NULL) {
+            step->event = events + j;
+        }
+        step->pgran.wfSize = deviceWavefront(devID, &err);
+        step->extraFlags = clblasArgsToKextraFlags(args, step->funcID);
+        if (step->funcID == CLBLAS_SYR2K) {
+            step->extraFlags |= KEXTRA_SYRK_2K_RANK;
+        }
+
+        step->device.id = devID;
+        err = identifyDevice(&step->device);
+        if (err != CL_SUCCESS) {
+            freeSolutionSeq(seq);
+            return err;
+        }
+
+        totalCUs += deviceComputeUnits(devID, &err);
+        listAddToTail(seq, &step->node);
+    }
+    if (totalCUs == 0) {
+        /*
+         * Steps may already be queued if their devices reported no
+         * compute units; release them like every other failure path does.
+         */
+        freeSolutionSeq(seq);
+        return (numDevicesWithoutDoubles == numCommandQueues) ?
+                    CL_INVALID_DEVICE : CL_INVALID_COMMAND_QUEUE;
+    }
+
+    memset(&extra, 0, sizeof(extra));
+    memset(bopts, 0, BUILD_OPTS_MAXLEN*sizeof(char));
+    extra.dtype = args->dtype;
+
+    /* Split task between multiple command queues */
+
+    if (funcID == CLBLAS_GEMM) {
+        rectDivision(funcID, args, seq, totalCUs);
+    }
+    else if ((funcID == CLBLAS_SYRK) || (funcID == CLBLAS_SYR2K)) {
+        triMatrixStripeDivision(funcID, args, seq, totalCUs);
+    }
+    else {
+        stripeDivision(funcID, args, seq, totalCUs);
+    }
+
+    /* Some steps can be decomposed into several sequential substeps */
+
+    parseEnvImplementation();
+
+    // Function level decomposition
+    for (i = listNodeFirst(seq); i != seq; i = i->next) {
+        step = container_of(i, node, SolutionStep);
+        if (step->cmdQueue == NULL) {
+            continue;
+        }
+
+        if (step->funcID == CLBLAS_GEMM) {
+            fixupGemmOffsets(&step->args, step->extraFlags, 0);
+            continue;
+        }
+
+        /* continue iterating from the node the decomposition returned */
+        i = decomposeProblemStep(step);
+    }
+
+    #ifdef DEBUG_2
+    printf("Finding a kernel for each step\n");
+    #endif
+
+    /* Find a kernel for each step */
+
+    for (i = listNodeFirst(seq); (i != seq) && (err == CL_SUCCESS);
+         i = i->next) {
+
+        DeviceIdent *ident;
+
+        step = container_of(i, node, SolutionStep);
+        if (step->cmdQueue == NULL) {
+            continue;
+        }
+
+        ident = &step->device.ident;
+
+        /*
+         * Set vendor dependent flags
+         *
+         * FIXME: thrown this kludge away when generator interface will
+         *        support passing ident info
+         */
+        if (ident->vendor == VENDOR_AMD) {
+            step->extraFlags |= (KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD);
+        }
+
+        if (!findBestPattern(step)) {
+            err = CL_OUT_OF_RESOURCES;
+            break;
+        }
+
+        #ifdef DEBUG_2
+        printf("Find best pattern finished\n");
+        #endif
+
+
+        pattern = &(clblasSolvers[step->funcID].memPatterns[step->patternID]);
+        firstDimIdx = 2 - pattern->nrLevels;
+        sid = makeSolverID(step->funcID, step->patternID);
+
+        /*
+         * The kernel key is built from the queue's device and context;
+         * do not go on with an unset key if either query fails.
+         */
+        err = getQueueDevice(step->cmdQueue, &key.device);
+        if (err != CL_SUCCESS) {
+            break;
+        }
+        err = getQueueContext(step->cmdQueue, &key.context);
+        if (err != CL_SUCCESS) {
+            break;
+        }
+
+        detectProblemTails(step);
+
+        extra.flags = step->extraFlags;
+        if (pattern->sops->fixupArgs) {
+            pattern->sops->fixupArgs(&step->args, &step->subdims[firstDimIdx],
+                                     &extra);
+        }
+        step->extraFlags = extra.flags;
+
+        key.nrDims = pattern->nrLevels;
+        memset(key.subdims, 0, sizeof(key.subdims));
+        memcpy(key.subdims, &step->subdims[firstDimIdx],
+               sizeof(SubproblemDim) * key.nrDims);
+
+        detectOffsets(step);
+
+        extra.flags = step->extraFlags;
+
+        /* preparative kernels are needed only for matrices kept in images */
+        need[CLBLAS_PREP_A_KERNEL] = isMatrixInImage(pattern, MATRIX_A);
+        need[CLBLAS_PREP_B_KERNEL] = isMatrixInImage(pattern, MATRIX_B);
+
+        /*
+         * Now, find and enqueue each kernel. Generate and build the kernel
+         * on the fly if this kernel is present neither in the cache
+         * nor in the storage
+         */
+        for (ktype = CLBLAS_COMPUTING_KERNEL;
+             ktype < MAX_CLBLAS_KERNELS_PER_STEP; ktype++) {
+            SubproblemDim prepDims[2];
+
+            if (!need[ktype]) {
+                continue;
+            }
+
+            extra.kernType = ktype;
+
+            err = selectVectorization(step, &extra);
+            if (err != CL_SUCCESS) {
+                break;
+            }
+
+            kernel = NULL;
+
+            //
+            // Now that the build options is a part of EXTRA structure,
+            // it is also a part of the kernelKey
+            // Setting of build options need to be done before
+            // findKernel()
+            //
+            memset(bopts, 0, BUILD_OPTS_MAXLEN*sizeof(char));
+            setupBuildOpts(bopts, key.device, pattern);
+            if (pattern->sops->setBuildOptions)
+            {
+                pattern->sops->setBuildOptions(bopts, (void*)(step));
+            }
+            memcpy(extra.buildOptions, bopts, BUILD_OPTS_MAXLEN);
+
+            if (areKernelsCacheable()) {
+                kernel = findKernel(clblasKernelCache, sid, &key, &extra);
+            }
+            if (kernel == NULL) {
+                /* binaries are fetched from the storage at most once */
+                if (!loadData && !avoidLoadFromStorage(step)) {
+                    size_t MNK = (step->args.M + step->args.N + step->args.K) / 3;
+                    loadData = !getKernelInfo(&step->device, pattern->name,
+                        extra.dtype, step->extraFlags, (int)MNK, &buffer[0],
+                        &sizeBuffer[0]);
+                }
+                if (buffer[ktype] != NULL){
+                    kernel = loadKernel((const unsigned char**)&buffer[ktype],
+                                        sizeBuffer[ktype], &key, &extra, &err);
+                }
+                else {
+                    SubproblemDim *dims;
+
+                    dims = (ktype == CLBLAS_COMPUTING_KERNEL) ? step->subdims :
+                                                                prepDims;
+
+                    #ifdef DEBUG_2
+                    printf("Build options used : %s\n", bopts);
+                    #endif
+
+                    kernel = makeKernel(key.device, key.context,
+                                        pattern->sops->genKernel,
+                                        &dims[firstDimIdx], &step->pgran,
+                                        &extra, bopts, &err);
+                }
+
+                if (kernel == NULL) {
+                    break;
+                }
+
+                if (areKernelsCacheable()) {
+                    getKernel(kernel);
+                    if (addKernelToCache(clblasKernelCache, sid, kernel, &key,
+                                         clblasKernelExtraCmp)) {
+                        putKernel(clblasKernelCache, kernel);
+                    }
+                }
+            } else {
+                #ifdef DEBUG_CONTEXT
+                printf("KERNEL FOUND IN CACHE\n");
+                #endif
+            }
+            step->kernels[ktype] = kernel;
+        }
+    }
+
+    if (err != CL_SUCCESS) {
+        freeSolutionSeq(seq);
+    }
+
+    // free binary kernels
+    for (ik = 0; ik < MAX_CLBLAS_KERNELS_PER_STEP; ++ik) {
+        free(buffer[ik]);
+    }
+    return err;
+}
+
+/*
+ * Return the maximum number of images a step on the given queue may use:
+ * 2 when the queue's device reports image support and the queue is not
+ * in out-of-order execution mode, 0 otherwise (including the case when the
+ * queue's device cannot be obtained).
+ */
+static cl_uint
+getQueueMaxImages(cl_command_queue queue)
+{
+    cl_device_id device;
+    cl_bool imageSupport = CL_FALSE;
+    cl_command_queue_properties props = 0;
+
+    if (getQueueDevice(queue, &device) != CL_SUCCESS) {
+        return 0;
+    }
+
+    /* on failure 'imageSupport' keeps its CL_FALSE default */
+    (void)clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
+                          sizeof(imageSupport), &imageSupport, NULL);
+    if (!imageSupport) {
+        return 0;
+    }
+
+    /* on failure 'props' keeps its zero default */
+    (void)getQueueProperties(queue, &props);
+
+    return (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) ? 0 : 2;
+}
+
+/*
+ * Tell whether the 'transB' argument is meaningful for the function:
+ * true for GEMM, GEMM2 and GEMM_TAIL, false for everything else.
+ */
+static bool
+isTransBUsed(BlasFunctionID funcID)
+{
+    return (funcID == CLBLAS_GEMM) ||
+           (funcID == CLBLAS_GEMM2) ||
+           (funcID == CLBLAS_GEMM_TAIL);
+}
+
+/*
+ * Translate BLAS call arguments into kernel extra flags.
+ *
+ * Covers transposition and conjugation of A (and of B for functions using
+ * 'transB'), column-major order, zero beta (except for TRMM/TRSM),
+ * upper triangle / right side / unit diagonal (except for GEMM), and unit
+ * vector increments for GEMV and SYMV.
+ */
+KernelExtraFlags
+clblasArgsToKextraFlags(const CLBlasKargs *args, BlasFunctionID funcID)
+{
+    KernelExtraFlags result = KEXTRA_NO_FLAGS;
+    const bool transBUsed = isTransBUsed(funcID);
+    const bool complexData = isComplexType(args->dtype);
+
+    /* transposition and conjugation */
+    if (args->transA != clblasNoTrans) {
+        result |= KEXTRA_TRANS_A;
+    }
+    if (transBUsed && (args->transB != clblasNoTrans)) {
+        result |= KEXTRA_TRANS_B;
+    }
+    if (complexData && (args->transA == clblasConjTrans)) {
+        result |= KEXTRA_CONJUGATE_A;
+    }
+    if (complexData && transBUsed && (args->transB == clblasConjTrans)) {
+        result |= KEXTRA_CONJUGATE_B;
+    }
+
+    /* storage order */
+    if (args->order == clblasColumnMajor) {
+        result |= KEXTRA_COLUMN_MAJOR;
+    }
+
+    /* beta is compared bytewise against an all-zero multiplier */
+    if (!((funcID == CLBLAS_TRMM) || (funcID == CLBLAS_TRSM))) {
+        ArgMultiplier zeroBeta;
+
+        memset(&zeroBeta, 0, sizeof(zeroBeta));
+        if (memcmp(&args->beta, &zeroBeta, sizeof(zeroBeta)) == 0) {
+            result |= KEXTRA_BETA_ZERO;
+        }
+    }
+
+    /* triangle, side and diagonal attributes */
+    if (funcID != CLBLAS_GEMM) {
+        if (args->uplo == clblasUpper) {
+            result |= KEXTRA_UPPER_TRIANG;
+        }
+        if (args->side == clblasRight) {
+            result |= KEXTRA_SIDE_RIGHT;
+        }
+        if (args->diag == clblasUnit) {
+            result |= KEXTRA_UNIT_DIAGONAL;
+        }
+    }
+
+    /* unit increments of the X and Y vectors */
+    if ((funcID == CLBLAS_GEMV) || (funcID == CLBLAS_SYMV)) {
+        if (args->ldb.vector == 1) {
+            result |= KEXTRA_INCX_ONE;
+        }
+        if (args->ldc.vector == 1) {
+            result |= KEXTRA_INCY_ONE;
+        }
+    }
+
+    return result;
+}
+
+/*
+ * Check whether the memory level set a pattern declares for a matrix
+ * intersects the given mask.
+ *
+ * pattern  memory pattern whose extra info is inspected
+ * mrole    matrix to check; only MATRIX_A and MATRIX_B carry a level set
+ * mask     memory levels of interest
+ *
+ * Returns true if at least one level of 'mask' is set for the matrix.
+ * Returns false for MATRIX_C, for any other role without a level set,
+ * and for patterns without extra info.
+ */
+static bool
+checkMatrixMemLevelSet(
+    MemoryPattern *pattern,
+    MatrixRole mrole,
+    meml_set_t mask)
+{
+    const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra;
+    meml_set_t mset;
+
+    if (mrole == MATRIX_C || extra == NULL) {
+        return false;
+    }
+
+    switch (mrole) {
+    case MATRIX_A:
+        mset = extra->aMset;
+        break;
+    case MATRIX_B:
+        mset = extra->bMset;
+        break;
+    default:
+        /* no level set is known for this role; never read 'mset' unset */
+        return false;
+    }
+
+    return ((mset & mask) != 0);
+}
+
+/* The next three functions, stripeDivision(), rectDivision() and
+ * triMatrixStripeDivision(), split the output matrix into a set of
+ * non-intersecting rectangles. The area of each rectangle depends on the
+ * number of compute units available on the device of the given queue.
+ * The division is also aligned on the DIVISION_ALIGNMENT boundary, which is
+ * measured in number of elements.
+ */
+
+/* Alignment, in elements, applied to the sizes produced when the problem
+ * is split between steps. It must be a power of 2 because align() relies
+ * on bit masking.
+ *
+ * This constant is used in:
+ *     - stripeDivision()
+ *     - rectDivision()
+ *     - triMatrixStripeDivision()
+ *     - decomposeTRXMStep()
+ */
+static const size_t DIVISION_ALIGNMENT = 128;
+
+/*
+ * Round 'value' to the nearest multiple of 'alignment'; a value exactly
+ * halfway between two multiples is rounded up. Values below half of the
+ * alignment therefore become 0.
+ *
+ * 'alignment' is assumed to be a power of 2.
+ */
+static size_t
+align(
+    size_t value,
+    size_t alignment)
+{
+    const size_t half = alignment >> 1;
+    const size_t mask = ~(alignment - 1);
+
+    return (value + half) & mask;
+}
+
+/* Stripe division is done according to the picture:
+ *
+ *      +------+--+----+--+
+ *      |      |  |    |  |
+ *      |      |  |    |  |
+ *      |  1   | 2|  3 | 4|
+ *      |      |  |    |  |
+ *      |      |  |    |  |
+ *      +------+--+----+--+
+ *
+ * Every step receives a stripe proportional to the share its device's
+ * compute units have in the compute units still to be served, aligned to
+ * DIVISION_ALIGNMENT. GEMV is split along Y, all other functions along X.
+ * Steps that end up with an empty stripe, or that come after the compute
+ * unit budget is exhausted, get their command queue reset to NULL.
+ */
+static void
+stripeDivision(
+    BlasFunctionID funcID,
+    const CLBlasKargs *args,
+    ListHead *seq,
+    cl_uint totalCUs)
+{
+    const bool alongY = (funcID == CLBLAS_GEMV);
+    SubproblemDim remaining, offset, portion;
+    ListNode *node;
+    bool isFirst = true;
+
+    kargsToProbDims(&offset, funcID, args, true);
+    kargsToProbDims(&remaining, funcID, args, false);
+
+    for (node = listNodeFirst(seq); node != seq; node = node->next) {
+        SolutionStep *step = container_of(node, node, SolutionStep);
+        cl_device_id device;
+        cl_int err;
+        cl_uint nrCU;
+        size_t left, chunk;
+
+        err = getQueueDevice(step->cmdQueue, &device);
+        nrCU = deviceComputeUnits(device, &err);
+
+        /* nothing is left for this step */
+        if (totalCUs == 0) {
+            step->cmdQueue = NULL;
+            continue;
+        }
+
+        portion = remaining;
+        /* the first step keeps the offsets it was created with */
+        if (!isFirst) {
+            probDimsToKargs(&(step->args), funcID, &offset, true);
+        }
+
+        /* by default the step takes everything that is left */
+        left = alongY ? remaining.y : remaining.x;
+        chunk = left;
+
+        if (totalCUs != nrCU) {
+            chunk = (size_t)(left * (double)nrCU / totalCUs + 0.5);
+            chunk = align(chunk, DIVISION_ALIGNMENT);
+            if (chunk == 0) {
+                step->cmdQueue = NULL;
+            }
+            else if (chunk > left) {
+                /* this step consumes the rest of the problem */
+                chunk = left;
+                totalCUs = nrCU;
+            }
+        }
+
+        if (alongY) {
+            portion.y = chunk;
+            offset.y += chunk;
+            remaining.y -= chunk;
+        }
+        else {
+            portion.x = chunk;
+            offset.x += chunk;
+            remaining.x -= chunk;
+        }
+
+        totalCUs -= nrCU;
+        probDimsToKargs(&(step->args), funcID, &portion, false);
+        isFirst = false;
+    }
+}
+
+/* Rectangular division is done according to the picture:
+ *
+ *      +------+-----+
+ *      |      |  2  |
+ *      |      |     |
+ *      |  1   +--+--+
+ *      |      |3 | 4|
+ *      |      |  |  |
+ *      +------+--+--+
+ *
+ * The longest side is divided first.
+ *
+ * Steps are served in descending order of their devices' compute unit
+ * count. Each step receives a rectangle proportional to the share of its
+ * compute units in the compute units still to be served, aligned to
+ * DIVISION_ALIGNMENT. Steps that end up with an empty rectangle, or that
+ * come after the compute unit budget is exhausted, get their command queue
+ * reset to NULL.
+ *
+ * If the temporary array for sorting cannot be allocated, the whole
+ * problem is assigned to the first step and all the others are disabled.
+ */
+static void
+rectDivision(
+     BlasFunctionID funcID,
+     const CLBlasKargs *args,
+     ListHead *seq,
+     cl_uint totalCUs)
+{
+     SolutionStep *step, **sortedSteps;
+     ListNode *i;
+     cl_int err;
+     cl_device_id device;
+     cl_uint nrCU, k, l;
+     SubproblemDim size, offset, stepSize;
+
+     sortedSteps = calloc(listLength(seq), sizeof(*sortedSteps));
+     if (sortedSteps == NULL) {
+         /* No memory (or an empty list): do not split the problem */
+         kargsToProbDims(&size, funcID, args, false);
+         for (i = listNodeFirst(seq); i != seq; i = i->next) {
+             step = container_of(i, node, SolutionStep);
+             if (i == listNodeFirst(seq)) {
+                 probDimsToKargs(&(step->args), funcID, &size, false);
+             }
+             else {
+                 step->cmdQueue = NULL;
+             }
+         }
+         return;
+     }
+
+     /* 1. Sort steps according to the number of CU they have */
+     /* NOTE: We expect small number of steps, so simple insertion sort
+      *       would be enough. Steps with equal CU count keep list order.
+      */
+
+     k = 0;
+     for (i = listNodeFirst(seq); i != seq; i = i->next) {
+         step = container_of(i, node, SolutionStep);
+         err = getQueueDevice(step->cmdQueue, &device);
+         nrCU = deviceComputeUnits(device, &err);
+
+         /* shift the entries having fewer CUs one slot to the right */
+         l = k;
+         while (l > 0) {
+             SolutionStep *prev = sortedSteps[l - 1];
+
+             err = getQueueDevice(prev->cmdQueue, &device);
+             if (deviceComputeUnits(device, &err) >= nrCU) {
+                 break;
+             }
+             sortedSteps[l] = prev;
+             l--;
+         }
+         sortedSteps[l] = step;
+
+         k++;
+     }
+
+     /* 2. Calculate rectangle sizes */
+
+     kargsToProbDims(&offset, funcID, args, true);
+     kargsToProbDims(&size, funcID, args, false);
+     stepSize = size;
+
+     for (l = 0; l < k; l++) {
+         step = sortedSteps[l];
+         err = getQueueDevice(step->cmdQueue, &device);
+         nrCU = deviceComputeUnits(device, &err);
+
+         if (totalCUs == 0) {
+             step->cmdQueue = NULL;
+             continue;
+         }
+
+         stepSize = size;
+         /* the first served step keeps the offsets it was created with */
+         if (l) {
+             probDimsToKargs(&(step->args), funcID, &offset, true);
+         }
+
+         if (size.y > size.x) {
+             if (totalCUs != nrCU) {
+                 stepSize.y = (size_t)(size.y * (double)nrCU / totalCUs + 0.5);
+                 stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT);
+                 if (stepSize.y > size.y) {
+                     /* this step consumes the rest of the problem */
+                     stepSize.y = size.y;
+                     totalCUs = nrCU;
+                 }
+                 else if (stepSize.y == 0) {
+                     step->cmdQueue = NULL;
+                 }
+             }
+             size.y -= stepSize.y;
+             offset.y += stepSize.y;
+         }
+         else {
+             if (totalCUs != nrCU) {
+                 stepSize.x = (size_t)(size.x * (double)nrCU / totalCUs + 0.5);
+                 stepSize.x = align(stepSize.x, DIVISION_ALIGNMENT);
+                 if (stepSize.x > size.x) {
+                     /* this step consumes the rest of the problem */
+                     stepSize.x = size.x;
+                     totalCUs = nrCU;
+                 }
+                 else if (stepSize.x == 0) {
+                     step->cmdQueue = NULL;
+                 }
+             }
+             size.x -= stepSize.x;
+             offset.x += stepSize.x;
+         }
+
+         probDimsToKargs(&(step->args), funcID, &stepSize, false);
+
+         #ifdef DEBUG_2
+         printf("RectDivision:\n");
+         printf("\t offM=%d, offN=%d, M=%d, N=%d\n", step->args.offsetM, step->args.offsetN, step->args.M, step->args.N);
+         #endif
+         totalCUs -= nrCU;
+     }
+
+     free(sortedSteps);
+}
+
+/* Dividing triangular matrix (N x N) horizontally:
+ *
+ *      +----+
+ *      |\   |
+ *      +-\--+
+ *      |  \ |
+ *      |   \|
+ *      +----+
+ *
+ * Take into consideration the areas of triangles/trapezoids rather than
+ * areas of stripes.
+ *
+ * Each step in 'seq' receives a horizontal stripe whose height is derived
+ * from the share of its device's compute units in 'totalCUs' and aligned
+ * to DIVISION_ALIGNMENT. For UPLO = clblasLower stripes are laid out from
+ * the top down, for clblasUpper from the bottom up. Steps reached after the
+ * compute unit budget is exhausted get their command queue reset to NULL.
+ */
+static void
+triMatrixStripeDivision(
+    BlasFunctionID funcID,
+    const CLBlasKargs *args,
+    ListHead *seq,
+    cl_uint totalCUs)
+{
+    SolutionStep *step;
+    ListNode *i;
+    cl_int err;
+    cl_device_id device;
+    cl_uint nrCU;
+    SubproblemDim size, offset, stepSize, stepOffset;
+    /* number of rows already handed out to previous steps */
+    size_t top;
+
+    kargsToProbDims(&offset, funcID, args, true);
+    kargsToProbDims(&size, funcID, args, false);
+    top = 0;
+
+    /* for the upper triangle start just past the last row and walk back */
+    if (args->uplo == clblasUpper) {
+        offset.y += size.y;
+    }
+    stepSize = size;
+
+    for (i = listNodeFirst(seq); i != seq; i = i->next) {
+        step = container_of(i, node, SolutionStep);
+        err = getQueueDevice(step->cmdQueue, &device);
+        nrCU = deviceComputeUnits(device, &err);
+
+        /* nothing is left for this step */
+        if (totalCUs == 0) {
+            step->cmdQueue = NULL;
+            continue;
+        }
+
+        /* lower triangle: the stripe begins at the current offset */
+        if (args->uplo == clblasLower) {
+            stepOffset = offset;
+        }
+
+        if (totalCUs != nrCU) {
+            /* stripe height computed from the CU share, the rows already
+             * assigned ('top') and the remaining size */
+            stepSize.y = (size_t)(
+                sqrt(top * top + (double)nrCU / totalCUs * size.y * (top + size.x)) - top);
+            stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT);
+            /* an empty or oversized stripe: take all the remaining rows */
+            if ((stepSize.y == 0) || (stepSize.y > size.y)) {
+                stepSize.y = size.y;
+                totalCUs = nrCU;
+            }
+            /* NOTE(review): this branch is unreachable, stepSize.y == 0 is
+             * already handled by the condition above */
+            else if (stepSize.y == 0) {
+                step->cmdQueue = NULL;
+            }
+            /* We have to add special check because the direction of
+             * splitting is 'bottom -> top' for UPLO = clblasUpper.
+             */
+            else if (offset.y != align(offset.y, DIVISION_ALIGNMENT)) {
+                size_t o = align(offset.y - stepSize.y, DIVISION_ALIGNMENT);
+                if (o > offset.y) {
+                    o -= 2 * DIVISION_ALIGNMENT;
+                }
+                stepSize.y = offset.y - o;
+            }
+        }
+        else {
+            /* this device owns all the remaining CUs: take all the rest */
+            stepSize.y = size.y;
+        }
+
+        size.y -= stepSize.y;
+        top += stepSize.y;
+        if (args->uplo == clblasLower) {
+            offset.y += stepSize.y;
+        }
+        else {
+            /* upper triangle: the stripe begins where the walk back ends */
+            offset.y -= stepSize.y;
+            stepOffset = offset;
+        }
+
+        probDimsToKargs(&(step->args), funcID, &stepOffset, true);
+        probDimsToKargs(&(step->args), funcID, &stepSize, false);
+
+        totalCUs -= nrCU;
+    }
+}
+
+/*
+ * Select a memory pattern for the step and acquire the resources it needs.
+ *
+ * Selection starts with the maximum number of images the step's queue
+ * allows and is retried with one image fewer each time either no pattern
+ * can be selected or the step's resources cannot be obtained.
+ *
+ * Returns CL_TRUE when a pattern was selected and its resources were
+ * obtained, CL_FALSE when all attempts failed.
+ */
+static cl_bool
+findBestPattern(SolutionStep *step)
+{
+    cl_uint maxImages;
+    MemoryPattern *mempat;
+
+    maxImages = getQueueMaxImages(step->cmdQueue);
+
+    do {
+        /* It may be non first attempt. Ensure that there are not
+         * hold images for this step
+         */
+        releaseStepImgs(step);
+
+        step->patternID = selectPattern( step, maxImages );
+
+        /*
+         * No pattern fits this number of images. Never index the pattern
+         * table with the invalid ID; retry with fewer images instead.
+         */
+        if (step->patternID == (unsigned int)-1) {
+            continue;
+        }
+
+        #ifdef DEBUG_2
+        printf("select Pattern Done\n");
+        #endif
+
+        getStepGranulation(step);
+        #ifdef DEBUG_2
+        printf("getStepGranulation done \n");
+        #endif
+
+        mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID];
+        /* keeps 'mempat' referenced if assertGranulation() expands to nothing */
+        (void)mempat;
+
+        assertGranulation(step->subdims, mempat->nrLevels,
+                          &step->pgran, mempat->thLevel);
+        if (getStepResources(step))
+            break;
+    } while (maxImages-- != 0);
+
+    /* the counter wraps to (cl_uint)-1 only when every attempt failed */
+    return (maxImages != (cl_uint)-1) ? CL_TRUE : CL_FALSE;
+}
+
+/*
+ * Recompute the step's KEXTRA_TAILS_* flags.
+ *
+ * A tail flag is raised for a dimension when the problem size along it is
+ * not a multiple of the respective subproblem size. The *_LOWER flags are
+ * evaluated against the second subproblem level when the pattern has more
+ * than one level, otherwise they mirror the top level flags.
+ */
+void
+detectProblemTails(SolutionStep *step)
+{
+    SubproblemDim probSize, probOffset;
+    SubproblemDim *tiles = step->subdims;
+    KernelExtraFlags tails = KEXTRA_NO_FLAGS;
+
+    kargsToProbDims(&probSize, step->funcID, &step->args, false);
+    kargsToProbDims(&probOffset, step->funcID, &step->args, true);
+
+    #ifdef DEBUG_2
+    printf("detectProblemTails: subdimy=%d, subdimx=%d, subdimBwidth=%d\n", tiles->y, tiles->x, tiles->bwidth);
+    #endif
+
+    /* top level */
+    if ((probSize.y % tiles[0].y) != 0) {
+        tails |= KEXTRA_TAILS_M;
+    }
+    if ((probSize.x % tiles[0].x) != 0) {
+        tails |= KEXTRA_TAILS_N;
+    }
+    if ((probSize.bwidth % tiles[0].bwidth) != 0) {
+        tails |= KEXTRA_TAILS_K;
+    }
+
+    /* lower level */
+    if (clblasSolvers[step->funcID].memPatterns[step->patternID].nrLevels > 1) {
+        if ((probSize.y % tiles[1].y) != 0) {
+            tails |= KEXTRA_TAILS_M_LOWER;
+        }
+        if ((probSize.x % tiles[1].x) != 0) {
+            tails |= KEXTRA_TAILS_N_LOWER;
+        }
+        if ((probSize.bwidth % tiles[1].bwidth) != 0) {
+            tails |= KEXTRA_TAILS_K_LOWER;
+        }
+    }
+    else {
+        /* single level pattern: lower flags follow the top level ones */
+        if (tails & KEXTRA_TAILS_M) {
+            tails |= KEXTRA_TAILS_M_LOWER;
+        }
+        if (tails & KEXTRA_TAILS_N) {
+            tails |= KEXTRA_TAILS_N_LOWER;
+        }
+        if (tails & KEXTRA_TAILS_K) {
+            tails |= KEXTRA_TAILS_K_LOWER;
+        }
+    }
+
+    /* drop the stale tail flags, then install the fresh ones */
+    step->extraFlags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K
+                          | KEXTRA_TAILS_M_LOWER
+                          | KEXTRA_TAILS_N_LOWER
+                          | KEXTRA_TAILS_K_LOWER);
+    step->extraFlags |= tails;
+}
+
+/*
+ * Add to the step's extra flags a *_NOT_ZERO flag for every non-zero
+ * offset among offsetM, offsetN, offA, offBX and offCY. Flags that are
+ * already set are left untouched.
+ */
+void
+detectOffsets(SolutionStep *step)
+{
+    const CLBlasKargs *kargs = &(step->args);
+
+    if (kargs->offsetM) {
+        step->extraFlags |= KEXTRA_STARTM_NOT_ZERO;
+    }
+    if (kargs->offsetN) {
+        step->extraFlags |= KEXTRA_STARTN_NOT_ZERO;
+    }
+    if (kargs->offA) {
+        step->extraFlags |= KEXTRA_A_OFF_NOT_ZERO;
+    }
+    if (kargs->offBX) {
+        step->extraFlags |= KEXTRA_BX_OFF_NOT_ZERO;
+    }
+    if (kargs->offCY) {
+        step->extraFlags |= KEXTRA_CY_OFF_NOT_ZERO;
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Pick a memory pattern for the function by scoring how matrices A and B
+ * are cached.
+ *
+ * Every pattern gets, per matrix:
+ *   0 - matrix is not cached
+ *   2 - matrix is cached and stored in an image
+ *   3 - matrix is cached and not stored in an image
+ *
+ * Patterns needing more images than 'maxImages' are skipped. The first
+ * pattern with the highest total score wins.
+ *
+ * Returns the index of the selected pattern, or (unsigned int)-1 if no
+ * pattern qualifies.
+ */
+static unsigned int
+legacySelectPattern(
+    BlasFunctionID funcID,
+    unsigned int maxImages)
+{
+    MemoryPattern *patterns = clblasSolvers[funcID].memPatterns;
+    unsigned int best = -1;
+    int bestScore = -1;
+    unsigned int idx;
+
+    for (idx = 0; idx < clblasSolvers[funcID].nrPatterns; idx++) {
+        MemoryPattern *candidate = &patterns[idx];
+        unsigned int nrImages = 0;
+        int total = 0;
+        MatrixRole role;
+
+        for (role = MATRIX_A; role <= MATRIX_B; role++) {
+            if (!isMatrixCached(candidate, role)) {
+                continue;
+            }
+            if (isMatrixInImage(candidate, role)) {
+                nrImages++;
+                total += 2;
+            }
+            else {
+                total += 3;
+            }
+        }
+
+        /* only strictly better scores replace the current choice */
+        if ((nrImages <= maxImages) && (total > bestScore)) {
+            bestScore = total;
+            best = idx;
+        }
+    }
+
+    return best;
+}
+//-----------------------------------------------------------------------------
+
+/*
+ * Select the memory pattern to use for a step.
+ *
+ * If the solver has a default pattern (anything but -1), it is returned
+ * right away. Otherwise every pattern is asked for a performance estimate
+ * and the first one with the highest estimate is selected. As soon as a
+ * pattern without an estimation function is met, the selection is delegated
+ * to legacySelectPattern().
+ *
+ * Returns the pattern index; -1 converted to unsigned int if nothing was
+ * selected.
+ */
+unsigned int
+selectPattern( SolutionStep* pStep,
+    unsigned int maxImages )
+{
+    int funcID = pStep->funcID;
+    unsigned int kflags = pStep->extraFlags;
+    int bestPattern = -1;
+    int bestPerf = -1;
+    unsigned int idx;
+
+    if (clblasSolvers[funcID].defaultPattern != -1) {
+        return clblasSolvers[funcID].defaultPattern;
+    }
+
+    // select best-performing pattern for current case
+    for (idx = 0; idx < clblasSolvers[funcID].nrPatterns; idx++) {
+        int estimate;
+
+        // if not all patterns provide performance estimation functions
+        // use legacy pattern selection
+        if (clblasSolvers[funcID].memPatterns[idx].sops->getPatternPerf == NULL) {
+            return legacySelectPattern( funcID, maxImages );
+        }
+
+        estimate = clblasSolvers[funcID].memPatterns[idx].sops->getPatternPerf(
+            kflags,
+            (void*)&pStep->args);
+
+        if (estimate > bestPerf) {
+            bestPattern = idx;
+            bestPerf = estimate;
+        }
+    }
+
+    return bestPattern;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Check whether the second level tile of the step's decomposition exceeds
+ * the problem size along X, Y or the block width.
+ *
+ * Only GEMV, SYMV, GEMM, TRMM, TRSM, SYRK and SYR2K steps with a two-level
+ * pattern are examined; the original note says kernels of the other
+ * functions appear to handle this case themselves. For everything else
+ * false is returned.
+ */
+bool
+dimensionsExceedProblemSize(SolutionStep *step) {
+    const BlasFunctionID funcID = step->funcID;
+    const MemoryPattern *mempat =
+            &clblasSolvers[funcID].memPatterns[step->patternID];
+    const SubproblemDim *tile;
+    SubproblemDim problem;
+
+    switch (funcID) {
+    case CLBLAS_GEMV:
+    case CLBLAS_SYMV:
+    case CLBLAS_GEMM:
+    case CLBLAS_TRMM:
+    case CLBLAS_TRSM:
+    case CLBLAS_SYRK:
+    case CLBLAS_SYR2K:
+        break;
+    default:
+        return false;
+    }
+
+    kargsToProbDims(&problem, step->funcID, &step->args, false);
+
+    if (mempat->nrLevels != 2) {
+        return false;
+    }
+
+    tile = &step->subdims[1];
+
+    return (tile->x > problem.x) ||
+           (tile->y > problem.y) ||
+           (tile->bwidth > problem.bwidth);
+}
+
+/*
+ * Shrink the step's decomposition along every dimension in which the
+ * second level exceeds the problem size.
+ *
+ * Along X and Y the second level item size is reset to 1 and the sizes
+ * depending on it (the second level size, the first level item size and
+ * the first level size) are divided by its former value. Along the block
+ * width the first level is divided by the second level value, which is then
+ * reset to 1.
+ *
+ * Nothing is done for a NULL step or for GEMM2.
+ */
+void
+getMinimalStepGranulation(SolutionStep *step)
+{
+    SubproblemDim problem;
+    SubproblemDim *levels;
+
+    // EINVAL
+    if (step == NULL) {
+        return;
+    }
+    if (step->funcID == CLBLAS_GEMM2) {
+        return;
+    }
+
+    kargsToProbDims(&problem, step->funcID, &step->args, false);
+    levels = step->subdims;
+
+    // All exceeding dimensions are set to 1
+
+    if (levels[1].itemX > problem.x) {
+        const size_t shrink = levels[1].itemX;
+
+        levels[1].itemX = 1;
+        levels[1].x /= shrink;
+        levels[0].itemX /= shrink;
+        levels[0].x /= shrink;
+    }
+
+    if (levels[1].itemY > problem.y) {
+        const size_t shrink = levels[1].itemY;
+
+        levels[1].itemY = 1;
+        levels[1].y /= shrink;
+        levels[0].itemY /= shrink;
+        levels[0].y /= shrink;
+    }
+
+    if (levels[1].bwidth > problem.bwidth) {
+        levels[0].bwidth /= levels[1].bwidth;
+        levels[1].bwidth = 1;
+    }
+}
+
+/*
+ * Fill in the step's decomposition (subproblem dimensions and parallelism
+ * granularity).
+ *
+ * Unless loading from the storage is to be avoided for the step, the
+ * decomposition is first looked up in the storage. If nothing was loaded,
+ * the solver's default decomposition is used, or the generic default when
+ * the solver does not provide one. Finally the decomposition is shrunk if
+ * it exceeds the problem size.
+ */
+void
+getStepGranulation(SolutionStep *step)
+{
+    MemoryPattern *mempat =
+            &clblasSolvers[step->funcID].memPatterns[step->patternID];
+    SubproblemDim *subdims = step->subdims;
+    cl_device_id devID;
+    double elapsed;
+    int loadStatus = GF_ERROR;
+    size_t avgDim;
+
+    #ifdef DEBUG_2
+    printf("getStepGranulation called........\n");
+    #endif
+
+    #ifdef DEBUG_2
+    printf("Got mempat structure.........0x%p\n", mempat);
+    #endif
+
+    #ifdef DEBUG_2
+    if ( mempat == NULL)
+    {
+        printf("mempat pointer is NULL...\n");
+    } else {
+        printf("mempat pointer is non-null..\n");
+        if (mempat->sops == NULL)
+            printf("sops is NULL\n");
+        else
+            if (mempat->sops->getFlags == NULL)
+                printf("getFlags() is NULL\n");
+        fflush(stdout);
+    }
+    #endif
+
+    getQueueDevice(step->cmdQueue, &devID);
+
+    #ifdef DEBUG_2
+    printf("QueueDevice done...\n");
+    #endif
+
+    // try to load decomposition info from the storage
+
+    /*
+     * FIXME: It's a workaround so that to avoid getting some decomposition
+     *         sizes leading to strange hang ups
+     */
+    if (!avoidLoadFromStorage(step)) {
+        #ifdef DEBUG_2
+        printf("!avoidLoadFromStorage...Inside if\n");
+        #endif
+
+        /* the storage is looked up by the average problem dimension */
+        avgDim = (step->args.M + step->args.N + step->args.K)/3;
+
+        if (mempat->sops->innerDecompositionAxis) {
+            // banks aligned size, in bytes, should be
+            // number of channels * bytes per channel
+            // here it is set to 8*256 = 2048 = 512 floats
+            const size_t banksAlignedSize = 8*256;
+            size_t leadDim;
+
+            leadDim = (mempat->sops->innerDecompositionAxis(&step->args) ==
+                       DECOMP_AXIS_X) ? step->args.ldb.matrix :
+                                        step->args.lda.matrix;
+
+            if ((leadDim * dtypeSize(step->args.dtype)) % banksAlignedSize == 0) {
+                //special bad case
+                avgDim = 0;
+            }
+        }
+
+        loadStatus = getGranularityInfo(&step->device, mempat->name,
+                                        step->args.dtype, step->extraFlags,
+                                        (int)avgDim, subdims, &step->pgran,
+                                        &elapsed);
+    }
+    #ifdef DEBUG_2
+    printf("isLoadFromStorage done..\n");
+    #endif
+
+    // Query solver for default granulation
+    if (loadStatus == GF_ERROR) {
+        // temporary mock, until all solvers return the required default
+        // problem granulation
+        // TODO: deprecate the getDefaultStepGranulation(step) function
+        if (mempat->sops->getDefaultDecomp != NULL) {
+            mempat->sops->getDefaultDecomp( &step->pgran,
+                step->subdims,
+                MAX_SUBDIMS,
+                (void*)&step->args);
+        }
+        else {
+            getDefaultStepGranulation(step);
+        }
+    }
+
+    if (dimensionsExceedProblemSize(step)) {
+        getMinimalStepGranulation(step);
+    }
+}
+/* Compute default step->subdims and step->pgran from hard-coded per-function heuristics, the device LDS size and the max work-group size. */
+void
+getDefaultStepGranulation(SolutionStep *step)
+{
+    unsigned int nrFloats;
+    MemoryPattern *mempat =
+            &clblasSolvers[step->funcID].memPatterns[step->patternID];
+    SubproblemDim *dims = step->subdims;
+    cl_ulong ldsSize;
+    size_t wgX, wgY;
+    bool square;
+    SDimComponent component = SDIM_BWIDTH;
+    DataType dtype = step->args.dtype;
+    size_t tsize = dtypeSize(dtype);
+    unsigned int i;
+    SolverFlags sflags;
+    unsigned int bcoeff;
+    bool bothCached, fixedBw = false;
+    cl_device_id devID;
+    PGranularity *pgran = &step->pgran;
+	size_t maxWorkGroupSize;
+	int vecLen;
+	size_t subdimyFactor = 1;
+	size_t subdimxFactor = 1;
+	// NOTE(review): vecLen is only written (GEMM_TAIL branch below) and never read in this function.
+	#ifdef DEBUG_2
+	printf("getDefaultStepGranualtion called...\n");
+	#endif
+    nrFloats = (unsigned int)(dtypeSize(dtype) / sizeof(cl_float));
+    square = ((mempat->sops->getFlags() & SF_TOP_INPUT_SQUARE_BLOCKS) != 0);
+    bothCached = isMatrixCached(mempat, MATRIX_A) &&
+                 isMatrixCached(mempat, MATRIX_B);
+    if (step->cmdQueue != NULL) {
+        getQueueDevice(step->cmdQueue, &devID);
+    }
+    else {
+        devID = step->device.id;
+    }
+    clGetDeviceInfo(devID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(ldsSize),
+                    &ldsSize, NULL);
+	clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+					sizeof(size_t), &maxWorkGroupSize, NULL);
+    // NOTE(review): both clGetDeviceInfo() results are ignored; ldsSize/maxWorkGroupSize have no initializer to fall back on.
+    /*
+     * Setup dimensions allowing to use more or less effectively the local
+     * memory or cache;
+     */
+
+    if (square) {
+        dims[0].x = (dtype == TYPE_COMPLEX_DOUBLE) ? 16 : 32;
+        /*
+         * FIXME: for now, we restrict ourselves with square blocks due
+         *        to compilation issues
+         */
+        dims[0].y = dims[0].x; //(dtype == TYPE_FLOAT) ? 32 : 16
+        dims[0].bwidth = dims[0].y;
+        bcoeff = nrFloats;
+        wgY = DEFAULT_BUFS_LSIZE_0;
+        wgX = DEFAULT_BUFS_LSIZE_1;
+	} else {
+        bcoeff = (dtype == TYPE_COMPLEX_DOUBLE) ? 2 : 1;
+
+        if (bothCached) {
+            wgY = DEFAULT_CACHED_BUFS_LSIZE_0;
+            wgX = DEFAULT_CACHED_BUFS_LSIZE_1;
+        }
+        else {
+            wgY = DEFAULT_BUFS_LSIZE_0;
+            wgX = DEFAULT_BUFS_LSIZE_1;
+        }
+
+		if (step->funcID == CLBLAS_GEMM2)
+		{
+			subdimyFactor = 2;
+			subdimxFactor = 1;
+            bcoeff = 4; // 16/bcoeff = 4 - That's the panel width we want
+		}
+
+   		if ((step->funcID == CLBLAS_TRMV) || (step->funcID == CLBLAS_HEMV))  {
+			if (maxWorkGroupSize >= 256)
+			{
+				wgX = 16;
+				wgY = 16;
+			} else if (maxWorkGroupSize >= 128)
+			{
+				wgX = 8;
+				wgY = 16;
+			} else {
+				//
+				// PENDING: What if maxWorkGroupSize < 64 ????
+				//
+				wgX = 8;
+				wgY = 8;
+			}
+		}
+
+
+        /*
+         * Set block sizes such so the work group would access the whole
+         * memory channel or not exceed cache associativity for the modern
+         * AMD GPU families.
+         *
+         * FIXME: throw the hardcoded constants away
+         */
+        if (isMatrixInImage(mempat, MATRIX_A) ||
+            isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_A)) {
+
+            dims[0].y = (64 * subdimyFactor) / nrFloats;
+            fixedBw = true;
+        }
+        else {
+            dims[0].y = (32 * subdimyFactor);
+        }
+
+        if (isMatrixInImage(mempat, MATRIX_B) ||
+            isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_B)) {
+
+            dims[0].x = (64 * subdimxFactor) / nrFloats;
+            fixedBw = true;
+        }
+        else {
+            dims[0].x = (32 * subdimxFactor);
+        }
+
+   		if (step->funcID == CLBLAS_GEMM2)  {
+			int count=0;
+
+            //
+			// NOTE:
+			// wgX and wgY setting for this function must be the same as
+			// CLBLAS_GEMM_TAIL below.
+			//
+			//vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
+            //
+            // PENDING: 16x16 works best on CYPRESS and 16x8 for Cayman
+            //
+			wgY = 8*subdimyFactor;
+			wgX = 8*subdimxFactor;
+			while((wgY * wgX) > maxWorkGroupSize)
+			{
+				if (count & 1)
+				{
+					wgY /= 2;
+					dims[0].y /= 2;
+			    } else {
+					wgX /= 2;
+					dims[0].x /= 2;
+			    }
+				count++;
+		    }
+		}
+
+   		if (step->funcID == CLBLAS_GEMM_TAIL)  {
+			//
+			// NOTE: wgY and wgX must be same as what is set for CLBLAS_GEMM2 above
+			//
+			vecLen = 1;
+
+				//
+				// PENDING: What if maxWorkGroupSize < 64 ????
+				//
+				wgY = 8;
+				wgX = 8;
+				dims[0].y = wgY ;
+				dims[0].x = wgX ;
+			}
+
+        if((step->funcID == CLBLAS_TRSV) || (step->funcID == CLBLAS_TRSV_GEMV))
+        {
+            wgY = 8;
+            wgX = 8;
+            dims[0].y = 64;
+            dims[0].x = 64;
+        }
+
+        dims[0].bwidth = 16 / bcoeff;
+    }
+
+    /*
+     * Prevent using more than 1/2 of LDS so as to have at least 2 work groups
+     * per compute unit
+     */
+    if (ldsSize && mempat->sops->isFitToLDS) {
+        ldsSize /= 2;
+
+        while (!mempat->sops->isFitToLDS(dims, dtype, ldsSize, &step->args)) {
+            /*
+             * decrease current component and setup this one to decrease
+             * on the next step; do not shrink the block width below the
+             * value with which the block line takes size of a float4 vector
+             */
+            if (square) {
+                dims[0].x /= 2;
+                dims[0].y /= 2;
+                dims[0].bwidth /= 2;
+            }
+            else {
+                switch (component) {
+                case SDIM_X:
+                    dims[0].x /= 2;
+                    if (dims[0].bwidth * tsize == sizeof(cl_float4)) {
+                        component = SDIM_Y;
+                    }
+                    else {
+                        component = SDIM_BWIDTH;
+                    }
+                    break;
+                case SDIM_Y:
+                    dims[0].y /= 2;
+                    component = SDIM_X;
+                    break;
+                case SDIM_BWIDTH:
+                    dims[0].bwidth /= 2;
+                    component = SDIM_Y;
+                    break;
+                }
+            }
+        }
+
+        assert(dims[0].x > 0 && dims[0].y > 0 &&
+               dims[0].bwidth * tsize >= sizeof(cl_float4));
+    }
+
+    /*
+     * adjust local size if a subproblem is not divisible
+     * between all local threads
+     */
+    for (; (wgY > 1) && (dims[0].y < wgY); wgY /= 2) { }
+    for (; (wgX > 1) && (dims[0].x < wgX); wgX /= 2) { }
+
+    sflags = mempat->sops->getFlags();
+    if (sflags & SF_WSPACE_2D) {
+        pgran->wgDim = 2;
+        dims[0].itemY = dims[0].y;
+        pgran->wgSize[0] = (unsigned int)wgY;
+        pgran->wgSize[1] = (unsigned int)wgX;
+    }
+    else {
+        pgran->wgDim = 1;
+        pgran->wgSize[0] = (unsigned int)(wgX * wgY);
+        pgran->wgSize[1] = 1;
+    }
+
+    /*
+     * Divide the work between threads
+     */
+    dims[1].itemX = dims[0].x / wgX;
+    dims[1].itemY = dims[0].y / wgY;
+    dims[1].x = dims[1].itemX;
+    dims[1].y = dims[1].itemY;
+
+    if ((mempat->nrLevels == 1) && square) {
+        dims[1].bwidth = dims[1].y;
+    }
+    else {
+        i = fixedBw ? 4 : (8 / nrFloats);
+        dims[1].bwidth = szmin(i, dims[0].bwidth);
+    }
+
+    dims[0].itemX = dims[0].x;
+    dims[0].itemY = dims[0].y;
+
+    /*
+     * FIXME: Now, there are issues with generating kernels with non square
+     *        tiles in LDS-less TRSM due to some fundamental restriction
+     *        of the core generator logic. Deprecate this kludge when
+     *        they will be eliminated
+     */
+#if 1
+    if ((step->funcID == CLBLAS_TRSM) && (step->patternID == 2)) {
+        dims[1].bwidth = dims[1].y;
+    }
+#endif
+    if (funcHasTriangMatrix(step->funcID) && (pgran->wgDim == 1)) {
+        dims[0].itemY = SUBDIM_UNUSED;
+        if (mempat->nrLevels == 1) {
+            dims[1].itemY = SUBDIM_UNUSED;
+        }
+    }
+
+    if (!(isLdsUsed(mempat) || (square && mempat->nrLevels == 2))) {
+        dims[0].bwidth = dims[1].bwidth;
+    }
+    /*
+     * Ensure decomposition size for vectors in case
+     * of level 2 routines equal to 1.
+     */
+    if (funcBlasLevel(step->funcID) == 2) {
+        size_t xBlocks;
+
+        xBlocks = dims[0].x / dims[1].x;
+        dims[0].x = 1;
+        dims[1].itemX = 1;
+        dims[1].x = 1;
+        dims[0].bwidth = dims[1].bwidth * xBlocks;
+    }
+
+    // fixup work group size in respect with desired work dispatch order
+    if ((pgran->wgDim == 2) && mempat->sops->innerDecompositionAxis) {
+        if (mempat->sops->innerDecompositionAxis(&step->args) ==
+            DECOMP_AXIS_X) {
+
+            unsigned int u;
+
+            u = pgran->wgSize[0];
+            pgran->wgSize[0] = pgran->wgSize[1];
+            pgran->wgSize[1] = u;
+        }
+    }
+    //printf("GDSG: suby = %lu, subx = %lu, bwidth0=%lu, bwidth1=%lu\n", dims[0].y, dims[0].x, dims[0].bwidth, dims[1].bwidth);
+}
+
+/* Returns true when getStepGranulation() must not load the decomposition from storage for this step. */
+static bool
+avoidLoadFromStorage(SolutionStep *step)
+{
+    const CLBlasKargs *kargs = &step->args;
+    MemoryPattern *pattern =
+            &clblasSolvers[step->funcID].memPatterns[step->patternID];
+
+    /* Patterns caching both A and B never skip the storage. */
+    if (isMatrixCached(pattern, MATRIX_A) &&
+        isMatrixCached(pattern, MATRIX_B)) {
+        return false;
+    }
+    /*
+     * FIXME: GEMM2 whose pigFuncID is SYMM or HEMM is assumed to need the
+     *        default decomposition sizes, so "true" (skip the storage).
+     */
+    if ((step->funcID == CLBLAS_GEMM2) &&
+        ((kargs->pigFuncID == CLBLAS_SYMM) ||
+         (kargs->pigFuncID == CLBLAS_HEMM))) {
+        return true;
+    }
+
+    /* Otherwise only GEMM with M, N or K not a multiple of 64 skips it. */
+    if (step->funcID != CLBLAS_GEMM) {
+        return false;
+    }
+    return (kargs->M % 64) || (kargs->N % 64) || (kargs->K % 64);
+}
+/* Get a scratch image into kargs->scimage[] for every role from MATRIX_A up to (excluding) MATRIX_C that the pattern keeps in an image; false if getSCImage() fails. */
+static bool
+getStepResources(SolutionStep *step)
+{
+    int i = 0;
+    size_t tsize;
+    unsigned int vecLen;
+    size_t minWidth, minHeight, bestHeight, minSize, bestSize;
+    MatrixRole mrole;
+    cl_device_id devID;
+    cl_context ctx;
+    MemoryPattern *mempat;
+    SubproblemDim probDim;
+    CLBlasKargs *kargs = &step->args;
+    bool ret = true;
+
+    tsize = dtypeSize(kargs->dtype);
+    vecLen = (unsigned int)(sizeof(cl_float4) / tsize); // dtype elements per cl_float4
+    kargsToProbDims(&probDim, step->funcID, &step->args, false);
+    getQueueContext(step->cmdQueue, &ctx);
+    getQueueDevice(step->cmdQueue, &devID);
+
+    mempat = &(clblasSolvers[step->funcID].memPatterns[step->patternID]);
+    // i indexes kargs->scimage[] and advances only for roles stored in an image
+    for (mrole = MATRIX_A, i = 0; mrole < MATRIX_C; mrole++) {
+        if (isMatrixInImage(mempat, mrole)) {
+            if (step->funcID == CLBLAS_TRSM) {
+                // blocks (TRSM path): sizes follow from the pack mode reported by the solver
+                unsigned int packRate;
+                clblasOrder packOrder;
+                size_t pitch;
+                size_t matrWidth, matrHeight;
+                CLBLASKernExtra extra;
+
+                memset(&extra, 0, sizeof(extra));
+                extra.dtype = kargs->dtype;
+                extra.flags = step->extraFlags;
+
+                mempat->sops->imgPackMode(&extra,
+                                          step->subdims, mrole,
+                                          &packRate, &packOrder);
+
+                // minimal size parameters
+                pitch = matrBlockPitch(step->subdims, mrole, kargs->dtype,
+                                        kargs->side);
+                matrWidth = matrBlockPitch(&probDim, mrole, kargs->dtype,
+                                           kargs->side);
+                matrHeight = matrBlockHeight(&probDim, mrole, kargs->side);
+
+                //One panel should fit to image
+                if (packOrder == clblasRowMajor) {
+                    minWidth = divRoundUp(matrWidth, pitch) * pitch / vecLen;
+                    minHeight = packRate;
+
+                    minSize = minWidth * minHeight;
+                    // size of image to store all blocks
+                    bestSize = minHeight * (minWidth + pitch / vecLen) *
+                               divRoundUp(matrHeight, packRate) / 2;
+                }
+                else {
+                    minWidth = pitch / vecLen;
+                    minHeight = divRoundUp(matrHeight, packRate) * packRate;
+
+                    minSize = minWidth * minHeight;
+                    bestSize = minWidth * (minHeight + packRate) *
+                               divRoundUp(matrWidth, pitch) / 2;
+                }
+                minSize = bestSize; // NOTE(review): overrides the minSize computed in either branch above
+            }
+            else {
+                // panels (every function other than TRSM)
+                getSuitableImageSizes(&minWidth, &minHeight, &bestHeight,
+                                      mrole, kargs, vecLen, step->subdims);
+                minSize = minWidth * minHeight;
+                bestSize = minWidth * bestHeight;
+            }
+
+            kargs->scimage[i] = getSCImage(ctx, devID, bestSize,
+                                           minSize, minWidth);
+            if (kargs->scimage[i] == NULL) {
+                ret = false; // NOTE(review): images already stored in scimage[] are not released here
+                break;
+            }
+
+            i++;
+        }
+    }
+
+    return ret;
+}
+
+/* Image sizes for matrix mrole: width in vectors of vecLen elements, minHeight one block, bestHeight the block-aligned extent. */
+static void
+getSuitableImageSizes(
+    size_t *minWidth,
+    size_t *minHeight,
+    size_t *bestHeight,
+    MatrixRole mrole,
+    CLBlasKargs *kargs,
+    unsigned int vecLen,
+    SubproblemDim *subdims)
+{
+    /* sizes padded to whole subproblem blocks (relies on divRoundUp() rounding up) */
+    size_t alignedM = divRoundUp(kargs->M, subdims->y) * subdims->y;
+    size_t alignedN = divRoundUp(kargs->N, subdims->x) * subdims->x;
+    size_t alignedK = divRoundUp(kargs->K, subdims->bwidth) * subdims->bwidth;
+    /* roles without a case below used to leave the outputs unset; report 0 */
+    *minWidth = *minHeight = *bestHeight = 0;
+    switch (mrole) {
+    case MATRIX_A:
+        *minWidth = alignedK / vecLen;
+        *bestHeight = alignedM;
+        *minHeight = subdims->y;
+        break;
+    case MATRIX_B:
+        *minWidth = alignedK / vecLen;
+        *bestHeight = alignedN;
+        *minHeight = subdims->x;
+        break;
+    case MATRIX_C:
+        *minWidth = alignedN / vecLen;
+        *bestHeight = alignedM;
+        *minHeight = subdims->y;
+        break;
+    default:
+        break;
+    }
+}
+
+/*
+ * TRxM -> TRxM + GEMM + TRxM
+ * Returns the node of the last new step, or the step's own node when it is left undecomposed.
+ * When talking about matrix A splitting the following numbering is used:
+ *
+ *     +---+---+
+ *     | 1 | 2 |
+ *     +---+---+
+ *     | 3 | 4 |
+ *     +---+---+
+ */
+static ListNode*
+decomposeTRXMStep(SolutionStep *step)
+{
+    CLBlasKargs *kargs = &(step->args);
+    SolutionStep *trxm1 = NULL, *gemm = NULL, *trxm2 = NULL, *tmp;
+    clblasUplo position;
+    SubproblemDim size, offset;
+    int swap;
+    cl_float f;
+    cl_double d;
+    clblasImplementation impl = clblasDefaultGemm;
+    size_t offsetK = 0;
+
+    // skip decomposition for a trmm case which works faster without it
+    if (step->funcID == CLBLAS_TRMM && !isDoubleBasedType(step->args.dtype) &&
+        isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_B)) {
+        return &(step->node);
+    }
+
+    /* Implementation specific checks */
+
+    if ((getGemmPreferredPattern() != clblasDefaultGemm) &&
+        (getGemmPreferredPattern() != clblasBlockGemmWithCaching)) {
+
+        return &(step->node);
+    }
+    if (step->funcID == CLBLAS_TRMM) {
+        impl = getTrmmPreferredPattern();
+        if ((impl != clblasDefaultTrmm) &&
+            (impl != clblasBlockTrmmWithCaching)) {
+
+            return &(step->node);
+        }
+    }
+    else {
+        impl = getTrsmPreferredPattern();
+        if ((impl != clblasDefaultTrsm) &&
+            (impl != clblasBlockTrsmWithCaching) &&
+            (impl != clblasBlockTrsmWithoutLds)) {
+
+            return &(step->node);
+        }
+    }
+    /* leave small problems alone: the size checked is M for clblasLeft and N for clblasRight */
+    if ((kargs->side == clblasLeft) &&
+        (kargs->M < DECOMPOSITION_THRESHOLD(step->args.dtype))) {
+        return &(step->node);
+    }
+    if ((kargs->side == clblasRight) &&
+        (kargs->N < DECOMPOSITION_THRESHOLD(step->args.dtype))) {
+        return &(step->node);
+    }
+    /* on allocation failure nothing is changed and the original step is kept */
+    trxm1 = calloc(1, sizeof(SolutionStep));
+    gemm = calloc(1, sizeof(SolutionStep));
+    trxm2 = calloc(1, sizeof(SolutionStep));
+    if ((trxm1 == NULL) || (gemm == NULL) || (trxm2 == NULL)) {
+        if (trxm1 != NULL) {
+            free(trxm1);
+        }
+        if (gemm != NULL) {
+            free(gemm);
+        }
+        if (trxm2 != NULL) {
+            free(trxm2);
+        }
+        return &(step->node);
+    }
+    memcpy(trxm1, step, sizeof(SolutionStep));
+    memcpy(gemm, step, sizeof(SolutionStep));
+    memcpy(trxm2, step, sizeof(SolutionStep));
+    /* all three are copies of the original step at this point; the GEMM copy is specialised first */
+    gemm->funcID = CLBLAS_GEMM;
+    gemm->args.C = kargs->B;
+    gemm->args.ldc.matrix = kargs->ldb.matrix;
+    gemm->args.offCY = kargs->offBX;
+    switch (kargs->dtype) { /* for TRSM a non-zero alpha becomes -1/alpha; beta is always set to 1 */
+    case TYPE_FLOAT:
+        if (step->funcID == CLBLAS_TRSM) {
+            if (gemm->args.alpha.argFloat != 0.0f) {
+                gemm->args.alpha.argFloat = -1 / gemm->args.alpha.argFloat;
+            }
+        }
+        gemm->args.beta.argFloat = 1.0f;
+        break;
+    case TYPE_DOUBLE:
+        if (step->funcID == CLBLAS_TRSM) {
+            if (gemm->args.alpha.argDouble != 0.0f) {
+                gemm->args.alpha.argDouble = -1 / gemm->args.alpha.argDouble;
+            }
+        }
+        gemm->args.beta.argDouble = 1.0f;
+        break;
+    case TYPE_COMPLEX_FLOAT:
+        if (step->funcID == CLBLAS_TRSM) {
+            f = CREAL(gemm->args.alpha.argFloatComplex) *
+                CREAL(gemm->args.alpha.argFloatComplex) +
+                CIMAG(gemm->args.alpha.argFloatComplex) *
+                CIMAG(gemm->args.alpha.argFloatComplex);
+            if (f != 0.0f) {
+                gemm->args.alpha.argFloatComplex = floatComplex(
+                    -CREAL(gemm->args.alpha.argFloatComplex) / f,
+                     CIMAG(gemm->args.alpha.argFloatComplex) / f);
+            }
+        }
+        gemm->args.beta.argFloatComplex = floatComplex(1.0f, 0.0f);
+        break;
+    case TYPE_COMPLEX_DOUBLE:
+        if (step->funcID == CLBLAS_TRSM) {
+            d = CREAL(gemm->args.alpha.argDoubleComplex) *
+                CREAL(gemm->args.alpha.argDoubleComplex) +
+                CIMAG(gemm->args.alpha.argDoubleComplex) *
+                CIMAG(gemm->args.alpha.argDoubleComplex);
+            if (d != 0.0f) {
+                gemm->args.alpha.argDoubleComplex = doubleComplex(
+                    -CREAL(gemm->args.alpha.argDoubleComplex) / d,
+                     CIMAG(gemm->args.alpha.argDoubleComplex) / d);
+            }
+        }
+        gemm->args.beta.argDoubleComplex = doubleComplex(1.0f, 0.0f);
+        break;
+    }
+
+    /* Actual position of matrix A's data to use */
+    if (kargs->transA == clblasNoTrans) {
+        position = kargs->uplo;
+    }
+    else {
+        position = (kargs->uplo == clblasUpper) ? clblasLower :
+                        clblasUpper;
+    }
+
+    /* Map trxm1 to A1 */
+    kargsToProbDims(&size, trxm1->funcID, &(trxm1->args), false);
+    size.y = align(size.y / 2, DIVISION_ALIGNMENT);
+    probDimsToKargs(&(trxm1->args), trxm1->funcID, &size, false);
+
+    /* Map trxm2 to A4 */
+    kargsToProbDims(&offset, trxm2->funcID, &(trxm2->args), true);
+    kargsToProbDims(&size, trxm2->funcID, &(trxm2->args), false);
+    offset.y += align(size.y / 2, DIVISION_ALIGNMENT);
+    size.y -= align(size.y / 2, DIVISION_ALIGNMENT);
+    probDimsToKargs(&(trxm2->args), trxm2->funcID, &offset, true);
+    probDimsToKargs(&(trxm2->args), trxm2->funcID, &size, false);
+
+    /* The GEMM operands and the quadrant it covers depend on the side and on 'position' */
+    if (kargs->side == clblasLeft) {
+        trxm1->args.K = trxm1->args.M;
+        trxm2->args.K = trxm2->args.M;
+
+        gemm->args.transB = clblasNoTrans;
+
+        if (position == clblasUpper) {
+            /* Map gemm to A2 */
+            kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
+            size.y = align(size.y / 2, DIVISION_ALIGNMENT);
+            probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
+            offsetK = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
+            gemm->args.K -= align(gemm->args.K / 2, DIVISION_ALIGNMENT);
+        }
+        else {
+            /* Map gemm to A3 */
+            kargsToProbDims(&offset, gemm->funcID, &(gemm->args), true);
+            kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
+            offset.y += align(size.y / 2, DIVISION_ALIGNMENT);
+            size.y -= align(size.y / 2, DIVISION_ALIGNMENT);
+            probDimsToKargs(&(gemm->args), gemm->funcID, &offset, true);
+            probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
+            gemm->args.K = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
+        }
+    }
+    else {
+        trxm1->args.K = trxm1->args.N;
+        trxm2->args.K = trxm2->args.N;
+
+        gemm->args.transA = clblasNoTrans;
+        gemm->args.A = kargs->B;
+        gemm->args.lda.matrix = kargs->ldb.matrix;
+        gemm->args.offA = kargs->offBX;
+        gemm->args.transB = kargs->transA;
+        gemm->args.B = kargs->A;
+        gemm->args.ldb.matrix = kargs->lda.matrix;
+        gemm->args.offBX = kargs->offA;
+
+        if (position == clblasUpper) {
+            /* Map gemm to A2 */
+            kargsToProbDims(&offset, gemm->funcID, &(gemm->args), true);
+            kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
+            offset.x += align(size.x / 2, DIVISION_ALIGNMENT);
+            size.x -= align(size.x / 2, DIVISION_ALIGNMENT);
+            probDimsToKargs(&(gemm->args), gemm->funcID, &offset, true);
+            probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
+            gemm->args.K = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
+        }
+        else {
+            /* Map gemm to A3 */
+            kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
+            size.x = align(size.x / 2, DIVISION_ALIGNMENT);
+            probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
+            offsetK = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
+            gemm->args.K -= align(gemm->args.K / 2, DIVISION_ALIGNMENT);
+        }
+    }
+
+    trxm1->extraFlags = clblasArgsToKextraFlags(&(trxm1->args), trxm1->funcID);
+    gemm->extraFlags = clblasArgsToKextraFlags(&(gemm->args), gemm->funcID);
+    trxm2->extraFlags = clblasArgsToKextraFlags(&(trxm2->args), trxm2->funcID);
+
+    fixupGemmOffsets(&gemm->args, gemm->extraFlags, offsetK);
+
+    /* Swap trxm1 and trxm2 if needed. */
+
+    swap = 0;
+    if (kargs->side == clblasLeft) {
+        if ((step->funcID == CLBLAS_TRMM) && (position == clblasLower)) {
+            swap = 1;
+        }
+        if ((step->funcID == CLBLAS_TRSM) && (position == clblasUpper)) {
+            swap = 1;
+        }
+    }
+    else {
+        if ((step->funcID == CLBLAS_TRMM) && (position == clblasUpper)) {
+            swap = 1;
+        }
+        if ((step->funcID == CLBLAS_TRSM) && (position == clblasLower)) {
+            swap = 1;
+        }
+    }
+    if (swap) {
+        tmp = trxm1;
+        trxm1 = trxm2;
+        trxm2 = tmp;
+    }
+    /* Tie the sequence trxm1 - gemm - trxm2 together. */
+
+    trxm1->event = decomposeEventsAlloc();
+    trxm1->node.next = &(gemm->node);
+
+    gemm->numEventsInWaitList = 1;
+    gemm->eventWaitList = trxm1->event;
+    gemm->event = decomposeEventsAlloc();
+    gemm->node.prev = &(trxm1->node);
+    gemm->node.next = &(trxm2->node);
+
+    trxm2->numEventsInWaitList = 1;
+    trxm2->eventWaitList = gemm->event;
+    trxm2->node.prev = &(gemm->node);
+
+    /* Insert new sequence instead of current step */
+    /* NOTE(review): step->node.prev and .next are dereferenced unchecked, so both neighbours must exist */
+    trxm1->node.prev = step->node.prev;
+    (trxm1->node.prev)->next = &(trxm1->node);
+    step->node.prev = NULL;
+
+    trxm2->node.next = step->node.next;
+    (trxm2->node.next)->prev = &(trxm2->node);
+    step->node.next = NULL;
+    /* the original step was unlinked above and is now released */
+    freeSolutionStep(&(step->node));
+
+    return &(trxm2->node);
+}
+
+/*
+ *  Split a SYRK step so that its diagonal part is evaluated by a
+ *  separate, additional step. This helps because the compiler allocates
+ *  a huge number of registers for code processing the diagonal.
+ */
+static ListNode*
+decomposeSYRKStep(SolutionStep *step)
+{
+    CLBlasKargs *args = &step->args;
+    SolutionStep *diag = NULL;
+    ListNode *follower;
+    size_t limit;
+
+    /*
+     * Tail prediction, assuming tile sizes will not exceed 8: when both
+     * M and N are multiples of 8 there are no tile-level tails, and the
+     * split would likely only slow things down since the diagonal update
+     * is optimized. Real tail detection happens after the math
+     * decomposition, which is why this kludge is needed.
+     */
+    if (!(args->M % 8) && !(args->N % 8)) {
+        return &(step->node);
+    }
+
+    /* Problems below half the decomposition threshold are left alone. */
+    limit = DECOMPOSITION_THRESHOLD(step->args.dtype);
+    if (args->M < limit / 2) {
+        return &(step->node);
+    }
+
+    /* Without memory for the extra step, keep the step undivided. */
+    diag = malloc(sizeof(*diag));
+    if (diag == NULL) {
+        return &(step->node);
+    }
+
+    /* The copy inherits SEPARATE_DIAGONAL and adds EVALUATE_DIAGONAL. */
+    step->extraFlags |= KEXTRA_SYRK_SEPARATE_DIAGONAL;
+    memcpy(diag, step, sizeof(*diag));
+    diag->extraFlags |= KEXTRA_SYRK_EVALUATE_DIAGONAL;
+    follower = step->node.next;
+
+    /*
+     * The new step takes over the original output event, so the link to
+     * the next step (or the user's output event, if this is the last
+     * step) stays at the tail of the chain; the original step gets a
+     * fresh event which the new step waits on.
+     */
+    diag->event = step->event;
+    step->event = decomposeEventsAlloc();
+    diag->numEventsInWaitList = 1;
+    diag->eventWaitList = step->event;
+
+    /* Link the new step right after the original one. */
+    diag->node.prev = &step->node;
+    diag->node.next = follower;
+    step->node.next = &diag->node;
+    follower->prev = &diag->node;
+    return &(diag->node);
+}
+/* Replace a large SYR2K step by two chained SYRK-style steps, then let decomposeSYRKStep() split each; returns the last resulting node. */
+static ListNode*
+decomposeSYR2KStep(SolutionStep *step)
+{
+    CLBlasKargs *kargs = &(step->args);
+    SolutionStep *syrk1 = NULL, *syrk2 = NULL;
+    size_t thresh;
+    ListNode *node;
+
+    /* SYR2K implementation is done as blocked with cache-usage optimization
+     * only. Therefore, no implementation specific checks.
+     */
+
+    thresh = DECOMPOSITION_THRESHOLD(step->args.dtype);
+    if (kargs->M < thresh / 2) {
+        return &(step->node);
+    }
+
+    syrk1 = calloc(1, sizeof(SolutionStep));
+    syrk2 = calloc(1, sizeof(SolutionStep));
+    if ((syrk1 == NULL) || (syrk2 == NULL)) {
+        if (syrk1 != NULL) {
+            free(syrk1);
+        }
+        if (syrk2 != NULL) {
+            free(syrk2);
+        }
+        return &(step->node);
+    }
+    memcpy(syrk1, step, sizeof(SolutionStep));
+    memcpy(syrk2, step, sizeof(SolutionStep));
+    /* syrk2 gets A and B swapped and beta forced to 1; syrk1 keeps the original operands */
+    syrk2->args.A = kargs->B;
+    syrk2->args.lda.matrix = kargs->ldb.matrix;
+    syrk2->args.offA = kargs->offBX;
+    syrk2->args.B = kargs->A;
+    syrk2->args.ldb.matrix = kargs->lda.matrix;
+    syrk2->args.offBX = kargs->offA;
+    switch (kargs->dtype) {
+    case TYPE_FLOAT:
+        syrk2->args.beta.argFloat = 1.0f;
+        break;
+    case TYPE_DOUBLE:
+        syrk2->args.beta.argDouble = 1.0f;
+        break;
+    case TYPE_COMPLEX_FLOAT:
+        syrk2->args.beta.argFloatComplex = floatComplex(1.0f, 0.0f);
+        break;
+    case TYPE_COMPLEX_DOUBLE:
+        syrk2->args.beta.argDoubleComplex = doubleComplex(1.0f, 0.0f);
+        break;
+    }
+    /* both halves recompute their kernel flags and drop KEXTRA_SYRK_2K_RANK */
+    syrk1->extraFlags = clblasArgsToKextraFlags(&(syrk1->args), syrk1->funcID);
+    syrk1->extraFlags &= ~KEXTRA_SYRK_2K_RANK;
+    syrk2->extraFlags = clblasArgsToKextraFlags(&(syrk2->args), syrk2->funcID);
+    syrk2->extraFlags &= ~KEXTRA_SYRK_2K_RANK;
+
+    /* Tie the sequence syrk1 - syrk2 together. */
+
+    syrk1->event = decomposeEventsAlloc();
+    syrk1->node.next = &(syrk2->node);
+
+    syrk2->numEventsInWaitList = 1;
+    syrk2->eventWaitList = syrk1->event;
+    syrk2->node.prev = &(syrk1->node);
+
+    /* Insert new sequence instead of current step */
+    /* NOTE(review): step->node.prev and .next are dereferenced unchecked, so both neighbours must exist */
+    syrk1->node.prev = step->node.prev;
+    (syrk1->node.prev)->next = &(syrk1->node);
+    step->node.prev = NULL;
+
+    syrk2->node.next = step->node.next;
+    (syrk2->node.next)->prev = &(syrk2->node);
+    step->node.next = NULL;
+    /* the original step was unlinked above and is now released */
+    freeSolutionStep(&(step->node));
+
+    /*
+     * Now, decompose each of these steps to evaluate the diagonal
+     * part in a dedicated kernel
+     */
+    decomposeSYRKStep(syrk1); /* result unused: only the node returned for syrk2 is handed back */
+    node = decomposeSYRKStep(syrk2);
+    return node;
+}
diff --git a/src/library/blas/gens/asum.cpp b/src/library/blas/gens/asum.cpp
new file mode 100644
index 0000000..3260acb
--- /dev/null
+++ b/src/library/blas/gens/asum.cpp
@@ -0,0 +1,300 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+// #define DEBUG_ASUM
+
+#define WORKGROUPS_PER_CU  32
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <asum.clT>
+#include <solution_seq.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Solver flags advertised by the asum memory pattern: always SF_WSPACE_1D.
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_ASUM
+    printf("solverFlags called...\n");
+#endif
+
+    return SF_WSPACE_1D;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+    fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initAsumRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+// Solver operation table for asum. It is installed as mempat->sops by
+// initAsumRegisterPattern(). The initializer is positional: the non-NULL
+// entries are the callbacks defined in this file, NULL entries are left unset.
+static SolverOps asumOps = {
+    generator,
+    assignKargs,
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,
+    NULL,
+    solverFlags,
+	fixupArgs,
+	NULL,
+	NULL,
+	setBuildOpts,
+	selectVectorization
+};
+
+// Decide whether the kernel can use the vectorized pointer directly.
+// args is the CLBlasKargs of the call; vlen is the candidate vector length.
+// Returns KEXTRA_NO_COPY_VEC_A when the X offset (offBX) is not a multiple
+// of vlen, KEXTRA_NO_FLAGS otherwise.
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+
+    if ((blasArgs->offBX % vlen) == 0) {
+        return KEXTRA_NO_FLAGS;
+    }
+    return KEXTRA_NO_COPY_VEC_A;
+}
+
+/*
+ * Append the preprocessor definitions needed by the asum kernel to the
+ * OpenCL build-option string.
+ *
+ * buildOptStr - NUL-terminated option string, extended in place with strcat();
+ *               it must have room for the appended options.
+ * args        - pointer to the SolutionStep whose kernel arguments are read.
+ *
+ * Options emitted:
+ *   -DDOUBLE_PRECISION  for TYPE_DOUBLE and TYPE_COMPLEX_DOUBLE
+ *   -DCOMPLEX           for TYPE_COMPLEX_FLOAT and TYPE_COMPLEX_DOUBLE
+ *   -DINCX_NONUNITY     when the X increment (ldb.vector) is not 1
+ *   -DINCX_NEGATIVE     when the X increment is less than 1
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *step = (const SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+
+    if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
+    {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        // This file's debug switch is DEBUG_ASUM (the guard used to be DEBUG_DOT).
+        #ifdef DEBUG_ASUM
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+        #endif
+    }
+    if ( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE))
+    {
+        strcat( buildOptStr, " -DCOMPLEX ");
+        #ifdef DEBUG_ASUM
+        printf("Setting build options ... Complex... for COMPLEX support\n");
+        #endif
+    }
+    if( (kargs->ldb.vector) != 1) {
+        strcat( buildOptStr, " -DINCX_NONUNITY ");
+    }
+    if( (kargs->ldb.vector) < 1) {
+        strcat( buildOptStr, " -DINCX_NEGATIVE ");
+    }
+    return;
+}
+
+
+// Pattern-specific extra data; mempat->extra points at this object.
+static CLBLASMpatExtra mpatExtra;
+
+// Fill in the memory pattern descriptor for asum: name, level layout, solver
+// operation table and memory-level extras. Also initializes the file-local
+// data-type prefix table (S, D, C, Z).
+// NOTE(review): the pattern name still says "swap"; it looks copied from the
+// swap generator. Confirm nothing depends on the name before changing it.
+extern "C"
+void initAsumRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_ASUM
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+
+    mempat->name = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &asumOps;
+    mempat->extra = &mpatExtra;
+}
+
+// Compute the global work size for the asum kernel.
+//
+// One work-group handles (work-group size * vecLenA) elements, and the number
+// of work-groups is capped at WORKGROUPS_PER_CU per compute unit of the target
+// device. If the compute-unit query fails, one compute unit is assumed.
+//
+// threads[0] receives work-groups * work-group size, threads[1] is set to 1.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBLASKernExtra *kextra = ( CLBLASKernExtra *)_extra;
+    CLBlasKargs *blasArgs = (CLBlasKargs *)args;
+    // The kernel arguments are embedded in a SolutionStep; recover it to
+    // reach the target device.
+    SolutionStep *owner = container_of(blasArgs, args, SolutionStep);
+
+    cl_int status;
+    unsigned int nrCUs = deviceComputeUnits( owner->device.id, &status );
+    if (status != CL_SUCCESS) {
+        nrCUs = 1;
+    }
+
+    unsigned int elemsPerItem = kextra->vecLenA;
+    unsigned int wgSize = pgran->wgSize[0] * pgran->wgSize[1];
+
+    // ceil(N / (wgSize * elemsPerItem)), then apply the per-device cap
+    unsigned int nrGroups = ((blasArgs->N - 1) / (wgSize * elemsPerItem)) + 1;
+    nrGroups = min( nrGroups, (nrCUs * WORKGROUPS_PER_CU) );
+
+    threads[0] = nrGroups * wgSize;
+    threads[1] = 1;
+}
+
+//
+// Generate the OpenCL source of the asum kernel into buf.
+//
+// When buf is NULL nothing is generated and the required buffer size
+// (32 KiB) is returned. Otherwise the asum_kernel template is expanded by
+// kprintf for the data type, vector length and work-group size found in
+// extra/pgran, and the same constant size is returned.
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    size_t wgSize = pgran->wgSize[0];
+    char kernelTemplate[32*1024];
+
+    if (buf == NULL) {
+        // size query only
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    unsigned int vecLenA = kextra->vecLenA;
+    // KEXTRA_NO_COPY_VEC_A is set by selectVectorization() when the X offset
+    // is not a multiple of the vector length.
+    bool useVload = (kextra->flags & KEXTRA_NO_COPY_VEC_A) != 0;
+
+#ifdef DEBUG_ASUM
+    printf("ASUM GENERATOR called....\n");
+    printf("dataType : %c\n", Prefix[kextra->dtype]);
+    printf("Vector length used : %d\n\n", vecLenA);
+    if (useVload) {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else {
+        printf("Using Aligned Data Pointer \n");
+    }
+#endif
+
+    strcpy( kernelTemplate, (char*)asum_kernel );
+    kprintf kobj( Prefix[kextra->dtype], vecLenA, useVload, useVload, wgSize);
+    kobj.spit((char*)buf, kernelTemplate);
+
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+ * Bind the host-side arguments to the kernel parameters, in the order of the
+ * kernel signature:
+ *
+ * __kernel void %PREFIXasum_kernel( __global %TYPE *_X, __global %TYPE *scratchBuff, uint N, uint offx, int incx)
+ *
+ * The buffers come from blasArgs->B and blasArgs->D, the offset from offBX
+ * and the increment from ldb.vector.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    cl_int incx = blasArgs->ldb.vector;
+
+    INIT_KARG(&args[0], blasArgs->B);
+    INIT_KARG(&args[1], blasArgs->D);
+    initSizeKarg(&args[2], blasArgs->N);
+    initSizeKarg(&args[3], blasArgs->offBX);
+    INIT_KARG(&args[4], incx);
+}
+
+/** Records the work-group size in the kernel key, so that a different kernel
+    is generated whenever the work-group size changes. The reduction loop is
+    unrolled by kprintf based on the work-group size.
+
+    The bwidth member of SubproblemDim is used to store the work-group size of
+    the current kernel; it becomes part of the kernelKey, and the kernel cache
+    is managed accordingly.
+    Note -- SubproblemDim is a member of kernelKey
+**/
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)args;
+    // the kernel arguments live inside the SolutionStep that owns pgran
+    SolutionStep *owner = container_of(blasArgs, args, SolutionStep);
+
+    DUMMY_ARG_USAGE(extra);
+
+    subdims->bwidth = (owner->pgran.wgSize[0]) * (owner->pgran.wgSize[1]);
+}
+
+
diff --git a/src/library/blas/gens/axpy_reg.cpp b/src/library/blas/gens/axpy_reg.cpp
new file mode 100644
index 0000000..0f8ced0
--- /dev/null
+++ b/src/library/blas/gens/axpy_reg.cpp
@@ -0,0 +1,279 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * axpy generator
+ */
+//#define DEBUG_AXPY
+
+#define WORKGROUPS_PER_CU  32
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <axpy.clT>
+#include <solution_seq.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Solver flags advertised by the axpy memory pattern: always SF_WSPACE_1D.
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_AXPY
+    printf("solverFlags called......\n");
+#endif
+
+    return SF_WSPACE_1D;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initAxpyRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+// Solver operation table for axpy. It is installed as mempat->sops by
+// initAxpyRegisterPattern(). The initializer is positional: the non-NULL
+// entries are the callbacks defined in this file, NULL entries are left unset.
+static SolverOps axpyOps = {
+    generator,
+    assignKargs,
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,
+    NULL,
+    solverFlags,
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,
+	selectVectorization
+};
+
+// Decide whether the kernel can use the vectorized pointers directly.
+// args is the CLBlasKargs of the call; vlen is the candidate vector length.
+// Returns KEXTRA_NO_FLAGS only when both the X offset (offBX) and the
+// Y offset (offCY) are multiples of vlen, KEXTRA_NO_COPY_VEC_A otherwise.
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+
+    if ((blasArgs->offBX % vlen) != 0) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    if ((blasArgs->offCY % vlen) != 0) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    return KEXTRA_NO_FLAGS;
+}
+
+/*
+ * Append the preprocessor definitions needed by the axpy kernel to the
+ * OpenCL build-option string (extended in place with strcat()).
+ *
+ * args points to the SolutionStep whose kernel arguments are read.
+ *
+ * Options emitted:
+ *   -DDOUBLE_PRECISION  for TYPE_DOUBLE and TYPE_COMPLEX_DOUBLE
+ *   -DINCX_NONUNITY     when the X increment (ldb.vector) is not 1
+ *   -DINCY_NONUNITY     when the Y increment (ldc.vector) is not 1
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *owner = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&owner->args);
+    const bool isDouble = (blasArgs->dtype == TYPE_DOUBLE) ||
+                          (blasArgs->dtype == TYPE_COMPLEX_DOUBLE);
+
+    if (isDouble) {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_AXPY
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+    if (blasArgs->ldb.vector != 1) {
+        strcat( buildOptStr, " -DINCX_NONUNITY ");
+    }
+    if (blasArgs->ldc.vector != 1) {
+        strcat( buildOptStr, " -DINCY_NONUNITY ");
+    }
+}
+
+
+// Pattern-specific extra data; mempat->extra points at this object.
+static CLBLASMpatExtra mpatExtra;
+
+// Fill in the memory pattern descriptor for axpy: name, level layout, solver
+// operation table and memory-level extras. Also initializes the file-local
+// data-type prefix table (S, D, C, Z).
+// NOTE(review): the pattern name still says "swap"; it looks copied from the
+// swap generator. Confirm nothing depends on the name before changing it.
+extern "C"
+void initAxpyRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_AXPY
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+
+    mempat->name = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &axpyOps;
+    mempat->extra = &mpatExtra;
+}
+
+// Compute the global work size for the axpy kernel.
+//
+// One work-group handles (work-group size * vecLenA) elements, and the number
+// of work-groups is capped at WORKGROUPS_PER_CU per compute unit of the target
+// device. If the compute-unit query fails, one compute unit is assumed.
+//
+// threads[0] receives work-groups * work-group size, threads[1] is set to 1.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBLASKernExtra *kextra = ( CLBLASKernExtra *)_extra;
+    CLBlasKargs *blasArgs = (CLBlasKargs *)args;
+    // The kernel arguments are embedded in a SolutionStep; recover it to
+    // reach the target device.
+    SolutionStep *owner = container_of(blasArgs, args, SolutionStep);
+
+    cl_int status;
+    unsigned int nrCUs = deviceComputeUnits( owner->device.id, &status );
+    if (status != CL_SUCCESS) {
+        nrCUs = 1;
+    }
+
+    unsigned int elemsPerItem = kextra->vecLenA;
+    unsigned int wgSize = pgran->wgSize[0] * pgran->wgSize[1];
+
+    // ceil(N / (wgSize * elemsPerItem)), then apply the per-device cap
+    unsigned int nrGroups = ((blasArgs->N - 1) / (wgSize * elemsPerItem)) + 1;
+    nrGroups = min( nrGroups, (nrCUs * WORKGROUPS_PER_CU) );
+
+    threads[0] = nrGroups * wgSize;
+    threads[1] = 1;
+}
+
+//
+// Generate the OpenCL source of the axpy kernel into buf.
+//
+// When buf is NULL nothing is generated and the required buffer size
+// (32 KiB) is returned. Otherwise the axpy_kernel template is expanded by
+// kprintf for the data type and vector length found in extra, and the same
+// constant size is returned.
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARGS_USAGE_2(pgran, subdims);
+    char kernelTemplate[32*1024];
+
+    if (buf == NULL) {
+        // size query only
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    unsigned int vecLenA = kextra->vecLenA;
+    // KEXTRA_NO_COPY_VEC_A is set by selectVectorization() when an offset
+    // is not a multiple of the vector length.
+    bool useVload = (kextra->flags & KEXTRA_NO_COPY_VEC_A) != 0;
+
+#ifdef DEBUG_AXPY
+    printf("AXPY GENERATOR called....\n");
+    printf("dataType : %c\n", Prefix[kextra->dtype]);
+    printf("Vector length used : %d\n\n", vecLenA);
+    if (useVload) {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else {
+        printf("Using Aligned Data Pointer .......\n");
+    }
+#endif
+
+    strcpy( kernelTemplate, (char*)axpy_kernel );
+    kprintf kobj( Prefix[kextra->dtype], vecLenA, useVload, useVload);
+    kobj.spit((char*)buf, kernelTemplate);
+
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+ * Bind the host-side arguments to the kernel parameters, in the order of the
+ * kernel signature:
+ *
+ * __kernel void %PREFIXaxpy_kernel( %TYPE _alpha, __global %TYPE *_X, __global %TYPE *_Y,
+ *                                         uint N, uint offx, int incx, uint offy, int incy )
+ *
+ * The buffers come from blasArgs->A and blasArgs->B, the offsets from
+ * offBX/offCY and the increments from ldb.vector/ldc.vector.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    cl_int incx = blasArgs->ldb.vector;
+    cl_int incy = blasArgs->ldc.vector;
+
+    assignScalarKarg(&args[0], &(blasArgs->alpha), blasArgs->dtype);
+    INIT_KARG(&args[1], blasArgs->A);
+    INIT_KARG(&args[2], blasArgs->B);
+    initSizeKarg(&args[3], blasArgs->N);
+    initSizeKarg(&args[4], blasArgs->offBX);
+    INIT_KARG(&args[5], incx);
+    initSizeKarg(&args[6], blasArgs->offCY);
+    INIT_KARG(&args[7], incy);
+}
diff --git a/src/library/blas/gens/blas_kgen.c b/src/library/blas/gens/blas_kgen.c
new file mode 100644
index 0000000..595fe10
--- /dev/null
+++ b/src/library/blas/gens/blas_kgen.c
@@ -0,0 +1,1580 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * common stuff for blas related
+ * kernel generators
+ */
+
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <list.h>
+#include <clblas_stddef.h>
+
+#include <matrix_props.h>
+#include <matrix_dims.h>
+#include <dis_warning.h>
+
+#include "blas_kgen.h"
+#include "gen_helper.h"
+#include "tile_iter.h"
+#include "kerngen.h"
+
+#define IDX_INVAL ((unsigned int)-1)
+
+enum {
+    COORD_STRLEN = 64
+};
+
+/*
+ * Select the vector length used for temporary result values.
+ *
+ * Returns 1 for complex data types and whenever uflags contains
+ * UPRES_GENERIC or UPRES_NO_VECTORIZATION; *vecName is left untouched in
+ * that case. Otherwise returns kextra->vecLenC (with BGF_DISTINCT_VECLEN)
+ * or kextra->vecLen, and stores the matching vector type name in *vecName.
+ */
+static unsigned int
+getTmpVecLen(
+    const BlasGenSettings *gset,
+    UpdateResultFlags uflags,
+    const char **vecName)
+{
+    const CLBLASKernExtra *extra = gset->kextra;
+    unsigned int len;
+
+    if (isComplexType(extra->dtype) ||
+        (uflags & (UPRES_GENERIC | UPRES_NO_VECTORIZATION))) {
+        return 1;
+    }
+
+    if (gset->flags & BGF_DISTINCT_VECLEN) {
+        len = extra->vecLenC;
+    }
+    else {
+        len = extra->vecLen;
+    }
+    getVectorTypeName(extra->dtype, len, vecName, NULL);
+
+    return len;
+}
+
+/*
+ * Try to transform a kernel string to an unsigned integer.
+ *
+ * str - NUL-terminated string to parse (base 10).
+ * num - receives the parsed value on success; left untouched on failure.
+ *
+ * Returns 0 on success and -1 if the string is not a number, i.e. if it is
+ * empty, has trailing characters, or does not fit into an unsigned int
+ * (negative values and overflow are rejected rather than silently wrapped).
+ */
+static int
+stringToInt(const char *str, unsigned int *num)
+{
+    char *end;
+    long value;
+
+    errno = 0;
+    value = strtol(str, &end, 10);
+
+    // believe it is a number only if the string has been parsed completely
+    if ((end == str) || (*end != '\0')) {
+        return -1;
+    }
+    // strtol() clamps and sets ERANGE on overflow; a plain cast would hide it
+    if ((errno == ERANGE) || (value < 0) ||
+        ((unsigned long)value > UINT_MAX)) {
+        return -1;
+    }
+
+    *num = (unsigned int)value;
+
+    return 0;
+}
+
+/*
+ * Print the OpenCL component selector for a chunk of a vector.
+ *
+ * chunk  - output buffer; needs room for clen + 3 characters.
+ * vecLen - length of the whole vector.
+ * clen   - number of components in the chunk.
+ * vecOff - index of the first component of the chunk.
+ *
+ * When the chunk covers the whole vector (clen == vecLen) the result is an
+ * empty string; otherwise it is ".s" followed by up to clen hexadecimal
+ * component digits starting at vecOff, e.g. ".s12".
+ */
+void
+sprintfVecChunk(
+    char *chunk,
+    unsigned int vecLen,
+    unsigned int clen,
+    unsigned int vecOff)
+{
+    const char *digits = "0123456789abcdef";
+    const char *src = digits + vecOff;
+    unsigned int i;
+
+    if (clen == vecLen) {
+        chunk[0] = '\0';
+        return;
+    }
+
+    chunk[0] = '.';
+    chunk[1] = 's';
+    // copy at most clen component digits, stopping at the end of the table
+    for (i = 0; (i < clen) && (src[i] != '\0'); i++) {
+        chunk[i + 2] = src[i];
+    }
+    chunk[i + 2] = '\0';
+    chunk[clen + 2] = '\0';
+}
+
+/*
+ * Return the vector length to use for the matrix with the given role.
+ *
+ * Without BGF_DISTINCT_VECLEN all matrices share one length: the minimum of
+ * vecLenA, vecLenB and vecLenC. With the flag set, each of MATRIX_A,
+ * MATRIX_B and MATRIX_C gets its own length; any other role yields 0.
+ * funcID is currently not used.
+ */
+unsigned int
+getVecLen(const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole)
+{
+    const CLBLASKernExtra *extra = gset->kextra;
+
+    DUMMY_ARG_USAGE(funcID);
+
+    if (!(gset->flags & BGF_DISTINCT_VECLEN)) {
+        unsigned int shared = umin(extra->vecLenA, extra->vecLenB);
+
+        return umin(shared, extra->vecLenC);
+    }
+
+    if (mrole == MATRIX_A) {
+        return extra->vecLenA;
+    }
+    if (mrole == MATRIX_B) {
+        return extra->vecLenB;
+    }
+    if (mrole == MATRIX_C) {
+        return extra->vecLenC;
+    }
+
+    return 0;
+}
+
+/*
+ * Generate code dividing the leading dimensions by the vector length, so
+ * that they are expressed in vectors. Nothing is generated unless
+ * BGF_LD_IN_VECTORS is set.
+ *
+ * A statement "<ld> /= <vecLen>;" is emitted for each of lda, ldb and ldc
+ * that has a variable name and a vector length above 1. A variable name
+ * shared with an earlier leading dimension (same pointer) is skipped so it
+ * is not scaled twice. A blank line follows if anything was emitted.
+ */
+void
+genScaleLeadingDimensions(struct KgenContext *ctx, const BlasGenSettings *gset)
+{
+    const KernelVarNames *names;
+    unsigned int len;
+    unsigned int emitted = 0;
+
+    if (!(gset->flags & BGF_LD_IN_VECTORS)) {
+        return;
+    }
+
+    names = &gset->varNames;
+
+    len = getVecLen(gset, CLBLAS_GEMM, MATRIX_A);
+    if ((names->lda != NULL) && (len > 1)) {
+        kgenPrintf(ctx, "%s /= %u;\n", names->lda, len);
+        emitted++;
+    }
+
+    len = getVecLen(gset, CLBLAS_GEMM, MATRIX_B);
+    if ((names->ldb != NULL) && (len > 1) && (names->ldb != names->lda)) {
+        kgenPrintf(ctx, "%s /= %u;\n", names->ldb, len);
+        emitted++;
+    }
+
+    len = getVecLen(gset, CLBLAS_GEMM, MATRIX_C);
+    if ((names->ldc != NULL) && (len > 1) &&
+        (names->ldc != names->lda) && (names->ldc != names->ldb)) {
+        kgenPrintf(ctx, "%s /= %u;\n", names->ldc, len);
+        emitted++;
+    }
+
+    if (emitted != 0) {
+        kgenAddBlankLine(ctx);
+    }
+}
+
+/*
+ * Fill in the description of the private area holding a matrix tile.
+ *
+ * The vector length comes from getVecLen() and the type name from
+ * getVectorTypeName(). The size is counted in vectors and derived from the
+ * second subproblem dimension (gset->subdims[1]):
+ *   MATRIX_C:  divRoundUp(x, vecLen) * y
+ *   MATRIX_A:  y * bwidth / vecLen
+ *   otherwise: x * bwidth / vecLen
+ */
+void
+getPrivateAreaInfo(
+    const BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    MatrixRole mrole,
+    PrivateArea *area)
+{
+    const SubproblemDim *tileDim = &gset->subdims[1];
+    size_t lines;
+
+    area->vecLen = getVecLen(gset, funcID, mrole);
+    getVectorTypeName(gset->kextra->dtype, area->vecLen, &area->typeName, NULL);
+
+    if (mrole == MATRIX_C) {
+        area->size = (unsigned int)(divRoundUp(tileDim->x, area->vecLen) *
+                                    tileDim->y);
+        return;
+    }
+
+    if (mrole == MATRIX_A) {
+        lines = tileDim->y;
+    }
+    else {
+        lines = tileDim->x;
+    }
+    area->size = (unsigned int)(lines * tileDim->bwidth / area->vecLen);
+}
+
+/*
+ * Generate the declaration of a private area.
+ *
+ * ctx      - generator context receiving the statement(s).
+ * area     - type name and size (in vectors) of the area.
+ * baseName - base name of the declared variable(s).
+ * storType - PRIV_STORAGE_ARRAY declares one array "<type> <name>[<size>];"
+ *            any other value declares a set of variables
+ *            "<type> <name>0, <name>1, ...;".
+ *
+ * All formatting is bounded by the local buffer. If a variable set does not
+ * fit into one statement, it is continued in a further declaration of the
+ * same type instead of writing past the end of the buffer.
+ */
+void
+declarePrivateArea(
+    struct KgenContext *ctx,
+    const PrivateArea *area,
+    const char *baseName,
+    PrivateStorageType storType)
+{
+    /* room always kept free for the closing ";\n" and its terminator */
+    enum { TAIL_LEN = 3 };
+    char tmp[1024];
+    size_t used;
+    unsigned int i;
+    int n;
+
+    // TODO: separate case for size equal to 1
+    if (storType == PRIV_STORAGE_ARRAY) {
+        snprintf(tmp, sizeof(tmp), "%s %s[%u];\n",
+                 area->typeName, baseName, area->size);
+        tmp[sizeof(tmp) - 1] = '\0';
+        kgenAddStmt(ctx, tmp);
+        return;
+    }
+
+    n = snprintf(tmp, sizeof(tmp), "%s %s0", area->typeName, baseName);
+    if ((n < 0) || ((size_t)n + TAIL_LEN > sizeof(tmp))) {
+        /* not even a single declarator fits: nothing sane can be emitted */
+        return;
+    }
+    used = (size_t)n;
+
+    for (i = 1; i < area->size; i++) {
+        n = snprintf(tmp + used, sizeof(tmp) - used, ", %s%u", baseName, i);
+        if ((n >= 0) && (used + (size_t)n + TAIL_LEN <= sizeof(tmp))) {
+            used += (size_t)n;
+            continue;
+        }
+
+        /*
+         * The declarator did not fit. Close the statement collected so far
+         * (this also discards the partial output) and start a new one.
+         */
+        strcpy(tmp + used, ";\n");
+        kgenAddStmt(ctx, tmp);
+
+        n = snprintf(tmp, sizeof(tmp), "%s %s%u",
+                     area->typeName, baseName, i);
+        if ((n < 0) || ((size_t)n + TAIL_LEN > sizeof(tmp))) {
+            return;
+        }
+        used = (size_t)n;
+    }
+
+    strcpy(tmp + used, ";\n");
+    kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Default post-fetch callback: generate code zeroing the tail of a freshly
+ * fetched tile, i.e. the elements whose K coordinate is not below sizeK.
+ *
+ * ctx   - generator context receiving the statements.
+ * mrole - MATRIX_A selects tileA, any other role selects tileBX.
+ * priv  - TilePostFetchPrivate with generator settings, function ID and the
+ *         fetchNumA counter.
+ *
+ * Returns 0 on success or when isNeedZeroTileTail() reports that no zeroing
+ * is needed; otherwise the first non-zero result of kgenAddStmt() or
+ * kgenAddBlankLine(), after which generation stops.
+ */
+int
+defaultTilePostFetch(
+    struct KgenContext *ctx,
+    MatrixRole mrole,
+    void *priv)
+{
+    char tmp[1024], cond[128];
+    Kstring src;
+    TilePostFetchPrivate *pfPriv = (TilePostFetchPrivate*)priv;
+    bool distVect = (pfPriv->gset->flags & BGF_DISTINCT_VECLEN);
+    const KernelVarNames *vnames = &pfPriv->gset->varNames;
+    const CLBLASKernExtra *kextra = pfPriv->gset->kextra;
+    const SubproblemDim *dim = &pfPriv->gset->subdims[1];
+    BlasFunctionID funcID = pfPriv->funcID;
+    const Tile* tile;
+    bool partA;
+    unsigned int step;
+    unsigned int i, j;
+    int ret = 0;
+    unsigned int maxJ = 0;
+    unsigned int maxI = 0;
+
+    if (!isNeedZeroTileTail(funcID, dim, kextra, mrole, distVect)) {
+        return 0;
+    }
+
+    // j walks the tile columns for A and the tile rows for B;
+    // i walks the other tile dimension
+    if (mrole == MATRIX_A) {
+        tile = &pfPriv->gset->tileA;
+        maxJ = tile->nrCols;
+        maxI = tile->nrRows;
+    }
+    else {
+        tile = &pfPriv->gset->tileBX;
+        maxJ = tile->nrRows;
+        maxI = tile->nrCols;
+    }
+
+    // true when a transposed A tile is not handled as a whole
+    // (BGF_WHOLE_A unset); only then fetchNumA is advanced below
+    partA = (mrole == MATRIX_A) && tile->trans &&
+            !(pfPriv->gset->flags & BGF_WHOLE_A);
+    step = tileLineSegmentLen(tile);
+    step = (tile->trans ^ (mrole == MATRIX_A)) ? 1 : step;
+
+    for (j = 0; (j < maxJ) && !ret; j++) {
+        unsigned int k;
+
+        // offset added to the K coordinate in the bound check
+        k = umax(j, (unsigned int)pfPriv->fetchNumA);
+        if (k) {
+            sprintf(tmp, " + %u", k);
+        }
+        else {
+            tmp[0] = '\0';
+        }
+        // condition text: "(<k> + <offset> < <sizeK>)"
+        sprintf(cond, "(%s%s < %s)", vnames->k, tmp, vnames->sizeK);
+
+        for (i = 0; (i < maxI) && !ret; i += step) {
+            if (mrole != MATRIX_A) {
+                sprintfTileElement(&src, tile, j, i, step);
+            }
+            else {
+                sprintfTileElement(&src, tile, i, j, step);
+            }
+            // keep the element inside the bound, replace it with 0 otherwise
+            sprintf(tmp, "%s = %s ? %s : 0;\n", src.buf, cond, src.buf);
+            ret = kgenAddStmt(ctx, tmp);
+        }
+    }
+
+    if (partA) {
+        pfPriv->fetchNumA++;
+    }
+
+    // separate the block with a blank line if the tile holds more than one vector
+    if ((tile->nrCols * tile->nrRows / tile->vecLen > 1) && !ret) {
+        ret = kgenAddBlankLine(ctx);
+    }
+
+    return ret;
+}
+
+/*
+ * Return the BLAS function prefix character for a data type: 's' for
+ * TYPE_FLOAT, and whatever dtypeToPrefix() returns for every other type.
+ */
+char
+dtypeToBlasPrefix(DataType dtype)
+{
+    if (dtype != TYPE_FLOAT) {
+        return dtypeToPrefix(dtype);
+    }
+
+    return 's';
+}
+
+/*
+ * Translate kernel extra flags into tile multiplication flags.
+ *
+ * TILEMUL_TRA is set when matrix A is accessed in column-major order,
+ * TILEMUL_TRB when matrix B is NOT accessed in column-major order, and
+ * TILEMUL_CONJA / TILEMUL_CONJB when the respective matrix is conjugated.
+ */
+TileMulFlags
+kextraToTilemulFlags(BlasFunctionID funcID, KernelExtraFlags kflags)
+{
+    TileMulFlags result = TILEMUL_NO_FLAGS;
+
+    /* matrix A */
+    if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A)) {
+        result |= TILEMUL_TRA;
+    }
+    if (isMatrixConj(kflags, MATRIX_A)) {
+        result |= TILEMUL_CONJA;
+    }
+
+    /* matrix B: the transposition flag has the opposite sense */
+    if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_B)) {
+        result |= TILEMUL_TRB;
+    }
+    if (isMatrixConj(kflags, MATRIX_B)) {
+        result |= TILEMUL_CONJB;
+    }
+
+    return result;
+}
+
+/*
+ * Report how the result tile is kept in registers.
+ *
+ * dtype    - data type of the result.
+ * dims     - subproblem dimensions; x and y give the tile size.
+ * vecLen   - vector length used for real data types.
+ * nrRegs   - if not NULL, receives the number of registers: x * y for
+ *            complex types, divRoundUp(x, vecLen) * y otherwise.
+ * typeName - if not NULL, receives the register type name: the builtin type
+ *            for complex types, the vector type of length vecLen otherwise.
+ */
+void
+getResultGPRsInfo(
+    DataType dtype,
+    const SubproblemDim *dims,
+    unsigned int vecLen,
+    unsigned int *nrRegs,
+    const char **typeName)
+{
+    const bool isComplex = isComplexType(dtype);
+
+    if (nrRegs) {
+        if (isComplex) {
+            *nrRegs = (unsigned int)(dims->x * dims->y);
+        }
+        else {
+            // handle different vecLen values and fetch vector sizes
+            *nrRegs = (unsigned int)(divRoundUp(dims->x, vecLen) * dims->y);
+        }
+    }
+
+    if (typeName != NULL) {
+        if (isComplex) {
+            *typeName = dtypeBuiltinType(dtype);
+        }
+        else {
+            getVectorTypeName(dtype, vecLen, typeName, NULL);
+        }
+    }
+}
+
+/*
+ * Generate the declaration of a typed __global pointer to matrix C,
+ * initialized from a member of the pointer variable GPtrName:
+ *
+ *     __global <type> *<VCPtrName> = <GPtrName>.<member>;
+ *
+ * <type> is the vector type for min(vector length of C, tileCY.vecLen).
+ * <member> is "d" or "f" (double- or float-based data) when the member
+ * vector length is 1, and "d<N>v" / "f<N>v" otherwise. The member vector
+ * length is 1 unless BGF_LD_IN_VECTORS is set, and is doubled for complex
+ * types.
+ */
+static void genVectorCPtr( struct KgenContext *pCtx,
+    const BlasGenSettings *pGSet,
+    const char* GPtrName,
+    const char* VCPtrName )
+{
+    const char *ptrType;
+    unsigned int width;
+
+    // Blas function ID is omitted
+    width = getVecLen( pGSet, 0, MATRIX_C );
+    if ( width > pGSet->tileCY.vecLen ) {
+        width = pGSet->tileCY.vecLen;
+    }
+
+    getVectorTypeName( pGSet->kextra->dtype, width, &ptrType, NULL );
+
+    if ( !(pGSet->flags & BGF_LD_IN_VECTORS) ) {
+        width = 1;
+    }
+    if ( isComplexType( pGSet->kextra->dtype ) ) {
+        width *= 2;
+    }
+
+    if ( isDoubleBasedType(pGSet->kextra->dtype) ) {
+        if ( width == 1 ) {
+            kgenPrintf( pCtx, "__global %s *%s = %s.d;\n",
+                ptrType, VCPtrName, GPtrName );
+        }
+        else {
+            kgenPrintf( pCtx, "__global %s *%s = %s.d%dv;\n",
+                ptrType, VCPtrName, GPtrName, width );
+        }
+        return;
+    }
+
+    if ( width == 1 ) {
+        kgenPrintf( pCtx, "__global %s *%s = %s.f;\n",
+            ptrType, VCPtrName, GPtrName );
+    }
+    else {
+        kgenPrintf( pCtx, "__global %s *%s = %s.f%dv;\n",
+            ptrType, VCPtrName, GPtrName, width );
+    }
+}
+
+static void
+updateOptimResultGen(
+    struct KgenContext *pCtx,
+    const BlasGenSettings *pGSet,
+    BlasFunctionID funcID,
+    UpdateResultOp op,
+    UpdateResultFlags flags)
+{
+    KernelExtraFlags kflags = pGSet->kextra->flags;
+    Tile tempCTile;
+    Tile fullCTile;
+    unsigned int physVecLenC;
+    DataType dtype;
+    const KernelVarNames *pVNames = NULL;
+    PhysTileIterator physIter;
+    PhysTileIterator blkIter;
+    char cPtrName[] = "pC";
+    const char *typeNameC;
+    bool phyTrans = 0;
+    unsigned int vecLen = 0;
+    unsigned int nBlocks = 0;
+    unsigned int i = 0;
+
+    Kstring cElem;
+    Kstring tempCElem;
+    Kstring kstrFirst;
+    Kstring kstrSecond;
+    Kstring kstrThird;
+    Kstring expr;
+
+    //EINVAL
+    if ( NULL == pCtx ||
+        NULL == pGSet ) {
+
+        return;
+    }
+
+    dtype = pGSet->kextra->dtype;
+    pVNames = &pGSet->varNames;
+    phyTrans = ( (flags & UPRES_COLUMN_MAJOR ) != 0 );
+
+    physVecLenC = getVecLen( pGSet, funcID, MATRIX_C );
+    getVectorTypeName( dtype,
+        getVecLen( pGSet,0,MATRIX_C ),
+        &typeNameC,
+        NULL );
+
+    // declare private C pointer
+    genVectorCPtr( pCtx, pGSet, "uC", "pC" );
+
+    kgenAddBlankLine( pCtx );
+
+    // calculate the number of blocks, update should be divided on
+    nBlocks = pGSet->tileCY.nrCols * pGSet->tileCY.nrRows/(
+        pGSet->tileA.nrCols*pGSet->tileA.nrRows +
+        pGSet->tileBX.nrCols*pGSet->tileBX.nrRows );
+
+    if( pGSet->tileCY.nrCols * pGSet->tileCY.nrRows%(
+        pGSet->tileA.nrCols*pGSet->tileA.nrRows +
+        pGSet->tileBX.nrCols*pGSet->tileBX.nrRows ) ){
+
+        nBlocks++;
+    }
+
+    nBlocks = roundUpPow2( (int)nBlocks );
+
+    // declare the temporary C tile
+    // temporary C tile must have the same transposition as C matrix
+    // for read-write optimization it also has the same vectorization
+    if ( phyTrans ) {
+
+        if ( nBlocks > pGSet->tileCY.nrCols ) {
+            nBlocks = pGSet->tileCY.nrCols;
+        }
+
+        initTile( &tempCTile,
+            "tempC",
+            pGSet->tileCY.nrRows,
+            pGSet->tileCY.nrCols/nBlocks,
+            pGSet->tileCY.vecLen,
+            dtype,
+            PRIV_STORAGE_VARIABLE_SET,
+            phyTrans,
+            true );
+
+        initTile( &fullCTile,
+            "fullC",
+            pGSet->tileCY.nrRows,
+            pGSet->tileCY.nrCols,
+            pGSet->tileCY.vecLen,
+            dtype,
+            PRIV_STORAGE_VARIABLE_SET,
+            phyTrans,
+            true);
+    }
+    else {
+
+        if ( nBlocks > pGSet->tileCY.nrRows ) {
+            nBlocks = pGSet->tileCY.nrRows;
+        }
+
+        initTile( &tempCTile,
+            "tempC",
+            pGSet->tileCY.nrRows/nBlocks,
+            pGSet->tileCY.nrCols,
+            pGSet->tileCY.vecLen,
+            dtype,
+            PRIV_STORAGE_VARIABLE_SET,
+            phyTrans,
+            true );
+
+        initTile( &fullCTile,
+            "fullC",
+            pGSet->tileCY.nrRows,
+            pGSet->tileCY.nrCols,
+            pGSet->tileCY.vecLen,
+            dtype,
+            PRIV_STORAGE_VARIABLE_SET,
+            phyTrans,
+            true);
+    }
+
+    declareOneTileStorage( pCtx, &tempCTile );
+
+    // splitting update result on several blocks to prevent
+    // increasing GPR usage
+    for ( i = 0; i < nBlocks; i++ ) {
+
+        kgenAddBlankLine(pCtx);
+
+        // fetch ------------------------------------------------------------------
+        vecLen = umin( physVecLenC, pGSet->tileCY.vecLen );
+        vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) );
+
+        iterInit( &blkIter, &tempCTile, vecLen, 0 );
+        iterInit( &physIter, &fullCTile, vecLen, 0 );
+
+        iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec );
+
+        if (op == UPRES_SUM) {
+            for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ),
+                                               iterIterate( &physIter ) ) {
+
+                emptyKstring( &kstrFirst );
+                emptyKstring( &kstrSecond );
+                emptyKstring( &kstrThird );
+                emptyKstring( &cElem );
+                emptyKstring( &tempCElem );
+
+                sprintfTileElement( &tempCElem,
+                    &tempCTile,
+                    blkIter.row,
+                    blkIter.col,
+                    vecLen);
+
+                ksprintf( &kstrFirst, "%d", physIter.line );
+                ksprintf( &kstrSecond, "%s", pVNames->ldc );
+                ksprintf( &kstrThird, "%d", blkIter.vec );
+
+                sprintfFastScalarMad( &expr,
+                    &kstrFirst,
+                    &kstrSecond,
+                    vecLen,//physVecLenC,//scale ldc
+                    &kstrThird);
+
+                kgenPrintf( pCtx,
+                    "%s = %s[%s];\n",
+                    tempCElem.buf,
+                    cPtrName,
+                    expr.buf );
+
+            }
+        }
+
+        // beta ---------------------------------------------------------------
+        if ( flags & UPRES_WITH_BETA ) {
+
+            if ( isComplexType(dtype) ||
+                ( pGSet->tileCY.trans != tempCTile.trans ) ) {
+                vecLen = 1;
+            }
+            //TODO: for real datatype find longest available veclen can be used
+            //to generate more compact code
+            else {
+                vecLen = pGSet->tileCY.vecLen;
+            }
+            vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) );
+
+            iterInit( &blkIter, &tempCTile, vecLen, 0 );
+
+            for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ) ) {
+
+                sprintfTileElement( &tempCElem,
+                    &tempCTile,
+                    blkIter.row,
+                    blkIter.col,
+                    vecLen);
+
+                if ( isComplexType(dtype) ) {
+                    //complex mad
+                    ksprintf( &kstrSecond, "%s", pVNames->beta );
+                    sprintfComplexMulUpdate( &expr,
+                        &tempCElem,
+                        &tempCElem,
+                        &kstrSecond,
+                        NULL,
+                        isDoubleBasedType(dtype),
+                        0,
+                        0,
+                        0 );
+                    kgenPrintf( pCtx, "%s", expr.buf );
+                }
+                else {
+                    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
+                        kgenPrintf( pCtx,
+                            "%s = mad(%s, %s, 0);\n",
+                            tempCElem.buf,
+                            tempCElem.buf,
+                            pVNames->beta);
+                    }
+                    else {
+                        kgenPrintf( pCtx,
+                            "%s = %s * %s;\n",
+                            tempCElem.buf,
+                            tempCElem.buf,
+                            pVNames->beta);
+                    }
+                }
+            }
+        }
+
+        // alpha---------------------------------------------------------------
+        if ( (phyTrans == pGSet->tileCY.trans) && (!isComplexType(dtype)) ) {
+
+            vecLen = pGSet->tileCY.vecLen;
+        }
+        else {
+            vecLen = 1;
+        }
+        vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) );
+
+        iterInit( &blkIter, &tempCTile, vecLen, 0 );
+        iterInit( &physIter, &fullCTile, vecLen, 0 );
+
+        iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec );
+
+        for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ),
+                                            iterIterate( &physIter) ) {
+
+            const Kstring *dst;
+
+            dst = (flags & UPRES_PRIV_DEST) ? &cElem : &tempCElem;
+
+            sprintfTileElement( &tempCElem,
+                &tempCTile,
+                blkIter.row,
+                blkIter.col,
+                vecLen);
+
+            sprintfTileElement( &cElem,
+                &pGSet->tileCY,
+                physIter.row,
+                physIter.col,
+                vecLen);
+
+            // complex
+            if ( isComplexType(dtype) ) {
+
+                ksprintf( &kstrSecond, "%s", pVNames->alpha );
+
+                // upres op: sum or set, if set, third argument
+                // of complex mad() is zero
+                sprintfComplexMulUpdate( &expr,
+                    dst,
+                    &cElem,
+                    &kstrSecond,
+                    (op == UPRES_SUM) ? &tempCElem : NULL,
+                    isDoubleBasedType(dtype),
+                    0,
+                    0,
+                    0);
+                kgenPrintf( pCtx, "%s", expr.buf );
+
+            }
+            // real
+            else {
+
+                // upres op: sum or set, if set, third argument
+                // of mad() is zero
+                if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
+                    kgenPrintf( pCtx,
+                        "%s = mad(%s, %s, %s);\n",
+                        dst,
+                        cElem.buf,
+                        pVNames->alpha,
+                        (op == UPRES_SUM) ? tempCElem.buf : "0" );
+                }
+                else {
+                    kgenPrintf( pCtx,
+                        "%s = %s * %s + %s;\n",
+                        dst,
+                        cElem.buf,
+                        pVNames->alpha,
+                        (op == UPRES_SUM) ? tempCElem.buf : "0" );
+                }
+            }
+        }
+
+        if (flags & UPRES_PRIV_DEST) {
+            return;
+        }
+
+        // store---------------------------------------------------------------
+        vecLen = umin( physVecLenC, pGSet->tileCY.vecLen );
+        vecLen = umin( vecLen, tileLineSegmentLen( &tempCTile ) );
+
+        iterInit( &blkIter, &tempCTile, vecLen, 0 );
+        iterInit( &physIter, &fullCTile, vecLen, 0 );
+
+        iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec );
+
+        for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ),
+                                            iterIterate( &physIter ) ) {
+
+            emptyKstring( &kstrFirst );
+            emptyKstring( &kstrSecond );
+            emptyKstring( &kstrThird );
+            emptyKstring( &cElem );
+            emptyKstring( &tempCElem );
+
+            sprintfTileElement( &tempCElem,
+                &tempCTile,
+                blkIter.row,
+                blkIter.col,
+                vecLen);
+
+            ksprintf( &kstrFirst, "%d", physIter.line );
+            ksprintf( &kstrSecond, "%s", pVNames->ldc );
+            ksprintf( &kstrThird, "%d", blkIter.vec );
+
+            sprintfFastScalarMad( &expr,
+                &kstrFirst,
+                &kstrSecond,
+                vecLen,//physVecLenC,//scale ldc
+                &kstrThird);
+
+            kgenPrintf( pCtx,
+                "%s[%s] = %s;\n",
+                cPtrName,
+                expr.buf,
+                tempCElem.buf );
+
+        }
+    }
+
+}
+
+/*
+ * Emit one statement updating a single destination element from a single
+ * source element.
+ *
+ * The statement has the form "<dst part> <src part>;" where
+ *   - with UPRES_WITH_BETA the destination part scales 'dst' by beta
+ *     (using the betaR/betaI parts for complex types) and adds to it,
+ *   - otherwise it is a plain "dst =" (UPRES_SET) or "dst +=",
+ *   - the source part is 'src' itself (UPRES_WITHOUT_ALPHA) or 'src'
+ *     multiplied by alpha (alphaR/alphaI for complex types).
+ *
+ * Returns 0 on success and -EOVERFLOW if the statement could not be added
+ * to the generator context.
+ */
+int
+genUpdateResultSingle(
+    struct KgenContext *ctx,
+    const char *dst,
+    const char *src,
+    const BlasGenSettings *gset,
+    UpdateResultOp op,
+    UpdateResultFlags flags)
+{
+    const UpdateResultFlags betaGeneric = UPRES_WITH_BETA | UPRES_GENERIC;
+    const bool complexData = isComplexType(gset->kextra->dtype);
+    const bool withBeta = ((flags & UPRES_WITH_BETA) != 0);
+    char stmt[1024];
+    char *tail = stmt;      // always points at the terminating NUL
+
+    // destination together with the respective operator
+    if (withBeta && complexData) {
+        tail += sprintf(tail, "%s = %s * betaR + %s.yx * betaI + ",
+                        dst, dst, dst);
+    }
+    else if (withBeta) {
+        tail += sprintf(tail, "%s = %s * beta + ", dst, dst);
+    }
+    else {
+        tail += sprintf(tail, "%s %s ", dst, (op == UPRES_SET) ? "=" : "+=");
+    }
+
+    // long complex expressions are continued on the next line
+    if (complexData && ((flags & betaGeneric) == betaGeneric)) {
+        strcpy(tail, "\n                    ");
+        tail += strlen(tail);
+    }
+
+    // source, optionally multiplied by alpha
+    if (flags & UPRES_WITHOUT_ALPHA) {
+        sprintf(tail, "%s;\n", src);
+    }
+    else if (complexData) {
+        sprintf(tail, "%s * alphaR + %s.yx * alphaI;\n", src, src);
+    }
+    else {
+        sprintf(tail, "%s * alpha;\n", src);
+    }
+
+    return kgenAddStmt(ctx, stmt) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Generate the body of the generic result update, i.e. the one that takes
+ * the number of rows and columns to be updated from run-time variables.
+ *
+ * ctx        - generator context the statements are added to
+ * gset       - generator settings; the tile C description, the data type
+ *              and the subproblem sizes are taken from here
+ * pitch      - line pitch of the private result, in elements
+ * uvars      - names of the kernel variables used in the generated code
+ * op, flags  - passed through to genUpdateResultSingle(); 'flags' also
+ *              selects the matrix order and the tail handling
+ * cachedName - optional format with two "%u" placeholders; used only by
+ *              the loop based variant, where it is appended to the
+ *              destination as " = <cachedName>" with the placeholders
+ *              substituted by "i" and "[j]"
+ *
+ * Two code shapes are produced:
+ *   - "indexing with constants" (UPRES_INDEXING_WITH_CONSTANTS is set or
+ *     the tile C is not stored as PRIV_STORAGE_ARRAY): an unrolled sequence
+ *     of "if (<lines>) { switch (<elems>) { case N: ... } ... }" blocks.
+ *     No "break" is emitted between the cases.
+ *   - otherwise: two nested "for" loops over 'i' and 'j' reading the
+ *     result through the 'res' pointer.
+ *
+ * Results of kgenAddStmt() are not checked here.
+ */
+static void
+updateGenericResultGen(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    size_t pitch,
+    UpresVarNames* uvars,
+    UpdateResultOp op,
+    UpdateResultFlags flags,
+    const char *cachedName)
+{
+    char tmp[1024], dst[128], src[128];
+    const char *boundNames[2] = {uvars->nrRows, uvars->nrCols};
+    const char *vecType = NULL;
+    const char *vFieldVectorized;
+    DataType dtype = gset->kextra->dtype;
+    unsigned int wvlen;
+    unsigned int sizes[2];
+    const char*  vfield = dtypeUPtrField(dtype);
+    bool tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
+    bool row = ((flags & UPRES_TAIL_ROW));
+    bool col = ((flags & UPRES_TAIL_COL));
+    bool iwc = ((flags & UPRES_INDEXING_WITH_CONSTANTS) != 0) ||
+                (gset->tileCY.storType != PRIV_STORAGE_ARRAY);
+    /*
+     * 'pitch' is printed with "%lu"; size_t is not unsigned long on every
+     * platform, so convert it explicitly to the type the format expects.
+     */
+    const unsigned long pitchUl = (unsigned long)pitch;
+    int l0;
+    int l1;
+    bool revert = false;
+
+    Kstring kstr;
+    int rowId;
+    int colId;
+
+    sizes[0] = (unsigned int)gset->subdims[1].y;
+    sizes[1] = (unsigned int)gset->subdims[1].x;
+
+    if (iwc) {
+        const char* l0var =  boundNames[tra];
+        // in the reverted case the lines are walked from the last to the first
+        revert =  (tra && col) || (!tra && row);
+
+        if (revert) {
+            sprintf(tmp, "uC.%s += (%s-1) * %s;\n", vfield, l0var, uvars->ld);
+        }
+        else {
+            sprintf(tmp, "\n");
+        }
+        kgenAddStmt(ctx, tmp);
+
+    }
+    wvlen = getTmpVecLen(gset, flags, &vecType);
+    if (!iwc) {
+        getVectorTypeName(dtype, wvlen, NULL, &vFieldVectorized);
+        sprintf(tmp, "res.%s = c;\n", vFieldVectorized);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    if (flags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) {
+        char offStr[64];
+        char *p = offStr;
+
+        // offset skipping the part of the result that is out of the tail
+        offStr[0] = '\0';
+        if (flags & UPRES_TAIL_ROW) {
+            sprintf(offStr, " + (%u - %s) * %lu",
+                    sizes[0], uvars->nrRows, pitchUl);
+            p += strlen(offStr);
+        }
+        if (flags & UPRES_TAIL_COL) {
+            sprintf(p, " + (%u - %s)", sizes[1], uvars->nrCols);
+        }
+        if (iwc) {
+            /*
+             * With constant indexing only a blank line is emitted here.
+             * The previous code first formatted a "res = uC + offset"
+             * statement and then immediately overwrote it with "\n";
+             * that dead store has been dropped.
+             */
+            sprintf(tmp, "\n");
+        }
+        else {
+            sprintf(tmp, "res.%s = res.%s%s;\n", vfield, vfield, offStr);
+        }
+        kgenAddStmt(ctx, tmp);
+
+    }
+    if (iwc) {
+        int l0st = 1; int l0en = sizes[tra];
+        int l1st = 1; int l1en = sizes[1-tra];
+
+        const char* l0var =  boundNames[tra];
+        const char* l1var = boundNames[1-tra];
+
+        // one "if" block per line, one "case" per element within a line
+        for (l0 = l0en; l0 >= l0st; l0--) {
+
+            sprintf(tmp, "if (%s) ",l0var);
+            kgenBeginBranch(ctx, tmp);
+
+            sprintf(tmp, "switch (%s)", l1var);
+            kgenBeginBranch(ctx, tmp);
+
+            for (l1 = l1en; l1 >= l1st; l1--) {
+                sprintf(tmp, "case %d:\n", l1);
+                kgenAddStmt(ctx, tmp);
+
+                // coordinates of the element within the tile C
+                if (tra) {
+                    rowId = (row)? (l1en-l1): (l1-l1st);
+                    colId = (col)? (l0-l0st): (l0en-l0);
+                }
+                else {
+                    ///////////////////////////
+                    rowId = (row)? (l0-l0st): (l0en-l0);
+                    colId = (col)? (l1en-l1) : (l1-l1st);
+                }
+
+                if ((tra && row) || (!tra && col)) {
+                     sprintf(dst, "uC.%s[(%s+%d) %% %i]",
+                             vfield, l1var, (l1en - l1),  (int)l1en);
+                }
+                else {
+                   sprintf(dst, "uC.%s[%d]", vfield, (l1-l1st));
+                }
+
+                sprintfTileElement(&kstr, &gset->tileCY, rowId, colId, wvlen);
+
+                // with UPRES_PRIV_DEST the private tile is the destination
+                if (flags & UPRES_PRIV_DEST) {
+                    genUpdateResultSingle(ctx, kstr.buf, dst, gset, op, flags);
+                }
+                else {
+                    genUpdateResultSingle(ctx, dst, kstr.buf, gset, op, flags);
+                }
+            }
+            kgenEndBranch(ctx, NULL);
+
+            // step to the previous/next line of the matrix
+            if (revert) {
+                sprintf(tmp, "uC.%s -= %s;\n", vfield, uvars->ld);
+            }
+            else {
+                sprintf(tmp, "uC.%s += %s;\n", vfield, uvars->ld);
+            }
+
+            kgenAddStmt(ctx, tmp);
+
+            sprintf(tmp, "%s--;\n", l0var);
+            kgenAddStmt(ctx, tmp);
+            kgenEndBranch(ctx, NULL);
+        }
+
+    }
+    else {
+        sprintf(tmp, "for (i = 0; i < %s; i++)", boundNames[tra]);
+        kgenBeginBranch(ctx, tmp);
+        sprintf(tmp, "for (j = 0; j < %s; j++)", boundNames[1 - tra]);
+        kgenBeginBranch(ctx, tmp);
+        sprintf(dst, "uC.%s[i * %s + j]", vfield, uvars->ld);
+        if (cachedName) {
+            unsigned int i;
+            char tmpcachedName[80] = " = ";
+            strcat(tmpcachedName, cachedName);
+            // turn every "%u" into "%s" so that "i" and "[j]" can be put in
+            for (i = 3; i < strlen(tmpcachedName); i++) {
+                if (strncmp(tmpcachedName+i, "%u", 2) == 0) {
+                    tmpcachedName[i+1] = 's';
+                }
+            }
+            sprintf(tmp, tmpcachedName, "i", "[j]");
+            strcat(dst, tmp);
+        }
+        // result (res) can be transposed independently of the matrix C
+        // If the transposition of "C" and "result" is not consistent
+        // then change the calculation of the index for "result"
+        if (gset->tileCY.trans ^ tra) {
+            sprintf(src, "res.%s[j * %lu + i]", vfield, pitchUl);
+        }
+        else {
+            sprintf(src, "res.%s[i * %lu + j]", vfield, pitchUl);
+        }
+        if (flags & UPRES_PRIV_DEST) {
+            genUpdateResultSingle(ctx, src, dst, gset, op, flags);
+        }
+        else {
+            genUpdateResultSingle(ctx, dst, src, gset, op, flags);
+        }
+        kgenEndBranch(ctx, NULL);
+        kgenEndBranch(ctx, NULL);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate the code updating the result matrix, either as a standalone
+ * function named "updateResult[Generic][Rev]{L|G}" or, if UPRES_INLINE is
+ * set in 'flags', as statements added directly to the current position.
+ *
+ * ctx       - generator context
+ * gset      - generator settings (data type, vector lengths, subproblem
+ *             sizes, tile C description)
+ * funcID    - BLAS function ID, forwarded to the optimized generator
+ * op        - update operation; UPRES_WITH_BETA requires UPRES_SUM
+ * flags     - update flags selecting generic/optimized code, matrix order,
+ *             destination memory and so on
+ * uvarNames - variable names to use in the inlined case, where the whole
+ *             structure is copied; otherwise only its 'cachedName' field
+ *             is read, and only when the pointer is not NULL
+ *
+ * Returns -EINVAL for an unsupported op/flags combination, -EOVERFLOW if
+ * the last checked generator call failed, and 0 otherwise.
+ */
+int
+updateResultGen(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    UpdateResultOp op,
+    UpdateResultFlags flags,
+    const UpresVarNames *uvarNames)
+{
+    char tmp[1024];
+    char *p = tmp;
+    const char *typeName;
+    const char *vecType = NULL;
+    const char *vfield;
+    const char *suff1;
+    const char *suff2;
+    int ret = 0;
+    unsigned int sizes[2];
+    bool generic, tra;
+    unsigned int wvlen;     // length of vectors to copy with
+    unsigned int uplen;     // length of vectors to update result with
+    size_t pitch;
+    char LG;
+    DataType dtype = gset->kextra->dtype;
+    unsigned int vecLen;
+    bool isInlined = (flags & UPRES_INLINE);
+    UpresVarNames uvars;
+
+    vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
+                                                   gset->kextra->vecLen;
+    sizes[0] = (unsigned int)gset->subdims[1].y;
+    sizes[1] = (unsigned int)gset->subdims[1].x;
+
+    // complex data is always handled element by element
+    if (isComplexType(dtype)) {
+        vecLen = 1;
+    }
+
+    // beta scaling is supported only together with the sum operation
+    if ((flags & UPRES_WITH_BETA) && (op != UPRES_SUM)) {
+        return -EINVAL;
+    }
+
+    tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
+    generic = ((flags & UPRES_GENERIC) != 0);
+    typeName = dtypeBuiltinType(dtype);
+    vfield = dtypeUPtrField(dtype);
+    pitch = roundUp(sizes[1], vecLen);
+
+    // select write vectorization
+    wvlen = getTmpVecLen(gset, flags, &vecType);
+    uplen = (tra ^ gset->tileCY.trans
+             || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
+
+    // components of the generated function name
+    suff1 = (generic) ? "Generic" : "";
+    suff2 = (flags & UPRES_PRIV_DEST) ? "Rev" : "";
+    LG = (flags & UPRES_USE_LDS) ? 'L' : 'G';
+
+    if (!isInlined) {
+        const char *outTypeName;
+        const char *memPref = (flags & UPRES_USE_LDS) ? "__local" :
+                                                           "__global";
+
+        getResultGPRsInfo(dtype, NULL, vecLen, NULL, &outTypeName);
+
+        // define the function
+        sprintf(tmp, "void\n"
+                     "updateResult%s%s%c(\n"
+                     "    %s %s *C,\n"
+                     "    %s *c,\n"
+                     "    %s alpha,\n"
+                     "    uint startRow,\n"
+                     "    uint startCol,\n"
+                     "    uint ld",
+                     suff1, suff2, LG, memPref, typeName,
+                     outTypeName, typeName);
+
+        // optional parameters are appended to the end of the declaration
+        p += strlen(p);
+        if (flags & UPRES_WITH_BETA) {
+            sprintf(p, ",\n    %s beta", typeName);
+            p += strlen(p);
+        }
+        if (generic) {
+            sprintf(p, ",\n    uint nrRows,\n"
+                       "    uint nrCols");
+        }
+
+        // the function body refers to its own parameter names
+        uvars.result = "C";
+        uvars.ld = "ld";
+        uvars.startRow = "startRow";
+        uvars.startCol = "startCol";
+        uvars.nrRows = "nrRows";
+        uvars.nrCols = "nrCols";
+
+        strcat(p, ")\n");
+        kgenDeclareFunction(ctx, tmp);
+        kgenBeginFuncBody(ctx);
+    }
+    else {
+        // inlined code uses the names supplied by the caller
+        memcpy(&uvars, uvarNames, sizeof(uvars));
+    }
+
+    // declare local variables
+    sprintf(tmp, "%cPtr uC;\n", LG);
+    kgenAddStmt(ctx, tmp);
+    if (generic) {
+        kgenAddStmt(ctx, "int i, j;\n"
+                         "PPtr res;\n");
+    }
+    else {
+        /*
+         * temporary pointer to pass correctly over the
+         * destination array since destination rows can be
+         * not aligned on a vector bound
+         */
+        if (sizes[1 - tra] % wvlen != 0) {
+            sprintf(tmp, "%cPtr tmpC;\n", LG);
+            kgenAddStmt(ctx, tmp);
+        }
+        if (wvlen > uplen) {
+            sprintf(tmp, "%s tmp;\n", vecType);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+    if (isComplexType(dtype) && !(flags & UPRES_WITHOUT_ALPHA)) {
+        declareComplexMultParts(ctx, "alpha", typeName);
+        if (flags & UPRES_WITH_BETA) {
+            declareComplexMultParts(ctx, "beta", typeName);
+        }
+
+    }
+    kgenAddBlankLine(ctx);
+
+    // LD is scaled
+    if ( gset->flags & BGF_LD_IN_VECTORS ) {
+
+        vecLen = getVecLen(gset, 0, MATRIX_C);
+    }
+    else {
+
+        vecLen = 1;
+    }
+
+    // set uC to the start of the updated block; with a scaled LD the
+    // element offset is divided by the vector length
+    if (tra) {
+
+        if ( vecLen > 1 ) {
+
+            sprintf(tmp,
+                "uC.%s = %s + (%s * %s + %s)/%d;\n",
+                vfield,
+                uvars.result,
+                uvars.startCol,
+                uvars.ld,
+                uvars.startRow,
+                vecLen);
+        }
+        else {
+
+            sprintf(tmp,
+                "uC.%s = %s + %s * %s + %s;\n",
+                vfield,
+                uvars.result,
+                uvars.startCol,
+                uvars.ld,
+                uvars.startRow);
+        }
+    }
+    else {
+
+        if ( vecLen > 1 ) {
+
+            sprintf(tmp,
+                "uC.%s = %s + (%s * %s + %s)/%d;\n",
+                vfield,
+                uvars.result,
+                uvars.startRow,
+                uvars.ld,
+                uvars.startCol,
+                vecLen);
+
+        }
+        else {
+
+            sprintf(tmp,
+                "uC.%s = %s + %s * %s + %s;\n",
+                vfield,
+                uvars.result,
+                uvars.startRow,
+                uvars.ld,
+                uvars.startCol);
+        }
+    }
+    kgenAddStmt(ctx, tmp);
+
+    if ((sizes[1 - tra] % wvlen != 0) && !generic) {
+        kgenAddStmt(ctx, "tmpC = uC;\n");
+    }
+    ret = kgenAddBlankLine(ctx);
+
+    // the update itself
+    if (generic) {
+        updateGenericResultGen(ctx, gset, pitch, &uvars, op, flags,
+                               uvarNames ? uvarNames->cachedName : NULL);
+    }
+    else {
+        updateOptimResultGen(ctx,
+        gset,
+        funcID,
+        op,
+        flags);
+    }
+
+    // NOTE(review): in the non-inlined case this replaces the status
+    // obtained from kgenAddBlankLine() above
+    if (!isInlined) {
+        ret = kgenEndFuncBody(ctx);
+    }
+
+    return (ret) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Find out which tails (in rows and/or in columns) must be taken into
+ * account at fetching a tile of the matrix A or B.
+ *
+ * For the matrix A the 'y' size and the M tail flag are inspected, for any
+ * other role the 'x' size and the N tail flag. 'distVect' selects the
+ * matrix specific vector length instead of the common one, 'lowerTails'
+ * selects the "_LOWER" variants of the tail flags.
+ */
+TailFetch
+checkForTailFetches(
+    BlasFunctionID funcID,
+    const SubproblemDim *dim,
+    const CLBLASKernExtra *kextra,
+    MatrixRole mrole,
+    bool distVect,
+    bool lowerTails)
+{
+    const bool forA = (mrole == MATRIX_A);
+    TailFetch tails = FETCH_NO_TAILS;
+    size_t blockSize;
+    unsigned int vlen;
+    KernelExtraFlags outerTail;
+    KernelExtraFlags innerTail;
+
+    // tail flag and block size of the outer (M or N) dimension
+    if (forA) {
+        blockSize = dim->y;
+        outerTail = lowerTails ? KEXTRA_TAILS_M_LOWER : KEXTRA_TAILS_M;
+    }
+    else {
+        blockSize = dim->x;
+        outerTail = lowerTails ? KEXTRA_TAILS_N_LOWER : KEXTRA_TAILS_N;
+    }
+    // tail flag of the K dimension
+    innerTail = lowerTails ? KEXTRA_TAILS_K_LOWER : KEXTRA_TAILS_K;
+
+    if (!distVect) {
+        vlen = kextra->vecLen;
+    }
+    else if (forA) {
+        vlen = kextra->vecLenA;
+    }
+    else {
+        vlen = kextra->vecLenB;
+    }
+
+    if (!isMatrixAccessColMaj(funcID, kextra->flags, mrole)) {
+        // row major access: only the K tail matters, it runs along columns
+        if (kextra->flags & innerTail) {
+            tails |= FETCH_TAIL_COL;
+        }
+        return tails;
+    }
+
+    // column major access
+    if ((kextra->flags & outerTail) && (blockSize != vlen)) {
+        tails |= FETCH_TAIL_COL;
+    }
+    if (kextra->flags & innerTail) {
+        tails |= FETCH_TAIL_ROW;
+    }
+
+    return tails;
+}
+
+/*
+ * Check if the tail part of a tile must be zeroed: this is the case when
+ * there is a row tail at column major access or a column tail at row major
+ * access. The "lower" tail flags are used for the check.
+ */
+bool
+isNeedZeroTileTail(
+    BlasFunctionID funcID,
+    const SubproblemDim *dim,
+    const CLBLASKernExtra *kextra,
+    MatrixRole mrole,
+    bool distVect)
+{
+    const bool colMajor = isMatrixAccessColMaj(funcID, kextra->flags, mrole);
+    const TailFetch tails = checkForTailFetches(funcID, dim, kextra, mrole,
+                                                distVect, true);
+    // the only tail kind that matters for the given access order
+    const TailFetch critical = (colMajor) ? FETCH_TAIL_ROW : FETCH_TAIL_COL;
+
+    return ((tails & critical) != 0);
+}
+
+/*
+ * Check whether the coordinates for the matrices A and B must be adjusted
+ * because of lower tails and, if a context is given, generate the code
+ * moving a coordinate back when the block would run over the matrix bound.
+ *
+ * ctx    - generator context; may be NULL, in which case nothing is
+ *          generated and only the status is evaluated
+ * funcID - BLAS function ID
+ * gset   - generator settings; the second level subproblem sizes, the
+ *          variable names and the kernel flags are taken from here
+ * error  - optional location the result of the last generator call is
+ *          stored to (0 if there were no failures)
+ *
+ * Returns a combination of TAIL_A_RAISED and TAIL_B_RAISED showing which
+ * coordinates are adjusted. The matrix B is not considered anymore if
+ * generating for the matrix A has failed.
+ */
+TailStatus
+checkGenAdjustTailCoords(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    int *error)
+{
+    char tmp[1024];
+    const SubproblemDim *dim = &gset->subdims[1];
+    const KernelVarNames *varNames = &gset->varNames;
+    KernelExtraFlags kflags = gset->kextra->flags;
+    /*
+     * The sizes are printed with "%lu"; convert them explicitly since their
+     * type does not have to be unsigned long on every platform.
+     */
+    const unsigned long sizeY = (unsigned long)dim->y;
+    const unsigned long sizeX = (unsigned long)dim->x;
+    TailStatus status = 0;
+    int err = 0;
+    int n = 0;
+
+    if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_A) &&
+        (kflags & KEXTRA_TAILS_M_LOWER)) {
+
+        status |= TAIL_A_RAISED;
+        sprintf(tmp, "if (%s + %lu > %s) {\n"
+                     "    %s -= %lu - %s %% %lu;\n"
+                     "}\n",
+                varNames->coordA, sizeY, varNames->sizeM,
+                varNames->coordA, sizeY, varNames->sizeM,
+                sizeY);
+        if (ctx != NULL) {
+            err = kgenAddStmt(ctx, tmp);
+            n++;
+        }
+    }
+
+    if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_B) &&
+        (kflags & KEXTRA_TAILS_N_LOWER) && !err) {
+
+        status |= TAIL_B_RAISED;
+        sprintf(tmp, "if (%s + %lu > %s) {\n"
+                     "    %s -= %lu - %s %% %lu;\n"
+                     "}\n",
+                varNames->coordB, sizeX, varNames->sizeN,
+                varNames->coordB, sizeX, varNames->sizeN,
+                sizeX);
+        if (ctx != NULL) {
+            err = kgenAddStmt(ctx, tmp);
+            n++;
+        }
+    }
+
+    // separate the generated statements from the following code
+    if (n && !err) {
+        err = kgenAddBlankLine(ctx);
+    }
+
+    if (error != NULL) {
+        *error = err;
+    }
+
+    return status;
+}
+
+/*
+ * Generate the code restoring the coordinates previously adjusted for
+ * lower tails (see checkGenAdjustTailCoords()).
+ *
+ * ctx    - generator context
+ * gset   - generator settings; the second level subproblem sizes and the
+ *          variable names are taken from here
+ * status - shows which coordinates have been adjusted
+ *
+ * Returns 0 on success and -EOVERFLOW if any of the statements could not
+ * be added. Generation stops at the first failure, so an error reported by
+ * an earlier call is never hidden by a later successful one.
+ */
+int
+checkGenRestoreTailCoords(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    TailStatus status)
+{
+    char tmp[1024];
+    const SubproblemDim *dim = &gset->subdims[1];
+    const KernelVarNames *varNames = &gset->varNames;
+    /*
+     * The sizes are printed with "%lu"; convert them explicitly since their
+     * type does not have to be unsigned long on every platform.
+     */
+    const unsigned long sizeY = (unsigned long)dim->y;
+    const unsigned long sizeX = (unsigned long)dim->x;
+    int ret = 0;
+    int n = 0;
+
+    if (status & TAIL_A_RAISED) {
+        sprintf(tmp, "if ((%s + %lu == %s) && (%s %% %lu)) {\n"
+                     "    %s += %lu - %s %% %lu;\n"
+                     "}\n",
+                varNames->coordA, sizeY, varNames->sizeM,
+                varNames->sizeM, sizeY, varNames->coordA,
+                sizeY, varNames->sizeM, sizeY);
+        ret = kgenAddStmt(ctx, tmp);
+        n++;
+    }
+
+    if ((status & TAIL_B_RAISED) && !ret) {
+
+        sprintf(tmp, "if ((%s + %lu == %s) && (%s %% %lu)) {\n"
+                     "    %s += %lu - %s %% %lu;\n"
+                     "}\n",
+                varNames->coordB, sizeX, varNames->sizeN,
+                varNames->sizeN, sizeX, varNames->coordB,
+                sizeX, varNames->sizeN, sizeX);
+        // the result must be kept, otherwise a failure here goes unnoticed
+        ret = kgenAddStmt(ctx, tmp);
+        n++;
+    }
+
+    // add the separating line only if everything before has succeeded
+    if (n && !ret) {
+        ret = kgenAddBlankLine(ctx);
+    }
+
+    return (ret) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Convert a tail status to the respective flags of the result update:
+ * a raised A coordinate maps to UPRES_TAIL_ROW and a raised B coordinate
+ * maps to UPRES_TAIL_COL.
+ */
+UpdateResultFlags
+tailStatusToUpresFlags(TailStatus status)
+{
+    const UpdateResultFlags rowPart =
+        (status & TAIL_A_RAISED) ? UPRES_TAIL_ROW : 0;
+    const UpdateResultFlags colPart =
+        (status & TAIL_B_RAISED) ? UPRES_TAIL_COL : 0;
+
+    return rowPart | colPart;
+}
+
+/*
+ * Declare the two auxiliary variables "<baseName>R" and "<baseName>I" of
+ * the type 'typeName', initialized from the components of the variable
+ * 'baseName': the first one from ".x", the second one from the pair
+ * (-".y", ".y").
+ *
+ * Returns 0 on success and -EOVERFLOW if the statement could not be added.
+ */
+int
+declareComplexMultParts(
+    struct KgenContext *ctx,
+    const char *baseName,
+    const char *typeName)
+{
+    char decl[1024];
+    int len;
+
+    // part built of the first component
+    len = sprintf(decl, "%s %sR = (%s)(%s.x);\n",
+                  typeName, baseName, typeName, baseName);
+    // part built of the second component
+    sprintf(decl + len, "%s %sI = (%s)(-%s.y, %s.y);\n",
+            typeName, baseName, typeName, baseName, baseName);
+
+    return kgenAddStmt(ctx, decl) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Build in 'expr' a textual expression evaluating
+ *
+ *      first * second / scale + third
+ *
+ * in as simple a form as possible: a plain number when all the operands
+ * are numbers, shifts when the numeric multiplier and the scale are both
+ * powers of 2, and mad24() otherwise.
+ *
+ * expr   - resulting expression
+ * first  - first multiplier; an empty string counts as numeric zero
+ * second - second multiplier; an empty string counts as numeric zero
+ * scale  - divider for the second multiplier; zero is replaced with 1
+ * third  - addend; NULL or an empty string is replaced with "0"
+ *
+ * An operand counts as numeric if stringToInt() returns zero for it.
+ */
+void
+sprintfFastScalarMad(
+    Kstring *expr,
+    const Kstring *first,
+    const Kstring *second,
+    unsigned int scale,
+    const Kstring *third)
+{
+    unsigned int u1 = 0, u2 = 0, u3 = 0;
+    bool isNum1, isNum2, isNum3;
+    int shift;
+    bool done = false;
+    const char *thirdStr;
+    const char *suff3;
+
+    // clear up what are these arguments
+    if (isKstringEmpty(first)) {
+        isNum1 = true;
+    }
+    else {
+        isNum1 = !stringToInt(first->buf, &u1);
+    }
+
+    if (isKstringEmpty(second)) {
+        isNum2 = true;
+    }
+    else {
+        isNum2 = !stringToInt(second->buf, &u2);
+    }
+
+    if (!scale) {
+        scale = 1;
+    }
+
+    if ((third == NULL) || isKstringEmpty(third)) {
+        thirdStr = "0";
+        isNum3 = true;
+    }
+    else {
+        thirdStr = third->buf;
+        isNum3 = !stringToInt(thirdStr, &u3);
+    }
+    // numeric third operand gets the unsigned suffix within mad24()
+    suff3 = (isNum3) ? "u" : "";
+
+    // singular case at which only the third component can contribute
+    if ( (isNum1 && (u1 == 0)) ||
+         (isNum2 && (u2 /scale == 0))) {
+
+        kstrcpy(expr, thirdStr);
+        return;
+    }
+
+    if (isNum1 && isNum2) {
+        // the product is known at generation time
+        if (isNum3) {
+            ksprintf(expr, "%u", u1 * u2 / scale + u3);
+        }
+        else {
+            ksprintf(expr, "%u + %s", u1 * u2 / scale, thirdStr);
+        }
+        done = true;
+    }
+    else if (isNum1) {
+        /*
+         * If the third argument is not used, then try to build the expression
+         * using only shifts if 'scale' and the 'first argument' are both
+         * powers of 2. Otherwise use mad24.
+         */
+        if (isRoundedPow2(u1) && isRoundedPow2(scale)) {
+            // positive shift means dividing, negative one means multiplying
+            shift = findHighestSetBit(scale) - findHighestSetBit(u1);
+            if (isNum3 && (u3 == 0)) {
+                if (shift < 0) {
+                    ksprintf(expr, "(%s << %d)", second->buf, -shift);
+                }
+                else if (shift > 0) {
+                    ksprintf(expr, "(%s >> %d)", second->buf, shift);
+                }
+                else {
+                    kstrcpy(expr, second->buf);
+                }
+            }
+            else if (shift > 0) {
+                ksprintf(expr, "(%s >> %d) + %s",
+                         second->buf, shift, thirdStr);
+            }
+            else if (shift == 0) {
+                ksprintf(expr, "%s + %s", second->buf, thirdStr);
+            }
+            else {
+                ksprintf(expr, "mad24(%uu, %s, %s%s)",
+                         1u << -shift, second->buf, thirdStr, suff3);
+            }
+            done = true;
+        }
+    }
+
+    if (!done) {
+        /*
+         * Append unsigned suffixes to avoid cases at which one
+         * operand is signed and the other is unsigned. Typically,
+         * OpenCL compilers are strict and reject such expressions.
+         */
+        if (isNum2) {
+            if (u2 / scale == 1) {
+                // multiplying by 1, the first operand is used as is
+                if (isNum3 && (u3 == 0)) {
+                    kstrcpy(expr, first->buf);
+                }
+                else {
+                    ksprintf(expr, "%s + %s", first->buf, thirdStr);
+                }
+            }
+            else {
+                ksprintf(expr, "mad24(%s, %uu, %s%s)",
+                         first->buf, u2 / scale, thirdStr, suff3);
+            }
+        }
+        else {
+            const char *suff1 = (isNum1) ? "u" : "";
+            Kstring tmp;
+            const char *p = NULL;
+
+            // the second operand is not a number, scale it in the expression
+            if (scale == 1) {
+                p = second->buf;
+            }
+            else {
+                p = tmp.buf;
+                if (isRoundedPow2(scale)) {
+                    shift = findHighestSetBit(scale);
+                    ksprintf(&tmp, "(%s >> %d)", second->buf, shift);
+                }
+                else {
+                    // NOTE(review): 'scale' is unsigned int but is
+                    // formatted with %d - confirm this is intended
+                    ksprintf(&tmp, "%s / %d", second->buf, scale);
+                }
+            }
+
+            ksprintf(expr, "mad24(%s%s, %s, %s%s)",
+                     first->buf, suff1, p, thirdStr, suff3);
+        }
+    }
+}
diff --git a/src/library/blas/gens/blas_kgen.h b/src/library/blas/gens/blas_kgen.h
new file mode 100644
index 0000000..694bafd
--- /dev/null
+++ b/src/library/blas/gens/blas_kgen.h
@@ -0,0 +1,895 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Something specific for BLAS generators
+ *
+ * NOTE:
+ *      1) All the blas kernel generators should
+ *         perceive fields of the SubproblemDim
+ *         structure as following:
+ *         'y' - rows of matrix A, i. e. M dimension
+ *               of matrix C
+ *         'x' - columns of matrix B and C
+ *         'bwidth' - block width in K dimension
+ *
+ *      2) When generating copying functions and their calls, one should
+ *         keep in mind that all the matrix blocks are copied to
+ *         the local memory such that sequentially accessed elements
+ *         are located in memory sequentially. In this context,
+ *         transposing is perceived as transposing while copying
+ *         to/from the local memory, not as the matrix storage layout in
+ *         the array passed to the kernel.
+ */
+
+#ifndef BLAS_KGEN_H_
+#define BLAS_KGEN_H_
+
+#include <clBLAS.h>
+
+#include <cltypes.h>
+#include <kerngen.h>
+#include <mempat.h>
+#include <dblock_kgen.h>
+
+#include <blas_funcs.h>
+#include <matrix_props.h>
+
+#include "tile.h"
+#include "fetch.h"
+
+#define BLAS_KGEN_FORMAT 1
+
+#define genInternalLoopEnd(ctx) kgenEndBranch(ctx, NULL)
+
+enum {
+    MAX_OPENCL_VECTOR_LENGTH = 16
+};
+
+typedef enum TailFetch {
+    FETCH_NO_TAILS = 0,
+    FETCH_TAIL_ROW = 0x01,
+    FETCH_TAIL_COL = 0x02
+} TailFetch;
+
+/**
+ * @internal
+ * @brief Blas generator flags
+ * @ingroup GEN_SETTINGS
+ */
+typedef enum BlasGenFlags {
+    BGF_EXPLICIT_INLINE = 0x01,
+    BGF_DISTINCT_VECLEN = 0x02,
+    // TODO: replace with a flag with inverse semantics
+    BGF_WHOLE_A = 0x04,
+    /** Leading dimensions are in vectors rather than in elements */
+    BGF_LD_IN_VECTORS = 0x08,
+    /**
+     * Objects in the global memory are accessed through the unified pointers.
+     * This feature is deprecated and should not be used in new generators.
+     * It is left for backward compatibility
+     */
+    BGF_UPTRS = 0x10
+} BlasGenFlags;
+
+/**
+ * @internal
+ * @brief Flags showing how problem tails are handled
+ * @ingroup TAILS_HANDLING
+ */
+typedef enum TailStatus {
+    /** Tail of the matrix A is raised */
+    TAIL_A_RAISED = 0x01,
+    /** Tail of the matrix B is raised */
+    TAIL_B_RAISED = 0x02
+} TailStatus;
+
+/**
+ * @internal
+ * @brief Tiles multiplier flags
+ * @ingroup BLAS_MAJOR_SUBGENS
+ */
+typedef enum TileMulFlags {
+    TILEMUL_NO_FLAGS = 0,              /**< No flags */
+    TILEMUL_TRA = 0x01,                /**< Transposed matrix A */
+    TILEMUL_TRB = 0x02,                /**< Transposed matrix B */
+    TILEMUL_CONJA = 0x04,              /**< Conjugated elements of A */
+    TILEMUL_CONJB = 0x08,              /**< Conjugated elements of B */
+    TILEMUL_C_COLUMN_MAJOR = 0x10,     /**< Column major block for matrix C */
+    TILEMUL_NOT_FETCH_B = 0x20,        /**< Do not fetch matrix B block */
+    TILEMUL_EXTERN_RDECL = 0x40,       /**< External register tiles declaration,
+                                          the generator must not declare them
+                                          itself */
+
+    /**
+     * Deprecated. Use the respective mode being a part of FetchAddr mode.
+     * It is left just for backward compatibility so as not to break working
+     * code and will be removed soon
+     */
+    TILEMUL_WRAP_AROUND_TAIL = 0x80,   /**< Sizes used for column skew are
+                                            rounded to next vecLen bound */
+    /** Use global cyclic along subproblem A coordinate.
+     * Deprecated. Don't use it */
+    TILEMUL_GLOBAL_CYCLIC_A = 0x100,
+    /** Use global cyclic along subproblem B coordinate.
+     * Deprecated. Don't use it */
+    TILEMUL_GLOBAL_CYCLIC_B = 0x200,
+    /* Deprecated. Don't use it */
+    TILEMUL_GLOBAL_CYCLIC_K = 0x400,   /**< Use global cyclic along K */
+    /** Use skew along subproblem A coordinate */
+    TILEMUL_SKEW_A = 0x800,
+    /** Use skew along subproblem B coordinate. Deprecated */
+    TILEMUL_SKEW_B = 0x1000,
+    /* Deprecated */
+    TILEMUL_SKEW_K = 0x2000,           /**< Use skew along K */
+    /** Use size of whole matrix for cyclic addressing. Deprecated */
+    TILEMUL_GLOBAL_CYCLIC = TILEMUL_GLOBAL_CYCLIC_A |
+                            TILEMUL_GLOBAL_CYCLIC_B |
+                            TILEMUL_GLOBAL_CYCLIC_K,
+    // Deprecated
+    TILEMUL_SKEWS = TILEMUL_SKEW_A | TILEMUL_SKEW_B | TILEMUL_SKEW_K,
+    /** Optimize coordinates calculations by storing coordinates values */
+    // Deprecated
+    TILEMUL_OPTIMIZE_COORD_CALC = 0x4000,
+    /** Use bwidth0 stride */
+    TILEMUL_BW_STRIDE = 0x8000,
+    /** Optimize coordinates calculations by using vectors
+     *  and pointer increments */
+    // Deprecated
+    TILEMUL_OPTIMIZE_VEC_COORDS = 0x10000,
+    /** Do not increment K */
+    TILEMUL_NOT_INC_K = 0x20000,
+    /**
+     * Use variants with explicit vectorization. Useful on platforms with
+     * true SIMD.
+     */
+    TILEMUL_FORCE_VECTORIZATION = 0x40000
+} TileMulFlags;
+
+
+/**
+ * @internal
+ * @brief Tiles multiplier core
+ * @ingroup BLAS_MAJOR_SUBGENS
+ */
+typedef enum TileMulCore {
+    /** Use multiplication and addition operations */
+    TILEMUL_MULADD,
+    /** Use the 'dot' function where possible */
+    TILEMUL_DOT,
+    /** Use the 'mad' function */
+    TILEMUL_MAD
+} TileMulCore;
+
+/**
+ * @internal
+ * @brief Update result operations
+ * @ingroup BLAS_MAJOR_SUBGENS
+ */
+typedef enum UpdateResultOp {
+    /** Just set the values stored in a target buffer */
+    UPRES_SET,
+    /** Sum values stored in a target buffer with the temporary result */
+    UPRES_SUM
+} UpdateResultOp;
+
+/**
+ * @internal
+ * @brief Update result generator flags
+ * @ingroup BLAS_MAJOR_SUBGENS
+ */
+typedef enum UpdateResultFlags {
+    /** Resulting matrix is stored in the column major form */
+    UPRES_COLUMN_MAJOR = 0x01,
+    /** Generic version, non optimal sizes */
+    UPRES_GENERIC = 0x02,
+    /** Multiply result by beta */
+    UPRES_WITH_BETA = 0x04,
+    /** Do not multiply by the alpha scalar */
+    UPRES_WITHOUT_ALPHA = 0x08,
+    /**
+     * Destination is private memory;
+     * if not set destination is in the global one
+     */
+    UPRES_PRIV_DEST = 0x10,
+    /** Use the local memory instead of the global memory */
+    UPRES_USE_LDS = 0x20,
+    /** Generate the inline version */
+    UPRES_INLINE = 0x40,
+    /** Disable vectorization at memory access */
+    UPRES_NO_VECTORIZATION = 0x80,
+    /** For the generic version useful data reside at the tile rows' tail */
+    UPRES_TAIL_ROW = 0x100,
+    /** For the generic version useful data reside at the tile columns' tail */
+    UPRES_TAIL_COL = 0x200,
+    /** Generate condition whether coordinates don't exceed problem bounds */
+    UPRES_EXCEED_PROBLEM_CONDITION = 0x400,
+    /** Undocumented in the original source. NOTE(review): confirm meaning */
+    UPRES_INDEXING_WITH_CONSTANTS = 0x800,
+    /** Write result to C instead of B for functions with triangular matrix */
+    UPRES_TRIANG_WRITE_C = 0x1000
+} UpdateResultFlags;
+
+typedef struct PrivateArea {
+    const char *typeName;
+    unsigned int vecLen;
+    unsigned int size;
+} PrivateArea;
+
+/**
+ * @internal
+ * @defgroup GEN_SETTINGS Generator settings
+ * @ingroup BLAS_GENERATORS
+ */
+/*@{*/
+
+/**
+ * @internal
+ * @brief Kernel variable and argument names
+ */
+typedef struct KernelVarNames {
+    const char *A;          /**< Matrix A variable name */
+    const char *B;          /**< Matrix B variable name */
+    const char *C;          /**< Presumably matrix C variable name; confirm */
+    const char *LDS;		/**< LDS pointer name */
+    const char *coordA;     /**< Variable for subproblem A coordinate */
+    const char *coordB;     /**< Variable for subproblem B coordinate */
+    const char *k;          /**< Variable for incrementable K offset value */
+    const char *skewA;      /**< Variable for skews along A */
+    const char *skewB;      /**< Variable for skews along B */
+    const char *skewK;      /**< Variable for skews along K */
+    const char *sizeM;      /**< Matrix A size M */
+    const char *sizeN;      /**< Matrix B size N */
+    const char *sizeK;      /**< Matrices size K */
+    const char *lda;        /**< Leading dimension of matrix A */
+    const char *ldb;        /**< Leading dimension of matrix B */
+    const char *ldc;        /**< Leading dimension of matrix C, in vectors */
+    const char *vectCoordA; /**< Vector containing indexes of tile a elements
+                                 in matrix A */
+    const char *vectCoordB; /**< Vector containing indexes of tile b elements
+                                 in matrix B */
+    const char *startM;
+    const char *startN;
+    const char *startK;
+    const char *alpha;
+    const char *beta;
+} KernelVarNames;
+
+/**
+ * @internal
+ * @brief Blas generator settings
+ *
+ * This structure is designed to be used with most of subgenerators
+ * and generator helpers. It is assumed to be initialized once at the
+ * generator beginning and modified as few as possible over the rest of
+ * the process.
+ */
+typedef struct BlasGenSettings {
+    /**
+     * Subproblem dimensions:
+     *
+     * work group dimensions are at index 0
+     * work item dimensions are at index 1
+     */
+    SubproblemDim subdims[2];
+    const PGranularity *pgran;      /**< Data parallelism granularity */
+    const CLBLASKernExtra *kextra;  /**< Kernel extra */
+    BlasGenFlags flags;             /**< Global generator flags */
+    KernelVarNames varNames;        /**< Kernel variables and argument names */
+    Tile tileA;
+    Tile tileBX;
+    Tile tileCY;
+} BlasGenSettings;
+
+/*@}*/
+
+/**
+ * @internal
+ * @brief Variable names for the inline version of a function updating result
+ * @ingroup BLAS_MAJOR_SUBGENS
+ */
+typedef struct UpresVarNames {
+    const char *result;     /**< Name of an output matrix */
+    /** Leading dimension of a matrix stored in the global memory */
+    const char *ld;
+    const char *startRow;   /**< Start row to update from */
+    const char *startCol;   /**< Start column to update from */
+    const char *nrRows;     /**< Number of rows */
+    const char *nrCols;     /**< Number of columns */
+    const char *cachedName; /**< Name of LDS cached values */
+} UpresVarNames;
+
+/**
+ * @internal
+ * @brief Options for matrix tiles multiplication generator
+ * @ingroup BLAS_MAJOR_SUBGENS
+ */
+typedef struct TileMulOpts {
+    CLMemType memA;             /**< type of memory matrix A is located on */
+    CLMemType memB;             /**< type of memory matrix B is located on */
+    TileMulFlags flags;         /**< Flags on objects and computing specifics */
+    TileMulCore core;           /**< Multiply and add core */
+    int (*postFetch)(
+        struct KgenContext *ctx,
+        MatrixRole mrole,
+        void *arg);             /**< Tile post fetch callback */
+    void *postFetchPriv;        /**< Postfetch callback's private data */
+    struct FetchContext *fctx;
+} TileMulOpts;
+
+typedef struct ZeroFuncs {
+    char names[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN];
+} ZeroFuncs;
+
+/**
+ * @internal
+ * @brief Private data for fetch postprocessing callback
+ * @ingroup TAILS_HANDLING
+ */
+typedef struct TilePostFetchPrivate {
+    BlasFunctionID funcID;
+    const BlasGenSettings *gset;
+    const char *regName;
+    int fetchNumA;
+    int wholeA;
+} TilePostFetchPrivate;
+
+void
+getPrivateAreaInfo(
+    const BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    MatrixRole mrole,
+    PrivateArea *area);
+
+void
+declarePrivateArea(
+    struct KgenContext *ctx,
+    const PrivateArea *area,
+    const char *baseName,
+    PrivateStorageType storType);
+
+/*
+ * Declare separately the real and imaginary part of
+ * a complex multiplier.
+ *
+ * @ctx: generator context
+ * @baseName: variable's base name matching to an existing variable
+ *            with non-separated parts
+ * @typeName: variable type name
+ *
+ * Naming rule
+ *      real part:      <baseName>R
+ *      imaginary part: <baseName>I
+ *
+ * On success returns 0, and -EOVERFLOW on source buffer
+ * overflow
+ */
+int
+declareComplexMultParts(
+    struct KgenContext *ctx,
+    const char *baseName,
+    const char *typeName);
+
+/**
+ * @internal
+ * @defgroup CHECK_DECOMP_CACL_GRAN  Checking decomposition and calculate
+ *                                   parallelism granularity
+ * @ingroup BLAS_GENERATORS
+ */
+
+/*@{*/
+
+/**
+ * @brief Sanity check for decomposition
+ *
+ * @param[in] subdims           Subproblem dimensions. 2 levels.
+ * @param[in] minSize           Minimum size for any of the dimension
+ *                              components
+ * @param[in] maxSize           Maximum size which can't be exceeded by
+ *                              any of the dimension components at the tile
+ *                              layer
+ * @param[in] maxRegs           Maximum registers it's allowed to use
+ * @param[in] dtype             BLAS data type
+ * @param[in] wholeA            Is matrix A stored in registers entirely or
+ *                              partially
+ *
+ * The function rejects only decompositions that are completely invalid or lead
+ * to consumption of too many registers or just have component values at the
+ * tile layer that are out of the range [\b minSize, \b maxSize].
+ * Completely invalid decompositions are those which don't allow dividing the
+ * problem integrally among work items, e. g. zeroed components are wrong,
+ * the step components (x, y, bwidth) of the 0-th level not integrally
+ * divisible by respective size components (itemX, itemY, bwidth) of the 1-st
+ * level are wrong as well. The decomposition is also wrong if the size
+ * components are not integrally divisible by the step components and not equal
+ * to #SUBDIM_UNUSED.
+ *
+ * @return true if the decomposition is valid, or false otherwise
+ */
+bool
+decompSanityCheck(
+    const SubproblemDim *subdims,
+    unsigned int minSize,
+    unsigned int maxSize,
+    unsigned int maxRegs,
+    DataType dtype,
+    bool wholeA);
+
+/**
+ * @brief Calculate granularity in case when a work item is responsible
+ *        for its own part of solution not overlapping with those of other
+ *        items
+ *
+ * @param[out] pgran            Location to store calculated granularity
+ * @param[in] subdims           Subproblem dimensions
+ * @param[in] xdim              Dimension in the OpenCL work space X component
+ *                              of decomposition is mapped on
+ * @param[in] level             Function BLAS level. Reserved for future use.
+ *
+ * If value of \b xdim is -1, then the function assumes that OpenCL work
+ * space is single dimensional, and puts the product of granularity against
+ * X and Y component to 0-th element of \b wgSize field. If its value is
+ * 0 or 1, the function assumes that OpenCL work space is 2D and puts
+ * granularity against X component to \b xdim element of \b wgSize field
+ * of the granularity descriptor. Granularity against Y component is put to
+ * 1 - \b xdim element. Other values are invalid and force an abort in debug
+ * build. The function initializes the \b wgDim field properly.
+ *
+ * NOTE: Currently, this function is supported only for level 3 and
+ *       must not be called for level 2
+ */
+void
+calcPgranDedicated(
+    PGranularity *pgran,
+    const SubproblemDim *subdims,
+    int xdim,
+    int level);
+
+/**
+ * @brief Calculate granularity in case when several items evaluate the same
+ *        part of solution together
+ *
+ * @param[out] pgran            Location to store calculated granularity
+ * @param[in] subdims           Subproblem dimensions
+ * @param[in] xdim              Dimension in the OpenCL work space X component
+ *                              of decomposition is mapped on
+ * @param[in] ydim              Dimension in the OpenCL work space Y component
+ *                              of decomposition is mapped on
+ * @param[in] level             Function BLAS level. Reserved for future use
+ *
+ * If \b xdim and \b ydim values are equal, then the function puts the product
+ * of granularity against X and Y component to \b xdim element of \b wgSize
+ * field. If not, it puts separated granularity for X and Y in \b xdim and
+ * \b ydim element respectively. Both the values must be non negative and less
+ * than 3 (since OpenCL workspace cannot have more than 3 dimensions).
+ * If some of these parameters is zero, then the other one must be zero as well.
+ * If some of these parameters is 2, then the other one must be 1. These
+ * restrictions are caused by needs in reflecting \b bwidth in granularity
+ * in case of multidimensional decomposition. For 2D and 3D decompositions
+ * granularity for bwidth is calculated as well, and it is always mapped
+ * onto 0-th workspace dimension. If some of these parameters are wrong,
+ * it forces abort in debug build. The function sets the \b wgDim field
+ * to maximum of xdim and ydim plus 1.
+ *
+ * NOTE: Currently, this function is supported only for level 3 and
+ *       must not be called for level 2
+ */
+void
+calcPgranCooperative(
+    PGranularity *pgran,
+    const SubproblemDim *subdims,
+    int xdim,
+    int ydim,
+    int level);
+
+/*@}*/
+
+/**
+ * @internal
+ * @defgroup COMMON_MATH_OPERATIONS Constructing useful math expression
+ * @ingroup BLAS_GENERATORS
+ */
+/*@{*/
+
+/**
+ * @brief Sprintf a complex MAD operation
+ *
+ * Operations:
+ *     - \f$ dst \leftarrow a * b + c \f$
+ *     - \f$ dst \leftarrow conj(a) * b + c \f$
+ *     - \f$ dst \leftarrow a * conj(b) + c \f$
+ *     - \f$ dst \leftarrow conj(a) * conj(b) + c \f$
+ *
+ *  @param[out] expr            String object to hold the target expression
+ *  @param[in] dst              Destination argument
+ *  @param[in] a                The first multiplier
+ *  @param[in] b                The second multiplier
+ *  @param[in] c                Added argument
+ *  @param[in] isDouble         If set, the arguments have double precision
+ *  @param[in] conjA            If set, the argument A should be conjugated
+ *  @param[in] conjB            If set, the argument B should be conjugated
+ *  @param[in] core             Multiplying core
+ *
+ *  The \b c argument can be NULL. In this case it is ignored, and the function
+ *  produces pure multiplication
+ */
+void
+sprintfComplexMulUpdate(
+    Kstring *expr,
+    const Kstring *dst,
+    const Kstring *a,
+    const Kstring *b,
+    const Kstring *c,
+    bool isDouble,
+    bool conjA,
+    bool conjB,
+    TileMulCore core);
+
+/**
+ * @brief Sprintf expression of fast scalar mad
+ *
+ * @param[out] expr         Output expression
+ * @param[in]  first        First multiplier
+ * @param[in]  second       Second multiplier
+ * @param[in]  scale        Scale of the second argument, i. e. its divisor.
+ *                          Ignored if zero.
+ * @param[in]  third        Added argument. Ignored if NULL.
+ *
+ * It can use mad24. So, expected result should not exceed 2^24
+ */
+void
+sprintfFastScalarMad(
+    Kstring *expr,
+    const Kstring *first,
+    const Kstring *second,
+    unsigned int scale,
+    const Kstring *third);
+
+/*@}*/
+
+/**
+ * @internal
+ * @defgroup BLAS_GEN_MISC_FUNCTIONS Miscellaneous functions
+ * @ingroup BLAS_GENERATORS
+ */
+
+/*@{*/
+
+/**
+ * @brief Default function prefix for the data type
+ *
+ * @param[in] dtype     One of the data types supported by the library
+ */
+char
+dtypeToBlasPrefix(DataType dtype);
+
+/**
+ * @brief Convert kernel extra flags to tilemul flags
+ *
+ * @param[in] funcID        BLAS function ID
+ * @param[in] kflags        Kernel flags
+ */
+TileMulFlags
+kextraToTilemulFlags(BlasFunctionID funcID, KernelExtraFlags kflags);
+
+/**
+ * @brief Get vector length elements should be fetched from (stored to)
+ *        the global memory
+ *
+ * @param[in] gset          Generator settings
+ * @param[in] funcID        BLAS function ID (deprecated)
+ * @param[in] mrole         Role of the matrix to get vectorization for
+ */
+unsigned int
+getVecLen(const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole);
+
+/**
+ * @brief Sprintf chunk (set of components) of an OpenCL vector type
+ *
+ * @param[out] chunk        Buffer to sprintf to
+ * @param[in] vecLen        Entire vector length
+ * @param[in] clen          Length of the chunk
+ * @param[in] vecOff        Starting component offset
+ */
+void
+sprintfVecChunk(
+    char *chunk,
+    unsigned int vecLen,
+    unsigned int clen,
+    unsigned int vecOff);
+
+/**
+ * @brief Generate code containing scaling of leading dimensions on
+ *        vector size
+ *
+ * @param[out] ctx          Generator context
+ * @param[in] gset          Generator settings
+ *
+ * The function first checks whether the scaling is actually needed
+ * (presumably not if vector size is 1). If some of the kernel variables for
+ * 'lda', 'ldb' or 'ldc' is NULL, code generation is skipped for the dimension.
+ * Calling this function has no effect if the @ref BGF_LD_IN_VECTORS generator
+ * flag is not set. If some of the leading dimensions are not unique, only
+ * one of the instances is scaled. Uniqueness of the dimensions is detected
+ * by values of the respective pointers being a part of @ref KernelVarNames.
+ * E.g. if 'lda' and 'ldb' pointers are the same, only 'lda' is scaled.
+ */
+void
+genScaleLeadingDimensions(struct KgenContext *ctx, const BlasGenSettings *gset);
+
+/*@}*/
+
+/**
+ * @internal
+ * @brief Generate default post processing logic after tile fetch
+ *
+ * @param[out] ctx      Generator context
+ * @param[in] mrole     Matrix role
+ * @param[out] priv     Handler's private data
+ *
+ * @ingroup TAILS_HANDLING
+ */
+int
+defaultTilePostFetch(
+    struct KgenContext *ctx,
+    MatrixRole mrole,
+    void *priv);
+
+void
+getResultGPRsInfo(
+    DataType dtype,
+    const SubproblemDim *dims,
+    unsigned int vecLen,
+    unsigned int *nrRegs,
+    const char **typeName);
+
+/**
+ * @internal
+ * @defgroup BLAS_MAJOR_SUBGENS Major subgenerators
+ * @ingroup BLAS_GENERATORS
+ */
+/*@{*/
+
+/**
+ * @internal
+ * @brief Tiles fetching and multiplication inlined code generator
+ *
+ * @param[out] ctx          Generator context
+ * @param[in] gset          Generator settings
+ * @param[in] mulOpts       TileMul-specific generator settings
+ *
+ * This function generates code which fetches tiles a and b from global or local
+ * memory into private memory, multiplies them storing result into tile c in
+ * private memory and increments coordinate k. Caller is responsible for loop
+ * along K.\n
+ * All combinations of tiles a and b orientations are supported. Generated
+ * code fetches tiles by vectors which size can be different for tiles a and b.
+ * Complex types and conjugated tiles are supported. Global cycling is supported
+ * for global memory fetching - this means that if tile overlaps matrix
+ * the tail of tile will be fetched from the beginning instead of accessing
+ * memory outside the matrix.\n
+ * Second level of subdimensions is used for tiles sizes.\n
+ * Generated code will fetch tiles a, b, multiply them and add result to tile c
+ * in private memory, then increment k. By default, k is incremented by
+ * second level bwidth but it is incremented by first level bwidth if
+ * @ref TILEMUL_BW_STRIDE flag is set. It is used if whole work group goes
+ * along K loop.\n
+ * Each tile can be fetched from global memory or from local memory.
+ * If tile is fetched from local memory then leading dimensions for local
+ * memory area are taken from first level subdimensions.\n
+ * Post-fetch callback generator function can be called after fetching tiles
+ * for zeroing tails or setting diagonal elements to one. This function is
+ * provided by caller.\n
+ * If second level bwidth is not equal to first level bwidth, and
+ * @ref TILEMUL_BW_STRIDE flag is not set then TileMul generates
+ * loop from zero to first level bwidth with second level bwidth step. The
+ * most common case is second level bwidth equal to first level bwidth where
+ * single iteration of multiplication is generated.\n
+ *
+ * If the caller aims for efficient fetching from the global memory and the
+ * tilemul logic is generated within a loop, prepareFetchCycle() should be
+ * called before generation of the loop.
+ *
+ * @return 0 on success
+ * @return -EOVERFLOW on source buffer overflowing
+ * @return -EINVAL if input arguments are invalid
+ */
+int
+tileMulGen(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const TileMulOpts *mulOpts);
+
+/**
+ * @internal
+ * @brief Tiles pure multiplication code generator
+ *
+ * @param[out] ctx          Generator context
+ * @param[in] gset          Generator settings
+ * @param[in] mulOpts       TileMul-specific generator settings
+ *
+ * This function multiplies two tiles, a and b, storing result in tile c. No
+ * additional operations are made. It just performs tiles multiplication without
+ * fetching, post-fetch processing and incrementing coordinates which can be
+ * made by caller.
+ *
+ * @return 0 on success
+ * @return -EOVERFLOW on source buffer overflowing
+ */
+int
+genMulTiles(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const TileMulOpts *mulOpts);
+
+/**
+ * @internal
+ * @brief Update result generator
+ *
+ * @param[out] ctx          Generator context
+ * @param[in] gset          Generator settings
+ * @param[in] funcId        BLAS function ID
+ * @param[in] op            Update operation
+ * @param[in] flags         Update result flags
+ * @param[in] uvarNames     Variable names for the inline version
+ *
+ * It generates a function applying an operation to the temporary result
+ * stored in the private memory and updating the target result.
+ *\n
+ * The code can be generated either as a callable function or inlined.
+ *\n
+ * List of taken arguments differs depending on specified flags. In general,
+ * these functions are defined as: \n
+ * @code
+ * void
+ * funcName(
+ *     <input type> C,
+ *     <output type> *c,
+ *     <input type> alpha,
+ *     size_t startRow,
+ *     size_t startCol,
+ *     size_t ld
+ *     [,<input type> beta]
+ *     [,size_t nrRows]
+ *     [,size_t nrCols])
+ * @endcode
+ *
+ * @return 0 on success, -EOVERFLOW on source buffer overflow.
+ */
+int
+updateResultGen(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    BlasFunctionID funcId,
+    UpdateResultOp op,
+    UpdateResultFlags flags,
+    const UpresVarNames *uvarNames);
+
+/**
+ * @internal
+ * @brief Produce a code updating a single result element
+ *
+ * @param[out] ctx      Generator context
+ * @param[in] dst       Destination element expression
+ * @param[in] src       Source element expression
+ * @param[in] gset      Generator settings
+ * @param[in] op        Update operation
+ * @param[in] flags     Flags showing specifics of the code needed to be
+ *                      generated
+ *
+ * @return 0 on success, -EOVERFLOW if the source buffer is exceeded.
+ */
+int
+genUpdateResultSingle(
+    struct KgenContext *ctx,
+    const char *dst,
+    const char *src,
+    const BlasGenSettings *gset,
+    UpdateResultOp op,
+    UpdateResultFlags flags);
+
+/*@}*/
+
+TailFetch
+checkForTailFetches(
+    BlasFunctionID funcID,
+    const SubproblemDim *dim,
+    const CLBLASKernExtra *kextra,
+    MatrixRole mrole,
+    bool distVect,
+    bool lowerTails);
+
+bool
+isNeedZeroTileTail(
+    BlasFunctionID funcID,
+    const SubproblemDim *dim,
+    const CLBLASKernExtra *kextra,
+    MatrixRole mrole,
+    bool distVect);
+
+/**
+ * @internal
+ * @brief Generate tail coordinates adjustment if needed
+ *
+ * @param[out] ctx              Generator context
+ * @param[in] funcID            BLAS function ID
+ * @param[in] gset              Generator settings
+ * @param[out] error            Location to store error.
+ *                              Ignored if NULL.
+ *
+ * Adjust coordinates if work is distributed over matrix rows so as
+ * a tile would not exceed the matrix bound. Cyclic addressing is not
+ * applicable for that since skew over rows can be used for performance goals.
+ *
+ * If it's needed, issues an expression like
+ *
+ * if (coord.y + dy > M) {
+ *     coord.y -= dy - M % dy;
+ * }
+ *
+ * Returns a status showing if the tails have been actually adjusted or not.
+ * If \b ctx is NULL the function doesn't try to generate code, but just
+ * returns the actual tail handling status
+ *
+ * @ingroup TAILS_HANDLING
+ */
+TailStatus
+checkGenAdjustTailCoords(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    int *error);
+
+/**
+ * @internal
+ * @brief Generate restoring original coordinates if needed
+ *
+ * @param[out] ctx              Generator context
+ * @param[in] gset              Generator settings
+ * @param[in] status            Tails handling status
+ *
+ * Coordinates restoring is needed to have ability to write back result to
+ * a correct location.
+ *
+ * If it's needed, issues an expression like
+ *
+ * if (coord.y + dy == M) {
+ *     coord.y += dy - M % dy;
+ * }
+ *
+ * @ingroup TAILS_HANDLING
+ */
+int
+checkGenRestoreTailCoords(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    TailStatus status);
+
+/**
+ * @internal
+ * @brief Convert tail handling status to the respective flags
+ *        of the update result generator
+ *
+ * @param[in] status            Status of the handling to convert to
+ *                              the update result flags
+ *
+ * @ingroup TAILS_HANDLING
+ */
+UpdateResultFlags
+tailStatusToUpresFlags(TailStatus status);
+
+#endif /* BLAS_KGEN_H_ */
diff --git a/src/library/blas/gens/blas_subgroup.c b/src/library/blas/gens/blas_subgroup.c
new file mode 100644
index 0000000..9c87d53
--- /dev/null
+++ b/src/library/blas/gens/blas_subgroup.c
@@ -0,0 +1,528 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include "blas_subgroup.h"
+#include <stdio.h>
+#include <clblas_stddef.h>
+
+#include <matrix_props.h>
+#include <matrix_dims.h>
+#include <dis_warning.h>
+
+#include "blas_kgen.h"
+#include "gen_helper.h"
+#include "tile_iter.h"
+#include "kerngen.h"
+
+// Forward declarations of the file-local helpers used by mergeUpdateResult()
+
+// Number of subgroups engaged simultaneously in one merge step
+static int
+calcMergeStepSubgrN(
+    const BlasGenSettings* pGSet,
+    DataType dtype);
+
+// Adds the declaration of the LDS scratch array to the generator context
+static int declareSubgrLDS(
+    struct KgenContext* pCtx,
+    const BlasGenSettings* pGSet,
+    DataType dtype);
+
+//-----------------------------------------------------------------------------
+// Calculates the best number of subgroups to be engaged simultaneously in
+// each merge step.
+//
+// The calculation is based on a register usage estimation: the registers
+// needed for the A, B and C tiles (plus 10 registers for addresses, etc.)
+// are compared against the register budget, so that the LDS taken by the
+// merge does not limit the number of workgroups scheduled on the SIMD engine.
+//
+// pGSet - generator settings; subdims[] and tileA/tileBX/tileCY are read
+// dtype - element type; for a type that is not one of the four handled
+//         below the estimation is skipped and one subgroup per step is used
+//
+// Returns the number of subgroups per merge step, never more than the number
+// of subgroups in the workgroup, or -EINVAL if pGSet or pGSet->pgran is NULL,
+// if a subgroup dimension used as a divisor is zero, or if the C tile is
+// empty.
+static int
+calcMergeStepSubgrN(
+    const BlasGenSettings* pGSet,
+    DataType dtype)
+{
+    // hardware-specific options
+    const int deviceLDS = 32768;
+    const int gprsPerUnit = 240;
+
+    // registers hold 4-vectors of 32-bit floats or 2-vectors of doubles
+    int elemsPerReg = 0;    // elements of dtype held by one register
+    int elemBytes = 0;      // size of one element of dtype in bytes
+
+    int vecLenA = 0;
+    int vecLenB = 0;
+    int vecLenC = 0;
+
+    int vecNumA = 0;
+    int vecNumB = 0;
+    int vecNumC = 0;
+
+    int subgPerStep = 0;
+    int bestLDS = 0;
+    int gprsUsed = 0;
+    int subgNum = 0;
+    int wgPerUnit = 0;
+    int ldsPerSubgr = 0;
+
+    int itemsPerSubgroup = 0;
+
+    if( NULL == pGSet || NULL == pGSet->pgran ){
+        return -EINVAL;
+    }
+
+    // each of these is a divisor below
+    if( 0 == pGSet->subdims[1].bwidth ||
+        0 == pGSet->subdims[1].x ||
+        0 == pGSet->subdims[1].y ){
+        return -EINVAL;
+    }
+
+    itemsPerSubgroup = pGSet->subdims[0].bwidth/
+        pGSet->subdims[1].bwidth;
+
+    subgNum = (pGSet->subdims[0].x/pGSet->subdims[1].x)*
+        (pGSet->subdims[0].y/pGSet->subdims[1].y);
+
+    vecLenA = pGSet->tileA.vecLen;
+    vecLenB = pGSet->tileBX.vecLen;
+    vecLenC = pGSet->tileCY.vecLen;
+
+    vecNumA = tileVectorsNum( &pGSet->tileA );
+    vecNumB = tileVectorsNum( &pGSet->tileBX );
+    vecNumC = tileVectorsNum( &pGSet->tileCY );
+
+    switch(dtype){
+
+        case TYPE_FLOAT:
+            // each register holds 4 4-byte float values
+            elemsPerReg = 4;
+            elemBytes = 4;
+            break;
+
+        case TYPE_DOUBLE:
+            // each register can hold 2 double values
+            elemsPerReg = 2;
+            elemBytes = 8;
+            break;
+
+        case TYPE_COMPLEX_FLOAT:
+            // each register holds 2 float-based complex values (2x4 bytes)
+            elemsPerReg = 2;
+            elemBytes = 8;
+            break;
+
+        case TYPE_COMPLEX_DOUBLE:
+            // each register can hold 1 double-based complex value (2x8 bytes)
+            elemsPerReg = 1;
+            elemBytes = 16;
+            break;
+
+        default:
+            // unknown type: no estimation, fall back to one subgroup per step
+            break;
+    }
+
+    if( 0 != elemBytes ){
+
+        // 10 registers are used for addresses, etc
+        gprsUsed =  vecNumA * (vecLenA/elemsPerReg) +
+                    vecNumB * (vecLenB/elemsPerReg) +
+                    vecNumC * (vecLenC/elemsPerReg) + 10;
+
+        // Tiles needing more registers than the unit provides would make
+        // this quotient zero and the next division trap, so it is clamped
+        // to one.
+        wgPerUnit = ( gprsUsed > 0 ) ? gprsPerUnit/gprsUsed : 0;
+        if( wgPerUnit < 1 ){
+            wgPerUnit = 1;
+        }
+
+        bestLDS = deviceLDS/wgPerUnit;
+
+        // LDS bytes needed to hold the C tiles of all items of one subgroup
+        ldsPerSubgr = itemsPerSubgroup * vecNumC * vecLenC * elemBytes;
+        if( ldsPerSubgr <= 0 ){
+            return -EINVAL;
+        }
+
+        subgPerStep = bestLDS/ldsPerSubgr;
+    }
+
+    if( 0==subgPerStep ){
+        subgPerStep = 1;
+    }
+
+    // do not exceed physical number of subgroups in workgroup
+    if( subgPerStep > subgNum ){
+        subgPerStep = subgNum;
+    }
+
+    return subgPerStep;
+}
+
+//-----------------------------------------------------------------------------
+// Adds the LDS array declaration (based on the C matrix tile parameters) to
+// the context. Each row of the C matrix block may be split into separate
+// vectors.
+//
+// The emitted statement has the form
+//
+//   __local <T> a<LDS>[<items per subgroup>*<subgroups per step>*<vecNumC>];
+//   __local <T> *<LDS> = a<LDS>;
+//
+// where <LDS> is pGSet->varNames.LDS and <T> is the built-in type of dtype
+// with the C tile vector length appended (omitted for a real type when the
+// length is not greater than 1); for complex types <T> is float or double
+// with twice the vector length.
+//
+// Returns 0 on success, -EINVAL on NULL arguments, a zero subgroup block
+// width, an unsupported dtype or a statement that does not fit the buffer,
+// and the error code of calcMergeStepSubgrN() if that fails.
+
+static int declareSubgrLDS(
+    struct KgenContext* pCtx,
+    const BlasGenSettings* pGSet,
+    DataType dtype)
+{
+    int vecLenC = 0;
+    int vecNumC = 0;
+    const char* typeName = NULL;
+    const KernelVarNames *vnames = NULL;
+    char elemType[64];
+    char tmp[512];
+    int itemsPerSubgroup = 0;
+    int subgrPerStep = 0;
+    int len = 0;
+
+    if( NULL == pCtx || NULL == pGSet ){
+        return -EINVAL;
+    }
+
+    // divisor below
+    if( 0 == pGSet->subdims[1].bwidth ){
+        return -EINVAL;
+    }
+
+    itemsPerSubgroup = pGSet->subdims[0].bwidth / pGSet->subdims[1].bwidth;
+
+    // the value is pasted into the array size, so it must be positive
+    subgrPerStep = calcMergeStepSubgrN(pGSet, dtype);
+    if( subgrPerStep <= 0 ){
+        return ( subgrPerStep < 0 ) ? subgrPerStep : -EINVAL;
+    }
+
+    vecLenC = pGSet->tileCY.vecLen;
+    vecNumC = tileVectorsNum( &pGSet->tileCY );
+    vnames = &pGSet->varNames;
+
+    // build the element type name of the LDS array
+    switch(dtype){
+
+        case TYPE_FLOAT:
+        case TYPE_DOUBLE:
+
+            typeName = dtypeBuiltinType(dtype);
+            if( vecLenC > 1){
+                len = snprintf( elemType, sizeof(elemType),
+                    "%s%d", typeName, vecLenC );
+            }
+            else{
+                len = snprintf( elemType, sizeof(elemType),
+                    "%s", typeName );
+            }
+            break;
+
+        case TYPE_COMPLEX_FLOAT:
+
+            len = snprintf( elemType, sizeof(elemType),
+                "float%d", vecLenC*2 );
+            break;
+
+        case TYPE_COMPLEX_DOUBLE:
+
+            len = snprintf( elemType, sizeof(elemType),
+                "double%d", vecLenC*2 );
+            break;
+
+        default:
+            // nothing sensible can be declared for an unknown type
+            return -EINVAL;
+    }
+
+    if( len < 0 || (size_t)len >= sizeof(elemType) ){
+        return -EINVAL;
+    }
+
+    len = snprintf(
+        tmp,
+        sizeof(tmp),
+        "__local %s a%s[%d*%d*%d];\n"
+        "__local %s *%s = a%s;\n",
+        elemType,
+        vnames->LDS,
+        itemsPerSubgroup,
+        subgrPerStep,
+        vecNumC,
+        elemType,
+        vnames->LDS,
+        vnames->LDS);
+
+    if( len < 0 || (size_t)len >= sizeof(tmp) ){
+        return -EINVAL;
+    }
+
+    kgenAddStmt( pCtx, tmp );
+
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+// Generates the code that merges the C tiles accumulated by the items of
+// each subgroup through LDS and then updates the result.
+//
+// The generated code
+//  - declares the LDS scratch array and offsets the scratch pointer for the
+//    current item;
+//  - loops over the subgroups, subgPerStep subgroups per merge step;
+//  - in each step the items of the engaged subgroups copy their C tile to
+//    LDS and zero it; after a barrier the item with x == 0 of each engaged
+//    subgroup adds up the tiles of its subgroup and, if upresProcPtr is not
+//    NULL, the result update code produced by that callback follows.
+//
+// pCtx         - generator context
+// funcID       - BLAS function ID, passed through to upresProcPtr
+// pGSet        - generator settings
+// pSubgVNames  - names of the subgroup variables; itemId is used
+// upResFlags   - update result flags, passed through to upresProcPtr
+// upresProcPtr - result update generator, may be NULL
+//
+// Returns 0 on success, -EINVAL on invalid arguments or settings, or the
+// negative error code reported by a helper or by upresProcPtr.
+//
+// NOTE(review): the scratch pointer is named "scratch" in some statements
+// and pVNames->LDS in others; this presumably relies on the two being equal
+// - confirm against the callers.
+int
+mergeUpdateResult( struct KgenContext* pCtx,
+    BlasFunctionID funcID,
+    struct BlasGenSettings* pGSet,
+    struct SubgVarNames* pSubgVNames,
+    UpdateResultFlags upResFlags,
+    UpresProcPtr upresProcPtr )
+{
+    char tmp[2048];
+    int subgN = 0;
+    int subgItems = 0;
+    int aBlkH = 0;
+    int len = 0;
+    int ret = 0;
+    DataType dtype;
+    Tile tileC;
+    Tile tileScratch;
+    KernelVarNames* pVNames;
+    unsigned int vecLenC;
+    unsigned int vecNumC;
+
+    int subgPerStep = 0;
+
+    if( NULL == pCtx || NULL == pGSet || NULL == pGSet->kextra ){
+        return -EINVAL;
+    }
+
+    if( NULL == pSubgVNames || NULL == pSubgVNames->itemId ){
+        return -EINVAL;
+    }
+
+    // each of these is a divisor below
+    if( 0 == pGSet->subdims[1].x ||
+        0 == pGSet->subdims[1].y ||
+        0 == pGSet->subdims[1].bwidth ){
+        return -EINVAL;
+    }
+
+    dtype = pGSet->kextra->dtype;
+    subgN = ( pGSet->subdims[0].x/pGSet->subdims[1].x ) *
+        ( pGSet->subdims[0].y/pGSet->subdims[1].y );
+
+    subgItems = pGSet->subdims[0].bwidth/
+        pGSet->subdims[1].bwidth;
+
+    aBlkH = pGSet->subdims[1].y;
+    pVNames = &pGSet->varNames;
+
+    // calculate best number of subgroups to be engaged in each merge step;
+    // the value becomes a modulo divisor and the loop increment of the
+    // generated code, so it must be positive
+    subgPerStep = calcMergeStepSubgrN( pGSet, dtype );
+    if( subgPerStep <= 0 ){
+        return ( subgPerStep < 0 ) ? subgPerStep : -EINVAL;
+    }
+
+    vecLenC = pGSet->tileCY.vecLen;
+    vecNumC = tileVectorsNum( &pGSet->tileCY );
+
+    kgenAddStmt(pCtx,"//-----MergeUpdateResult\n");
+    kgenAddBlankLine(pCtx);
+
+    // declare local data storage array
+    kgenAddStmt( pCtx, "// veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC]\n");
+    ret = declareSubgrLDS( pCtx,
+        pGSet,
+        dtype);
+    if( 0 != ret ){
+        return ret;
+    }
+
+    kgenAddBlankLine( pCtx );
+
+    kgenAddStmt(pCtx,
+                "//LDS block has the same vectorization as C matrix block\n");
+    kgenAddStmt(
+        pCtx,
+        "//VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS"
+        " +get_local_id(0) );\n");
+
+    // move the scratch pointer to the LDS block owned by the current item
+    len = snprintf(tmp,
+        sizeof(tmp),
+        "scratch += "
+            "%u*("
+                "(%s.y%%%d)*%d +"
+                "%s.x );\n",
+            vecNumC,
+            pSubgVNames->itemId,
+            subgPerStep,
+            subgItems,
+            pSubgVNames->itemId );
+    if( len < 0 || (size_t)len >= sizeof(tmp) ){
+        return -EINVAL;
+    }
+    kgenAddStmt(pCtx, tmp);
+
+
+    // loop over the merge steps
+    len = snprintf(
+        tmp,
+        sizeof(tmp),
+        "\nfor( uint mstep = 0; mstep < %d; mstep += %d )",
+        subgN,
+        subgPerStep);
+    if( len < 0 || (size_t)len >= sizeof(tmp) ){
+        return -EINVAL;
+    }
+    kgenBeginBranch(pCtx,tmp);
+    kgenAddBlankLine(pCtx);
+
+    // only the subgroups engaged in the current step write to LDS
+    len = snprintf(
+        tmp,
+        sizeof(tmp),
+        "if( (%s.y >= mstep)&&(%s.y < (mstep+%d)) )",
+        pSubgVNames->itemId,
+        pSubgVNames->itemId,
+        subgPerStep);
+    if( len < 0 || (size_t)len >= sizeof(tmp) ){
+        return -EINVAL;
+    }
+    kgenBeginBranch(pCtx,tmp);
+
+    // the LDS block size is similar to C matrix block size
+    kgenAddBlankLine(pCtx);
+    initTile(&tileC,
+            "c",
+            (unsigned int)pGSet->subdims[1].y,
+            (unsigned int)pGSet->subdims[1].x,
+            vecLenC,
+            dtype,
+            pGSet->tileCY.storType,
+            pGSet->tileCY.trans,
+            pGSet->tileCY.packed);
+
+    initTile(&tileScratch,
+            "scratch",
+            (unsigned int)pGSet->subdims[1].y,
+            (unsigned int)pGSet->subdims[1].x,
+            vecLenC,
+            dtype,
+            PRIV_STORAGE_ARRAY,
+            pGSet->tileCY.trans,
+            pGSet->tileCY.packed);
+
+    genTileCopy(pCtx,
+                &tileScratch,
+                &tileC,
+                TILECOPY_ASSIGN);
+
+    genZeroTile(pCtx,
+                &tileC);
+
+    kgenEndBranch( pCtx, NULL ); // LDS write if
+    kgenAddBlankLine( pCtx );
+
+    // The step condition is split into two "if" blocks so that the barrier
+    // between them is emitted outside of any condition.
+    kgenAddBarrier( pCtx, CLK_LOCAL_MEM_FENCE );
+    kgenAddBlankLine( pCtx );
+    //----------------------------------------------
+
+    len = snprintf( tmp,
+        sizeof(tmp),
+        "if( (%s.y >= mstep)&&(%s.y < (mstep+%d)) )",
+        pSubgVNames->itemId,
+        pSubgVNames->itemId,
+        subgPerStep);
+    if( len < 0 || (size_t)len >= sizeof(tmp) ){
+        return -EINVAL;
+    }
+    kgenBeginBranch(pCtx,tmp);
+
+    len = snprintf( tmp,
+        sizeof(tmp),
+        "if ( 0 == %s.x )",
+        pSubgVNames->itemId );
+    if( len < 0 || (size_t)len >= sizeof(tmp) ){
+        return -EINVAL;
+    }
+    kgenBeginBranch( pCtx, tmp );
+
+    kgenAddBlankLine(pCtx);
+
+    // Zero element of each subgroup also performs LDS merge
+    len = snprintf(
+        tmp,
+        sizeof(tmp),
+        "for(uint k = 0; k < %d * %d; k += %d)",
+        subgItems,
+        aBlkH,
+        aBlkH);
+    if( len < 0 || (size_t)len >= sizeof(tmp) ){
+        return -EINVAL;
+    }
+
+    kgenBeginBranch(pCtx, tmp);
+    kgenAddBlankLine(pCtx);
+
+    genTileCopy(pCtx,
+                &tileC,
+                &tileScratch,
+                TILECOPY_ADD_ASSIGN );
+    kgenAddStmt(pCtx,
+                "//Adding the LDS block size in vectors\n");
+    len = snprintf(tmp,
+            sizeof(tmp),
+            "%s += %u;",
+            pVNames->LDS,
+            vecNumC);
+    if( len < 0 || (size_t)len >= sizeof(tmp) ){
+        return -EINVAL;
+    }
+    kgenAddStmt(pCtx, tmp);
+    kgenAddBlankLine(pCtx);
+
+    kgenEndBranch( pCtx, NULL ); // merge for()
+    kgenAddBlankLine( pCtx );
+
+    // Write into global memory -------------------------------
+    ret = 0;
+    if ( NULL != upresProcPtr ) {
+
+        ret = (*upresProcPtr)( pCtx,
+            funcID,
+            pGSet,
+            upResFlags /*| UPRES_INDEXING_WITH_CONSTANTS*/,
+            NULL,
+            NULL,
+            NULL );
+    }
+
+    kgenAddBlankLine(pCtx);
+
+    kgenEndBranch(pCtx, NULL); // "0 == itemId.x" if (merge and global write)
+    kgenEndBranch(pCtx, NULL); // merge step if
+
+    kgenAddBarrier(pCtx, CLK_LOCAL_MEM_FENCE);
+    // merge steps for()
+    kgenEndBranch(pCtx, NULL);
+
+
+    // report a failure of the result update generator instead of hiding it
+    return ( ret < 0 ) ? ret : 0;
+}
+
+//-----------------------------------------------------------------------------
+
+// Fills in the default problem decomposition for the subgroup-based kernels.
+//
+// pgran   - granularity; wgDim is read and wgSize[0..1] are written
+// subdims - array of at least two subproblem dimensions; subdims[1] gets the
+//           per-item block and subdims[0] the per-workgroup block
+// pArgs   - pointer to CLBlasKargs; only dtype is read
+//
+// The block width of an item is 8 elements, halved for complex types and
+// halved again for double-based types. A workgroup has 64 items: 8 items
+// per subgroup and 4 x 2 subgroups.
+//
+// Returns 0 on success or -EINVAL if any of the pointers is NULL.
+int
+subgGetDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    void* pArgs )
+{
+    const int wgItems = 64;
+    int itemsPerSubg = 8;
+    int subgA = 4;
+    int subgB = 2;
+
+    int bw1 = 8;
+    int x1 = 4;
+    int y1 = 4;
+    CLBlasKargs *kargs;
+
+    if ( NULL == pgran || NULL == subdims || NULL == pArgs ) {
+        return -EINVAL;
+    }
+
+    kargs = (CLBlasKargs *)pArgs;
+
+    if( isComplexType(kargs->dtype) ){
+        bw1 /= 2;
+    }
+    if( isDoubleBasedType(kargs->dtype) ){
+        bw1 /= 2;
+    }
+
+    // block processed by a single item
+    subdims[1].bwidth = bw1;
+    subdims[1].x = subdims[1].itemX = x1;
+    subdims[1].y = subdims[1].itemY = y1;
+
+    // block processed by the whole workgroup
+    subdims[0].bwidth = bw1 * itemsPerSubg;
+    subdims[0].itemX = x1 * subgB;
+    subdims[0].x = x1*subgB;
+
+    subdims[0].itemY = y1*subgA;
+    subdims[0].y = y1*subgA;
+
+    if ( 2 == pgran->wgDim ) {
+        // one row of the workgroup per subgroup
+        pgran->wgSize[0] = itemsPerSubg;
+        pgran->wgSize[1] = wgItems/itemsPerSubg;
+    }
+    else {
+        // one-dimensional workgroup; also used for any other wgDim value
+        pgran->wgSize[0] = wgItems;
+        pgran->wgSize[1] = 1;
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/src/library/blas/gens/blas_subgroup.h b/src/library/blas/gens/blas_subgroup.h
new file mode 100644
index 0000000..c23c119
--- /dev/null
+++ b/src/library/blas/gens/blas_subgroup.h
@@ -0,0 +1,69 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef SUBGROUP_H
+#define SUBGROUP_H
+
+#include <clBLAS.h>
+
+#include <cltypes.h>
+#include <kerngen.h>
+#include <mempat.h>
+#include <dblock_kgen.h>
+
+#include <blas_funcs.h>
+#include <matrix_props.h>
+#include "blas_kgen.h"
+
+#include "tile.h"
+#include "fetch.h"
+
+/**
+ * Pointer to a result update generator.
+ *
+ * mergeUpdateResult() calls it with the generator context, the BLAS function
+ * ID, the generator settings and the update result flags it was given, and
+ * with NULL for the three trailing string arguments.
+ *
+ * NOTE(review): the meaning of the three string arguments and of the
+ * returned int is not visible in this header - confirm against the update
+ * result generator this type is meant to match.
+ */
+typedef int
+(*UpresProcPtr)( struct KgenContext*,
+    BlasFunctionID,
+    const BlasGenSettings *,
+    UpdateResultFlags,
+    const char *,
+    const char *,
+    const char *);
+
+/**
+ * Names of the kernel variables describing the subgroup decomposition.
+ *
+ * The strings are pasted into the generated kernel source, e.g.
+ * mergeUpdateResult() emits "<itemId>.x" and "<itemId>.y".
+ */
+typedef struct SubgVarNames {
+
+    const char* subgCoord;  // 2-vector of subgroup ID by X and Y
+    const char* itemId;     // 2-vector of subgroup item id/subgroupID
+} SubgVarNames;
+
+/**
+ * Generate the code merging the C tiles of the subgroup items through LDS
+ * and updating the result.
+ *
+ * @param pCtx          Generator context
+ * @param funcID        BLAS function ID, passed to \b upresProcPtr
+ * @param pGSet         Generator settings
+ * @param pSubgVNames   Names of the subgroup variables; must be valid,
+ *                      its itemId is used in the generated code
+ * @param upResFlags    Update result flags, passed to \b upresProcPtr
+ * @param upresProcPtr  Result update generator; no update code is
+ *                      generated if it is NULL
+ *
+ * @return 0 on success, a negative error code otherwise (-EINVAL if
+ *         \b pCtx or \b pGSet is NULL)
+ */
+int
+mergeUpdateResult( struct KgenContext* pCtx,
+    BlasFunctionID funcID,
+    struct BlasGenSettings* pGSet,
+    SubgVarNames* pSubgVNames,
+    UpdateResultFlags upResFlags,
+    UpresProcPtr upresProcPtr );
+
+/**
+ * Fill in the default decomposition for the subgroup-based kernels.
+ *
+ * @param pgran     Granularity; wgDim is read, wgSize[0] and wgSize[1]
+ *                  are set. Must be a valid pointer.
+ * @param subdims   Array of at least two subproblem dimensions to fill in.
+ *                  Must be a valid pointer.
+ * @param pArgs     Pointer to the CLBlasKargs of the problem; its dtype
+ *                  selects the block width
+ *
+ * @return 0 on success, -EINVAL if \b pArgs is NULL
+ */
+int
+subgGetDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    void* pArgs );
+
+#endif
diff --git a/src/library/blas/gens/clTemplates/asum.cl b/src/library/blas/gens/clTemplates/asum.cl
new file mode 100644
index 0000000..f525660
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/asum.cl
@@ -0,0 +1,78 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+// Kernel template: per-work-group partial sums of the absolute values of X.
+//
+// Each work-item accumulates fabs() of its elements: %V elements at a time
+// while a whole vector fits below N, then one element at a time for the
+// remaining tail. After %REDUCTION_BY_SUM the work-item with local id 0
+// stores the value into scratchBuff[get_group_id(0)]; with COMPLEX defined
+// the stored value is asum.even + asum.odd.
+//
+// With INCX_NEGATIVE defined, the work-item with global id 0 stores zero
+// into scratchBuff[0] and every work-item returns immediately.
+//
+// NOTE(review): the %-prefixed tokens (%TYPE, %PTYPE, %V, %PREFIX, %VLOAD,
+// %REDUCE_SUM, ...) are presumably substituted by the kernel generator
+// before the source is compiled, and %REDUCTION_BY_SUM presumably leaves the
+// work-group total in work-item 0 - confirm against the template engine.
+static const char *asum_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+__kernel void %PREFIXasum_kernel( __global %TYPE *_X, __global %PTYPE *scratchBuff, uint N, uint offx, int incx)
+{
+	__global %TYPE *X = _X + offx;
+    %TYPE asum = (%TYPE) 0.0;
+
+    #ifdef INCX_NEGATIVE
+        if( get_global_id(0) == 0 ) {
+            scratchBuff[0] = (%PTYPE)0.0;
+        }
+        return;
+    #endif
+
+
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+        vReg1 = fabs( vReg1 );
+
+        asum += %REDUCE_SUM( vReg1 );          // Add-up elements in the vector to give a scalar
+    }
+
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1 = X[gOffset * incx];
+        sReg1 = fabs( sReg1 );
+        //%TYPE res;
+        %ADD( asum, asum, sReg1 );
+    }
+
+    %REDUCTION_BY_SUM(asum);
+
+    %PTYPE answer;
+
+    #ifdef COMPLEX
+        answer = asum.even + asum.odd;
+    #else
+        answer = asum;
+    #endif
+
+
+    if( (get_local_id(0)) == 0 ) {
+        scratchBuff[ get_group_id(0) ] = answer;
+    }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/axpy.cl b/src/library/blas/gens/clTemplates/axpy.cl
new file mode 100644
index 0000000..68f05f7
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/axpy.cl
@@ -0,0 +1,78 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// Kernel template: updates Y from alpha, X and Y using %VMAD / %MAD.
+//
+// X and Y start at offx / offy; for a negative increment the base pointer
+// is first moved forward by (N - 1) * abs(inc) elements. Each work-item
+// processes %V elements at a time while a whole vector fits below N; the
+// work-item left with fewer than %V elements handles them one by one in the
+// tail loop.
+//
+// NOTE(review): the %-prefixed tokens are presumably substituted by the
+// kernel generator before compilation, and %VMAD( y, alpha, x ) presumably
+// computes y += alpha * x - confirm against the template engine.
+// NOTE(review): in the unit-stride path Y is loaded from (Y + gOffset) but
+// stored to (Y + (gOffset * incy)); the two agree only if incy == 1 whenever
+// INCY_NONUNITY is not defined - confirm against the build options.
+static const char *axpy_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+__kernel void %PREFIXaxpy_kernel( %TYPE alpha, __global %TYPE *_X, __global %TYPE *_Y, uint N, uint offx, int incx, uint offy, int incy )
+{
+	__global %TYPE *X = _X + offx;
+	__global %TYPE *Y = _Y + offy;
+
+    if ( incx < 0 ) {
+        X = X + (N - 1) * abs(incx);
+    }
+    if ( incy < 0 ) {
+        Y = Y + (N - 1) * abs(incy);
+    }
+
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1, vReg2;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+
+        #ifdef INCY_NONUNITY
+            %VLOADWITHINCX( vReg2, (Y + (gOffset*incy)), incy);
+        #else
+            vReg2 = %VLOAD( 0, (Y + gOffset) );
+        #endif
+
+        %VMAD( vReg2, alpha, vReg1 );
+
+        #ifdef INCY_NONUNITY
+            %VSTOREWITHINCX( (Y + (gOffset * incy)), vReg2, incy );
+        #else
+            %VSTORE( vReg2, 0 ,(Y + (gOffset * incy)) );
+        #endif
+    }
+
+    // Loop for the last thread to handle the tail part of the vector
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1, sReg2;
+        sReg1 = X[gOffset * incx];
+        sReg2 = Y[gOffset * incy];
+
+        %MAD( sReg2, alpha, sReg1 );
+        Y[gOffset * incy] = sReg2;
+        }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/copy.cl b/src/library/blas/gens/clTemplates/copy.cl
new file mode 100644
index 0000000..51957b6
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/copy.cl
@@ -0,0 +1,68 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+// Kernel template: copies N elements of X into Y.
+//
+// X and Y start at offx / offy; for a negative increment the base pointer
+// is first moved forward by (N - 1) * abs(inc) elements. Each work-item
+// copies %V elements at a time while a whole vector fits below N; the
+// work-item left with fewer than %V elements copies them one by one in the
+// tail loop.
+//
+// NOTE(review): the %-prefixed tokens are presumably substituted by the
+// kernel generator before compilation - confirm against the template engine.
+// NOTE(review): the unit-stride store addresses (Y + (gOffset * incy)),
+// which matches the unit-stride load form (X + gOffset) only if incy == 1
+// whenever INCY_NONUNITY is not defined - confirm against the build options.
+static const char *copy_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+__kernel void %PREFIXcopy_kernel( __global %TYPE *_X, __global %TYPE *_Y, uint N, uint offx, int incx, uint offy, int incy )
+{
+	__global %TYPE *X = _X + offx;
+	__global %TYPE *Y = _Y + offy;
+
+    if ( incx < 0 ) {
+        X = X + (N - 1) * abs(incx);
+    }
+    if ( incy < 0 ) {
+        Y = Y + (N - 1) * abs(incy);
+    }
+
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+
+        #ifdef INCY_NONUNITY
+            %VSTOREWITHINCX( (Y + (gOffset * incy)), vReg1, incy );
+        #else
+            %VSTORE( vReg1, 0 ,(Y + (gOffset * incy)) );
+        #endif
+    }
+
+    // Loop for the last thread to handle the tail part of the vector
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1;
+        sReg1 = X[gOffset * incx];
+        Y[gOffset * incy] = sReg1;
+    }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/dot.cl b/src/library/blas/gens/clTemplates/dot.cl
new file mode 100644
index 0000000..5c18142
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/dot.cl
@@ -0,0 +1,86 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+// Kernel template: per-work-group partial dot products of X and Y.
+//
+// X and Y start at offx / offy; for a negative increment the base pointer
+// is first moved forward by (N - 1) * abs(inc) elements. Each work-item
+// multiplies %V elements at a time while a whole vector fits below N and
+// adds the reduced product to dotP; the work-item left with fewer than %V
+// elements handles them one by one in the tail loop. %CONJUGATE is applied
+// to the X operand with the doConj argument before each multiplication.
+// After %REDUCTION_BY_SUM the work-item with local id 0 stores dotP into
+// scratchBuff[get_group_id(0)].
+//
+// NOTE(review): the %-prefixed tokens are presumably substituted by the
+// kernel generator before compilation - confirm against the template engine.
+static const char *dot_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+__kernel void %PREFIXdot_kernel( __global %TYPE *_X, __global %TYPE *_Y, __global %TYPE *scratchBuff,
+                                        uint N, uint offx, int incx, uint offy, int incy, int doConj )
+{
+	__global %TYPE *X = _X + offx;
+	__global %TYPE *Y = _Y + offy;
+    %TYPE dotP = (%TYPE) 0.0;
+
+    if ( incx < 0 ) {
+        X = X + (N - 1) * abs(incx);
+    }
+    if ( incy < 0 ) {
+        Y = Y + (N - 1) * abs(incy);
+    }
+
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1, vReg2, res;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+
+        #ifdef INCY_NONUNITY
+            %VLOADWITHINCX( vReg2, (Y + (gOffset*incy)), incy);
+        #else
+            vReg2 = %VLOAD( 0, (Y + gOffset) );
+        #endif
+
+        %CONJUGATE(doConj, vReg1);
+        %VMUL( res, vReg1, vReg2 );
+        dotP += %REDUCE_SUM( res );          // Add-up elements in the vector to give a scalar
+    }
+
+    // Loop for the last thread to handle the tail part of the vector
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1, sReg2, res;
+        sReg1 = X[gOffset * incx];
+        sReg2 = Y[gOffset * incy];
+
+        %CONJUGATE(doConj, sReg1);
+            %MUL( res, sReg1, sReg2 );
+            %ADD( dotP, dotP, res );
+        }
+
+    // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier
+    // dotP of work-item 0 will have the final reduced item of the work-group
+    %REDUCTION_BY_SUM( dotP );
+
+    if( (get_local_id(0)) == 0 ) {
+        scratchBuff[ get_group_id(0) ] = dotP;
+    }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/gbmv.cl b/src/library/blas/gens/clTemplates/gbmv.cl
new file mode 100644
index 0000000..3cdadae
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/gbmv.cl
@@ -0,0 +1,292 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// Row-Major Non-transpose case: H work-items share each row, TARGET_ROWS rows per work-group
+static const char *gbmv_RNT_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define  H  ( %DEF_H )
+#define  TARGET_ROWS  ( %DEF_TARGET_ROWS )
+
+__kernel void %PREFIXgbmv_RNT_kernel( __global const %TYPE * _A, __global %TYPE * _y_vector, __global %TYPE const* restrict _x_vector,
+                                        uint M, uint N, uint KL, uint KU, uint lda, int incx, int incy, uint offa, uint offx, uint offy
+#ifndef TBMV_ONLY
+                                    ,%TYPE alpha, %TYPE beta
+#endif
+                                                              )
+{
+    __global const %TYPE * A;
+	__global %TYPE const* restrict X;
+	__global %TYPE* Y;
+	__local %TYPE localRed[ (H+1) * TARGET_ROWS ];  // Row pitch of H+1 instead of H, to avoid bank-conflict
+
+	A = _A + offa;
+	if ( incx < 0 )                     // Negative increment: go to the end of the vector (indices are multiplied by incx)
+		X = _x_vector + offx + ( N - 1 ) * abs(incx);
+	else
+		X = _x_vector + offx;
+
+	if( incy < 0 )                      // Same adjustment for Y, which is indexed by row
+		Y = _y_vector + offy + ( M - 1 ) * abs(incy);
+    else
+		Y = _y_vector + offy;
+
+    int gId = get_group_id( 0 );
+    int lId = get_local_id( 0 );
+    int threadRow = (lId / H);                        // Which of the work-group's rows this work-item helps with
+    int threadCol = (lId % H);                        // Position of this work-item among the H helpers of that row
+    int row = ( gId * TARGET_ROWS ) + threadRow;      // Matrix row handled by this work-item
+    int AStartColIndex = max( (int)(KL-row), 0 );     // First column read from A for this row
+    int XStartIndex = ( row <= KL )? 0: (int)(row-KL); // First element of X paired with that column
+    bool diagPresent = ( row < N ) ? true: false;
+    int numSubDiags = min( row, max( 0, min( ((int)min( KL, N )), (int)(N+KL-row) ) ) );  // Trip count of the sub-diagonal loop
+    int numSupDiags = max( 0, min( (int)KU, (int)(N-1-row) ) );                             // Trip count of the super-diagonal loop
+    %TYPE reg1, reg2, sum;
+
+    if( row < M )
+    {
+        sum = %MAKEVEC(0.0);
+        localRed[ lId ] = %MAKEVEC(0.0);  // NOTE(review): indexed by lId, not by the padded index used below; slots read later are rewritten first if each row has H work-items
+        // Sub-diagonal iteration
+        #ifdef GIVEN_SHBMV_UPPER
+            int symmStartRow = max( 0, (row - (int)KU) );       // row - (BW-1) = KU
+            int symmStartCol = min( (int)KU, row );             // row - (BW-1) = KU
+        #endif
+        for( int i=threadCol; i<numSubDiags; i+= H )            // The H work-items of a row interleave over the entries
+        {
+            #ifdef GIVEN_SHBMV_UPPER
+                reg1 = A[ ((symmStartRow+i) * lda) + (symmStartCol - i) ];
+                %CONJUGATE(1 , reg1);                           // Hermitian transpose- will be ignored for real cases
+            #else
+                reg1 = A[ (row * lda) + (AStartColIndex + i) ];
+            #endif
+
+            #ifdef DO_CONJ
+                %CONJUGATE(1 , reg1);
+            #endif
+            reg2 = X[ (XStartIndex + i) * incx ];
+            %MAD( sum, reg1, reg2 );
+        }
+        #ifdef GIVEN_SHBMV_UPPER
+            AStartColIndex = 0;
+        #else
+            AStartColIndex += numSubDiags;
+        #endif
+        XStartIndex += numSubDiags;
+
+        // Calculate diagonal component -- only by first thread of the row
+        if( diagPresent )
+        {
+            if( threadCol == 0 )
+            {
+                reg2 = X[ XStartIndex * incx ];
+                #ifndef UNIT_DIAG
+                    reg1 = A[ (row * lda) + AStartColIndex ];
+                    #ifdef DO_CONJ
+                        %CONJUGATE(1 , reg1);
+                    #endif
+                    #ifdef HBMV_ONLY
+                        reg1.odd = 0.0;                 // Imaginary part of diagonal is assumed to be zero
+                    #endif
+                    %MAD( sum, reg1, reg2 );
+                #else
+                    sum += reg2;                        // Unit diagonal: A is not read, X is added as-is
+                #endif
+            }
+            AStartColIndex ++;                          // Every work-item of the row steps past the diagonal, not only the first
+            XStartIndex ++;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if( row < M )
+    {
+	    // Super-diagonal iteration
+	    #ifdef GIVEN_SHBMV_LOWER
+            AStartColIndex = (KL+1) - 2;            // KL+1 is BW
+        #endif
+
+        for( int i=threadCol; i<numSupDiags; i+= H )
+        {
+            #ifdef GIVEN_SHBMV_LOWER
+                reg1 = A[ ((row+i+1) * lda) + (AStartColIndex - i) ];
+                %CONJUGATE(1 , reg1);                           // Hermitian transpose- will be ignored for real cases
+            #else
+                reg1 = A[ (row * lda) + (AStartColIndex + i) ];
+            #endif
+
+            #ifdef DO_CONJ
+                %CONJUGATE(1 , reg1);
+            #endif
+            reg2 = X[ (XStartIndex + i) * incx ];
+            %MAD( sum, reg1, reg2 );
+        }
+        localRed[ (threadRow * (H+1)) + threadCol ] = sum;      // Publish this work-item's partial sum at the padded index
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);                               // All partial sums must be visible before they are combined
+
+    // Update the Y vector -- the first work-item of each row adds up the H partial sums
+    if( (threadCol == 0) && (row < M) )
+    {
+        sum = %MAKEVEC(0.0);
+        for( int i=0; i<H; i++ )
+        {
+            %ADD( sum, sum, localRed[ (threadRow * (H+1)) + i ] );
+        }
+        #ifndef TBMV_ONLY
+            %MUL( reg1, alpha, sum );
+            %MUL( reg2, beta, Y [ row * incy ] );
+            %ADD( Y[ row * incy ], reg1, reg2 );                // Presumably Y = alpha*sum + beta*Y -- confirm against the macro definitions
+        #else
+            Y[ row * incy ] = sum;                              // TBMV_ONLY: no alpha/beta parameters, the sum is stored directly
+        #endif
+    }
+
+}
+";
+
+
+
+// Row-Major Transpose case: each work-group produces HEIGHT consecutive elements of Y
+static const char *gbmv_RT_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS  ( %DEF_TARGET_ROWS )
+#define HEIGHT ( %DEF_H)
+//#pragma OPENCL EXTENSION cl_amd_printf : enable   // Debug only; left disabled because no printf is used and the extension is vendor-specific
+__kernel void %PREFIXgbmv_RT_kernel( __global const %TYPE * _A, __global %TYPE * _y_vector, __global %TYPE const* restrict _x_vector,
+                                    uint M, uint N, uint KL, uint KU, uint lda, int incx, int incy, uint offa, uint offx, uint offy
+#ifndef TBMV_ONLY
+                                    ,%TYPE alpha, %TYPE beta
+#endif
+                                    )
+{
+    __global const %TYPE * A = _A + offa;
+    __global %TYPE const * restrict X;
+    __global %TYPE *Y;
+
+    if ( incx < 0 ) // Goto end of X vector (X is indexed by matrix row here, hence M)
+    {
+        X = _x_vector + offx + ( M - 1) * abs(incx);
+    }
+    else
+    {
+        X = _x_vector + offx;
+    }
+
+    if( incy < 0 ) // Goto end of Y vector (Y is indexed by matrix column here, hence N)
+    {
+        Y = _y_vector + offy + ( N - 1) * abs(incy);
+    }
+    else
+    {
+        Y = _y_vector + offy;
+    }
+
+    int blkID, thrID;
+    int blkColIndx, blkStrtCol, blkStrtRow, blkOffset;
+    int thrRow, thrCol;
+    int bandWidth = KL + KU + 1;
+
+    blkID = get_group_id(0);
+    thrID = get_local_id(0);
+
+    //Find the block start column and start row.
+    blkOffset = blkID * HEIGHT;                 // Index in Y of the first element produced by this work-group
+    blkColIndx = (blkOffset) + KL;
+    blkStrtCol = (blkColIndx >= bandWidth) ? (bandWidth - 1) : blkColIndx;                      // Clamped to the last stored column
+    blkStrtRow = ((blkColIndx - (bandWidth - 1)) < 0) ? 0 : (blkColIndx - (bandWidth - 1));     // Clamped to row 0
+    %TYPE thrSum = %MAKEVEC(0.0); //Private sum for each thread
+    %TYPE reg1, reg2;
+
+    if(((blkOffset) + (thrID % HEIGHT)) < N)    // Skip work-items whose output element lies past the end of Y
+    {
+        thrRow = blkStrtRow + ((int)thrID / (HEIGHT));
+        thrCol = blkStrtCol + ((int)thrID % (HEIGHT)) - ((int)thrID / (HEIGHT));
+        while((thrRow < M) && (thrCol >= 0))    // Each step moves down TARGET_ROWS rows and left TARGET_ROWS stored columns
+        {
+            if(thrCol < bandWidth)
+            {
+                reg2 = X[ thrRow * incx];
+                #ifdef UNIT_DIAG
+                    if(thrCol == ((int)KL))     // Stored column KL is treated as the diagonal: A is not read
+                    {
+                        thrSum += reg2;
+                    }
+                    else
+                    {
+                        reg1 = A[(thrRow*lda) + thrCol];
+                        #ifdef DO_CONJ
+                            %CONJUGATE(1 , reg1);
+                        #endif
+                        %MAD(thrSum, reg1, reg2);
+                    }
+                #else
+                    reg1 = A[(thrRow*lda) + thrCol];
+                    #ifdef DO_CONJ
+                        %CONJUGATE(1 , reg1);
+                    #endif
+                    %MAD(thrSum, reg1, reg2);
+                #endif
+                //thrSum += A[(thrRow*lda) + thrCol ] * X[ thrRow ];
+            }
+            thrRow += TARGET_ROWS;
+            thrCol -= TARGET_ROWS;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //Store the results in a temporary local buffer and accumulate the same.
+    __local %TYPE sum[(TARGET_ROWS * HEIGHT)];  // One slot per work-item; assumes the work-group has at most TARGET_ROWS * HEIGHT work-items -- confirm at the launch site
+    sum[thrID] = thrSum;
+    barrier(CLK_LOCAL_MEM_FENCE);               // All partial sums must be visible before the first HEIGHT work-items combine them
+
+    if((thrID < HEIGHT) && ((blkOffset + (thrID % HEIGHT)) < N))
+    {
+        int iY = (blkOffset) + thrID;
+        %TYPE tempSum = %MAKEVEC(0.0);
+        for(int i = 0; i < TARGET_ROWS; i++)    // Partial sums for one output element sit HEIGHT slots apart
+        {
+            reg1 = sum[thrID + (i * HEIGHT)];
+            %ADD(tempSum, tempSum, reg1);
+            //tempSum += sum[thrID + (i * HEIGHT)];
+        }
+        #ifndef TBMV_ONLY
+            %MUL(reg1, alpha, tempSum);
+            %MUL(reg2, beta, Y[iY * incy]);
+            %ADD(Y[iY * incy], reg1, reg2);
+        #else
+            Y[ iY * incy ] = tempSum;           // TBMV_ONLY: no alpha/beta parameters, the sum is stored directly
+        #endif
+        //Y[iY] = ((alpha * tempSum) + (beta * Y[iY]));
+    }
+}
+";
+
+
diff --git a/src/library/blas/gens/clTemplates/gemm.cl b/src/library/blas/gens/clTemplates/gemm.cl
new file mode 100644
index 0000000..26f0526
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/gemm.cl
@@ -0,0 +1,1650 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+//
+// Few Observations:
+// 1. Vector Length of 4 increases the performance of DGEMM to 250GFLOPS.
+//    Coupled with 8x8 block without "barrier", max performance seen is around 267GFLOPS on Cayman.
+//    Otherwise, it is at 225GFLOPS max (16x8 block, with barrier, vector size of 2)
+//    However, a change of tile-size with vector length of just 2 yields 330GFLOPS consistently.
+//    So, 330 is our sweetspot for DGEMM now.
+// 2. When MxN is not completely divisible by [subdimy x subdimx] then a workgroup size of
+//    8x8 yields the best performance.
+//    Even in this case, if extra threads exit - the performance should be better.
+//    Thread exit should be done only if a barrier is NOT used. Otherwise, it will result in a hang.
+// 3. When processing non-tail run, workgroups processing full tiles can be grouped together and run
+//    However, this did not yield any significant performance on tail processing
+//    Sometimes, performance degradation was also seen. So, this idea will not be pursued.
+//
+// Pending Enhancements for GEMM:
+// -4. TN Kernel Performance can be improved. The prototype code shows better performance than the templated
+//     code. The templated code slightly differs from the prototype code. This can be fixed to get more
+//     performance.
+// -2. PENDING BUG FIX on the Unroll Factor for NN kernel - Configurable PANEL Implementation Introduced it.
+//     Currently panel of %V only supported
+//  0. When workgroup size == WAVEFRONT Size,  GEMM_NEEDS_BARRIER need not be defined.
+//     This saves a few milliseconds depending on the problem size.
+//  1. Support for VLOADA, VLOADB and VLOADC in KPRINTF required. Currently, if any one of the matrices
+//     are vector unfriendly, the kernel translates to a completely scalar kernel.
+//     This is pretty easy to implement in KPRINTF.
+//  2. Panel Width == %V in the current implementation. It should be a separate config define
+//     that has to be a multiple of %V.
+//     Currently only NxN kernel has %PANEL support implemented. TN and NT needs to be enhanced.
+//     This will be required for tuning and also for high performance for D,C and ZGEMMs
+//  3. "actualRow" based improvement can be used in KTail Processing as well for NN Kernel
+//  4. A.B^T can be optimized for cases where ITEMY > 4. Successive threads are now ITEMX apart
+//     Instead, we can make them float4 apart to get highest L1 cache bandwidth
+//  5. A.B^T - actualCol, actualRow optimization
+//
+static const char *GEMM_NN_KERNEL = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+__kernel void GEMM_NN__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C,
+                                     uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc,
+                                %TYPE alpha, %TYPE beta
+                                #ifdef TAIL_RUN
+                                , uint tailStartM, uint tailStartN
+                                #endif
+                                )
+{
+    const int V = %V;
+    __global %TYPE const *restrict A;
+    __global %TYPE const *restrict B;
+    __global %TYPE *C = _C + offc;
+    uint K = _K;
+    uint lda, ldb;
+    uint rowA, colA, rowB, colB, rowC, colC;
+    uint numGroupsOnY;
+    uint row, col;
+    uint tid = get_local_id(0);
+    int panel;
+    int ACOLSTART, ACOLEND;
+    uint MV;
+
+    //
+    // %WIDTH - Preferably 16
+    // %ITEMY, %ITEMX - 1 Thread is responsible for %ITEMY * %ITEMX sub-matrix in C
+    //                    %ITEMY must be divisible by %V for NN kernel
+    // The entire workgroup loops-together to complete ITEMY-ITEMX sub-matrix
+    //
+    uint threadsY = %WIDTH;
+    uint threadsX = get_local_size(0)/threadsY;
+
+    //
+    // Column-Major ordering of Workgroups
+    //
+    // %ITEMY - Number of elements , a workitem processes in Y direction.
+    // %ITEMX - Number of elements , a workitem processes in X direction.
+    //
+    // %V     - Vectoring Width
+    // %PANEL(*) - Panel Width to access Rows of A and Columns of B
+    //               Right now, %V is assumed to be the panel width.
+    //               We dont use %PANEL in the current implementation.
+    //
+    MV = M;                                 // Row bound for vector accesses; trimmed below when M_TAIL_PRESENT
+    #ifndef TAIL_RUN
+    {
+        uint bidX, bidY;
+        uint blockDimY;
+
+        #ifdef M_TAIL_PRESENT
+        MV = M - (M % (%V));
+        #endif
+        if (MV == 0)
+        {
+            return;
+        }
+        blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1;      // Number of tiles along the rows of C
+        bidY = ( get_group_id(0) % ( blockDimY));
+        bidX = ( get_group_id(0) / ( blockDimY));
+        //
+        // Note:
+        // Using the new Map function does not yield any performance gain.
+        // In fact, it degraded the performance
+        // Keep this commented.
+        //
+        //mapWorkGroupToTileNumber(M, N, &bidY, &bidX);
+
+        //
+        // <row,col> is the left-top of the TILE region
+        // in the output C matrix that will be determined
+        // by this workgroup
+        //
+        row =  (bidY * (threadsY * %ITEMY));
+        col =  (bidX * (threadsX * %ITEMX));
+    }
+    #else
+    {
+        uint nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsA;
+        uint bidY, bidX;
+
+        if (M == tailStartM)
+        {
+            nWorkGroupsA = 0;               // nWorkGroupsAY/AX stay unset; they are only read when the group id is below nWorkGroupsA
+        } else {
+            nWorkGroupsAY = ((M - tailStartM - 1)/threadsY + 1);
+            nWorkGroupsAX = ((tailStartN - 1)/threadsX + 1);
+            nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX;
+        }
+        if (get_group_id(0) < nWorkGroupsA)
+        {
+            // First set of work-groups: rows starting at tailStartM, columns starting at 0
+            bidY = get_group_id(0) % (nWorkGroupsAY);
+            bidX = get_group_id(0) / nWorkGroupsAY;
+            row = tailStartM + (bidY * threadsY * %ITEMY);
+            col = (bidX * threadsX * %ITEMX);
+        } else {
+            uint nWorkGroupsBY, nWorkGroupsBX;
+
+            // Remaining work-groups: rows starting at 0, columns starting at tailStartN
+            nWorkGroupsBY = ((M-1)/threadsY) + 1;
+            nWorkGroupsBX = ((N-tailStartN-1)/threadsX) + 1;
+            bidY = (get_group_id(0) - nWorkGroupsA) % (nWorkGroupsBY);
+            bidX = (get_group_id(0) - nWorkGroupsA) / nWorkGroupsBY;
+            row = (bidY * threadsY * %ITEMY);
+            col = tailStartN + (bidX * threadsX * %ITEMX);
+        }
+
+    }
+    #endif
+
+    //
+    // ACOLSTART, ACOLEND
+    // SYMM Matrix  multiplication proceeds by multiplying panels on A's block-row
+    // with panels on B's block-column.
+    // However due to symmetric nature of A/B matrix compounded by the fact that
+    // only upper OR lower triangle of the symm matrix is available, vector-loads
+    // are not possible while traversing certain regions of the matrix.
+    // ACOLStart and ACOLEnd - signify what portion of SYMM can be achieved through
+    // this NN kernel. The SYMM handler has to compose the SYMM in-terms of GEMM kernels
+    //
+#ifdef __SYMM_LEFT__
+    // MxM * MxN
+    A = _A + offa;
+    lda = _lda;
+    B = _B + offb;
+    ldb = _ldb;
+    K = M;
+    #ifndef __SYMM_DIAGONAL__
+    #ifdef __SYMM_LOWER__
+    ACOLSTART = 0;
+    ACOLEND = row;
+    #elif defined(__SYMM_UPPER__)
+    ACOLSTART = row + (threadsY*(%ITEMY));
+    ACOLEND = K;
+    #else
+    #error GEMM_NN_KERNEL
+    #endif
+    #else
+        ACOLSTART = row;
+        ACOLEND = row + (threadsY*(%ITEMY));
+    #endif
+    if (ACOLEND > K)
+    {
+        ACOLEND = K;
+    }
+    /*
+    if (get_local_id(0) == 0)
+    {
+        printf(\" GEMM_NN_KERNEL : SYMM_LEFT: Setting ACOLSTART to %d and ACOLEND to %d \\n \" , ACOLSTART, ACOLEND);
+    }
+    */
+#elif defined(__SYMM_RIGHT__)
+    // MxN * NxN -- note that the A and B operands (and their leading dimensions) are swapped here
+    A = _B + offb;
+    lda = _ldb;
+    B = _A + offa;
+    ldb = _lda;
+    K = N;
+    #ifndef __SYMM_DIAGONAL__
+    #ifdef __SYMM_UPPER__
+    ACOLSTART = 0;
+    ACOLEND = col;
+    #elif defined(__SYMM_LOWER__)
+    ACOLSTART =  col + (threadsX*(%ITEMX));
+    ACOLEND = K;
+    #else
+    #error GEMM_NN_KERNEl
+    #endif
+#else
+        ACOLSTART = col;
+        ACOLEND =  col + (threadsX*(%ITEMX));
+    #endif
+    if (ACOLEND > K)
+    {
+        ACOLEND = K;
+    }
+#else
+    A = _A + offa;
+    B = _B + offb;
+    K = _K;
+    lda = _lda;
+    ldb = _ldb;
+    ACOLSTART = 0;
+    ACOLEND = K;
+#endif
+
+    uint offsetY = (tid % threadsY) * %V;       // Row offset of this work-item inside the tile
+    uint offsetX = (tid / threadsY) * %ITEMX;   // Column offset of this work-item inside the tile
+    rowA     =     row + offsetY;
+       colB     =     (col+offsetX);
+    #ifndef TAIL_RUN
+    bool tailBlock = ((row  >= M) || (col >= N));
+    #else
+    bool tailBlock = (row >= tailStartM);
+    #endif
+
+
+    /*
+    #ifdef TAIL_RUN
+    if ((rowA >= M) || (colB >= N))
+    {
+        return;
+    }
+    #endif
+    */
+
+    #ifndef TAIL_RUN
+    // Non-tail RUN
+    if (tailBlock == true)
+    {
+        return;
+    }
+    #elif defined(TAIL_RUN)
+    // TAIL RUN
+    if (tailBlock == false)
+    {
+        return;
+    }
+    #else
+    #error GEMM_NN_KERNEL
+    #endif
+
+    %TYPE%V AVAL[%V][(%ITEMY_BY_V)]; // 8
+    #ifdef COMPLEX
+        %TYPE%HV AVALEVEN[%V][(%ITEMY_BY_V)]; // 8
+        %TYPE%HV AVALODD[%V][(%ITEMY_BY_V)]; // 8
+    #endif
+
+    %TYPE%V   BVAL[%ITEMX];
+    #ifdef COMPLEX
+        %TYPE%HV   BVALEVEN[%ITEMX];
+        %TYPE%HV   BVALODD[%ITEMX];
+    #endif
+
+    %TYPE%V CVAL[(%ITEMY_BY_V)][%ITEMX];
+    #ifdef COMPLEX
+        %TYPE%HV CVALEVEN[(%ITEMY_BY_V)][%ITEMX];
+        %TYPE%HV CVALODD[(%ITEMY_BY_V)][%ITEMX];
+    #endif
+
+    // Clear the accumulators
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i< (%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            CVAL[i][j] = (%TYPE%V) 0;
+            #ifdef COMPLEX
+                CVALEVEN[i][j] = (%TYPE%HV) 0;
+                CVALODD[i][j] = (%TYPE%HV) 0;
+            #endif
+        }
+    }
+
+    uint ACOL;                                  // Declared outside the loop: the scalar loop further down resumes from it
+    for(ACOL=ACOLSTART; ((ACOL+ %V -1) < ACOLEND); ACOL += %V)
+    {
+        {
+            //
+            // Load B values
+            //
+            %IF(%ITEMX) #pragma unroll %ITEMX
+            for(uint bcol = 0; bcol < %ITEMX; bcol++)
+            {
+                #ifdef N_TAIL_PRESENT
+                uint actualCol;
+                actualCol = ((colB + bcol) >= N) ? (N-1) : (colB + bcol);   // Clamp so the load stays inside B
+                #endif
+
+                #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_LEFT__)
+                    #ifndef N_TAIL_PRESENT
+                        BVAL[bcol] = %VLOAD(0, (&B[ACOL + (colB + bcol)*ldb]));
+                    #else
+                        BVAL[bcol] = %VLOAD(0, (&B[ACOL + (actualCol)*ldb]));
+                    #endif
+                #else
+                    // defined(__SYMM_DIAGONAL__) && defined(__SYMM_RIGHT__)
+                    #ifndef N_TAIL_PRESENT
+                        BVAL[bcol] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ACOL, (colB + bcol));
+                    #else
+                        BVAL[bcol] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ACOL, actualCol);
+                    #endif
+                #endif
+                //
+                // If Complex data, load the real and imaginary parts into separate register banks
+                //
+                #ifdef COMPLEX
+                    BVALEVEN[bcol] = BVAL[bcol].even;
+                    BVALODD[bcol] =  BVAL[bcol].odd;
+                #endif
+            }
+        }
+
+        {
+            //
+            // Load A values
+            //
+            //
+            // PENDING BUG FIX: Unroll Factor should be according to PANEL Size
+            //                 Previously PANEL was size of V. So ITEMY worked
+            // Current Workaround - Panel same as %V - See gemm_cached.cpp
+            //
+            %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+            for(uint j=0; j< (%ITEMY_BY_V); j++)
+            {
+                #pragma unroll %V
+                for(uint i = 0; i < %V; i++)    // AVAL[i][j]: i selects the column (ACOL + i), j selects the row chunk
+                {
+                    uint actualRow;
+
+                    #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_RIGHT__)
+                        #ifndef M_TAIL_PRESENT
+                            AVAL[i][j] = %VLOAD(0, (&A[(rowA + j*threadsY*(V)) + (ACOL + i)*lda]) );
+                        #else
+                            actualRow = ((rowA + j*threadsY*(V)) >= MV) ? (MV-%V) : (rowA + j*threadsY*(V));
+                            AVAL[i][j] = %VLOAD(0, (&A[actualRow + (ACOL + i)*lda]) );
+                        #endif
+                    #else
+                        // CASE: SYMM_DIAGONAL && SYMM_LEFT
+                        #ifndef M_TAIL_PRESENT
+                            //AVAL[c][r] = %VLOAD(0, (&A[(rowA + r*threadsY*(V)) + (ACOL + c)*lda]) );
+                            AVAL[i][j] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, (rowA + j*threadsY*(V)) ,  (ACOL + i));
+                        #else
+                            actualRow = ((rowA + j*threadsY*(V)) >= MV) ? (MV-%V) : (rowA + j*threadsY*(V));
+                            AVAL[i][j] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, actualRow, (ACOL + i));
+                        #endif
+                    #endif
+                    //
+                    // If Complex data, load the real and imaginary parts into separate register banks
+                    //
+                    #ifdef COMPLEX
+                        AVALEVEN[i][j] = AVAL[i][j].even;
+                        AVALODD[i][j] = AVAL[i][j].odd;
+                    #endif
+                }
+            }
+        }
+
+        {
+            %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+            for(uint i=0; i<(%ITEMY_BY_V); i++)
+            {
+                %IF(%ITEMX) #pragma unroll %ITEMX
+                for(uint j=0; j<(%ITEMX); j++)
+                {
+                    #ifndef COMPLEX
+                        %VFOR_REAL
+                        {
+                            CVAL[i][j] = mad(AVAL[%VFORINDEX][i], BVAL[j]%VFORSUFFIX, CVAL[i][j]);
+                        }
+                    #else
+                        //
+                        // Pending - Replace by %COMPLEX_VMAD()
+                        //
+                        %VFOR_REAL
+                        {
+                            //
+                            // PENDING Needs a FIX
+                            //
+                            CVALEVEN[i][j]  = mad(AVALEVEN[%VFORINDEX][i],  BVALEVEN[j]%VFORSUFFIX, CVALEVEN[i][j]);
+                            CVALODD[i][j]   = mad(AVALEVEN[%VFORINDEX][i],  BVALODD[j]%VFORSUFFIX,  CVALODD[i][j]);
+                            CVALEVEN[i][j]  = mad(AVALODD[%VFORINDEX][i],   -BVALODD[j]%VFORSUFFIX,  CVALEVEN[i][j]);
+                            CVALODD[i][j]   = mad(AVALODD[%VFORINDEX][i],   BVALEVEN[j]%VFORSUFFIX,  CVALODD[i][j]);
+                        }
+                    #endif
+                }
+            }
+        }
+
+        #ifdef GEMM_NEEDS_BARRIER
+        barrier(CLK_LOCAL_MEM_FENCE);
+        #endif
+    }
+
+    // Complex case: fold the separately accumulated even/odd banks back into CVAL
+    #ifdef COMPLEX
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i< (%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            %COMPLEX_JOIN(CVAL[i][j], CVALEVEN[i][j], CVALODD[i][j]);
+        }
+    }
+    #endif
+
+    //
+    // Tail blocks never execute this FOR loop as they execute with Vector Width of 1
+    //
+    // Leftover columns (fewer than a full vector) are handled one at a time,
+    // resuming from the ACOL value left by the loop above.
+    for(; ACOL < ACOLEND; ACOL ++)
+    {
+        //
+        // Load B values
+        //
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint bcol = 0; bcol < %ITEMX; bcol++)
+        {
+            //
+            // PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+            //
+            #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_LEFT__)
+            {
+                %TYPE SCAL;
+                #ifndef N_TAIL_PRESENT
+                    SCAL = B[ACOL + (colB + bcol)*ldb];
+                    BVAL[bcol] = %VMAKEVEC(SCAL);
+                #else
+                    SCAL = B[ACOL + ((colB + bcol)%(N))*ldb];       // Column wrapped modulo N so the load stays inside B
+                    BVAL[bcol] = %VMAKEVEC(SCAL);
+                #endif
+            }
+           #else
+                // SYMM_DIAGONAL && SYMM_RIGHT
+            {
+                %TYPE SCAL;
+
+                #ifndef N_TAIL_PRESENT
+                    SCAL = SYMM_SCALAR_LOAD(B, N, ldb, ACOL,  (colB + bcol));
+                    BVAL[bcol] = %VMAKEVEC(SCAL);
+                #else
+                    SCAL = SYMM_SCALAR_LOAD(B, N, ldb, ACOL, ((colB + bcol)%(N)));
+                    BVAL[bcol] = %VMAKEVEC(SCAL);
+                #endif
+            }
+           #endif
+        }
+
+        //
+        // Load A values
+        //
+        %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+        for(uint i = 0; i < (%ITEMY_BY_V); i++) // 1 * ITEMY/V
+        {
+            #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_RIGHT__)
+            #ifndef M_TAIL_PRESENT
+            AVAL[0][i] = %VLOAD(0, (&A[(rowA + i*threadsY*(V)) + (ACOL)*lda]) );
+            #else
+            AVAL[0][i] = %VLOAD(0, (&A[(((rowA + i*threadsY*(V))) % (MV)) + (ACOL)*lda]) );
+            #endif
+            #else
+                // defined(DIAGONAL) && (LEFT)
+                #ifndef M_TAIL_PRESENT
+                AVAL[0][i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, (rowA + i*threadsY*(V)) , (ACOL));
+                #else
+                AVAL[0][i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ((rowA + i*threadsY*(V)) % (MV)), (ACOL));
+                #endif
+            #endif
+        }
+
+        {
+            %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+            for(uint i=0; i<(%ITEMY_BY_V); i++)
+            {
+                %IF(%ITEMX) #pragma unroll %ITEMX
+                for(uint j=0; j<(%ITEMX); j++)
+                {
+                    %VMAD(CVAL[i][j] ,  AVAL[0][i] , BVAL[j]);
+                }
+            }
+        }
+    }
+
+
+    /*
+    if ((get_group_id(0) == 0) && (get_local_id(0) == 0))
+    {
+        printf(\"Updating C Matrix: Alpha = %f, Beta = %f\\n\", alpha, beta);
+    }
+    */
+    //
+    // STORE Result in C
+    //
+    %TYPE%V reg , betareg, alphareg;
+    %TYPE%V alphav, betav;
+    alphav = %VMAKEVEC(alpha);
+    betav = %VMAKEVEC(beta);
+
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i< (%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
+            reg = %VLOAD(0, (&C[rowA + i*threadsY*V +     (colB+j)*ldc]));
+            %VMUL(betareg, betav, reg);
+            %VMUL(alphareg, alphav, CVAL[i][j]);
+            %ADD( reg, betareg, alphareg);
+            %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc]));
+            #else
+                if (((rowA + i*threadsY*V) < MV) && ((colB + j) < N))   // Tails present: only touch C inside the MV x N region
+                {
+                    reg = %VLOAD(0, (&C[rowA + i*threadsY*V +     (colB+j)*ldc]));
+                    %VMUL(betareg, betav, reg);
+                    %VMUL(alphareg, alphav, CVAL[i][j]);
+                    %ADD( reg, betareg, alphareg);
+                    %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc]));
+        }               // Closes the bounds-check if
+            #endif
+    }                   // Closes the j loop
+    }                   // Closes the i loop
+    return;
+}
+";
+
+static const char *GEMM_NT_KERNEL = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+// GEMM_NT__KERNEL: each work-item accumulates its block of C in the private CVAL array and then
+// stores it back combined with alpha and beta. Disabled debug aids: #undef COMPLEX, cl_amd_printf pragma.
+__kernel void GEMM_NT__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C,
+                                uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc,
+                                %TYPE alpha, %TYPE beta
+                                #ifdef TAIL_RUN
+                                , uint tailStartM, uint tailStartN
+                                #endif
+                                )
+{
+    const int V = %V;
+    __global %TYPE const *restrict A;
+    __global %TYPE const *restrict B;
+    __global %TYPE *C = _C + offc;    // C is addressed relative to the element offset offc
+    uint K = _K;
+    uint lda, ldb;
+    uint rowA, colA, rowB, colB, rowC, colC;    // only rowA and colB are used in this kernel
+    uint numGroupsOnY;    // not used in this kernel
+    uint row, col;    // top-left corner of the C tile handled by this work-group
+    uint tid = get_local_id(0);
+    int panel;    // not used: the loops below declare their own panel index
+    int ACOLSTART, ACOLEND;    // range [ACOLSTART, ACOLEND) of the inner index ACOL walked below
+    uint MV, NV;
+
+    // Work-group geometry:
+    // %WIDTH - Number of work-items along Y (rows of C); preferably 16
+    // %ITEMY, %ITEMX - 1 work-item is responsible for a %ITEMY * %ITEMX sub-matrix in C
+    //                    %ITEMY and %ITEMX must be divisible by %V for the NT kernel
+    // The entire work-group loops together to complete its ITEMY-ITEMX sub-matrices
+    //
+    uint threadsY = %WIDTH;
+    uint threadsX = get_local_size(0)/threadsY;
+
+    //
+    // Column-Major ordering of Workgroups
+    //
+    // %ITEMY - Number of elements a work-item processes in the Y direction.
+    // %ITEMX - Number of elements a work-item processes in the X direction.
+    //
+    // %V     - Vectoring Width
+    // %PANEL(*) - Panel Width to access Rows of A and Columns of B
+    //               Right now, %V is assumed to be the panel width.
+    //               We do not use %PANEL in the current implementation.
+    //
+    MV = M;    // MV/NV: row/column limits used by the wrap-around loads and the store guards below
+    NV = N;
+    #ifndef TAIL_RUN
+    {
+        uint bidX, bidY;
+        uint blockDimY;
+
+        #ifdef M_TAIL_PRESENT
+        MV = M - (M % (%V));
+        if (MV == 0)
+        {
+            return;
+        }
+        #endif
+        #ifdef N_TAIL_PRESENT
+        NV = N - (N% (%V));
+        if (NV == 0)
+        {
+            return;
+        }
+        #endif
+        blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1;
+        uint blockID = get_group_id(0);
+        getBlockNumber(blockDimY, blockID, &bidY, &bidX, 1);
+
+        //
+        // <row,col> is the top-left corner of the TILE region
+        // in the output C matrix that will be computed
+        // by this work-group
+        //
+        row =  (bidY * (threadsY * %ITEMY));
+        col =  (bidX * (threadsX * %ITEMX));
+    }
+    #else
+    {
+        uint nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsA;
+        uint bidY, bidX;
+
+        MV = M;    // groups below nWorkGroupsA cover rows from tailStartM on; the rest cover columns from tailStartN on
+        if (M == tailStartM)
+        {
+            nWorkGroupsA = 0;
+        } else {
+            nWorkGroupsAY = ((M - tailStartM - 1)/threadsY + 1);
+            nWorkGroupsAX = ((tailStartN - 1)/threadsX + 1);
+            nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX;
+        }
+        if (get_group_id(0) < nWorkGroupsA)
+        {
+            bidY = get_group_id(0) % (nWorkGroupsAY);
+            bidX = get_group_id(0) / nWorkGroupsAY;
+            row = tailStartM + (bidY * threadsY * %ITEMY);
+            col = (bidX * threadsX * %ITEMX);
+            NV = tailStartN;
+        } else {
+            uint nWorkGroupsBY, nWorkGroupsBX;
+
+            nWorkGroupsBY = ((M-1)/threadsY) + 1;
+            nWorkGroupsBX = ((N-tailStartN-1)/threadsX) + 1;
+            bidY = (get_group_id(0) - nWorkGroupsA) % (nWorkGroupsBY);
+            bidX = (get_group_id(0) - nWorkGroupsA) / nWorkGroupsBY;
+            row = (bidY * threadsY * %ITEMY);
+            col = tailStartN + (bidX * threadsX * %ITEMX);
+            NV = N;
+        }
+
+    }
+    #endif
+
+    //
+    // ACOLSTART, ACOLEND
+    // SYMM matrix multiplication proceeds by multiplying panels on the block-row of A
+    // with panels on the block-column of B.
+    // However, because the A matrix is symmetric and only its upper OR lower
+    // triangle is available, vector loads are not possible while traversing
+    // certain regions of the matrix.
+    // ACOLSTART and ACOLEND delimit the portion of the SYMM that can be computed by
+    // this NT kernel. The SYMM handler has to compose the SYMM in terms of GEMM kernels.
+    //
+#ifdef __SYMM_LEFT__
+    #error GEMM_NT_KERNEL Should not be called in __SYMM_LEFT__ case!
+#elif defined(__SYMM_RIGHT__)
+    // MxN * NxN
+    A = _B + offb;
+    lda = _ldb;
+    B = _A + offa;
+    ldb = _lda;
+    K = N;
+    #ifndef __SYMM_DIAGONAL__
+    #ifdef __SYMM_UPPER__
+    ACOLSTART =  col + (threadsX*(%ITEMX));
+    ACOLEND = K;
+    #elif defined(__SYMM_LOWER__)
+    ACOLSTART = 0;
+    ACOLEND = col;
+    #else
+    #error GEMM_NT_KERNEL : Neither SYMM_UPPER nor SYMM_LOWER is defined!
+    #endif
+    #else
+        ACOLSTART = col;
+        ACOLEND =  col + (threadsX*(%ITEMX));
+    #endif
+    if (ACOLEND > K)
+    {
+        ACOLEND = K;
+    }
+#else // GEMM
+    A = _A + offa;
+    B = _B + offb;
+    K = _K;
+    lda = _lda;
+    ldb = _ldb;
+    ACOLSTART = 0;
+    ACOLEND = K;
+#endif
+
+    uint offsetY = (tid % threadsY) * %V;
+    uint offsetX = (tid / threadsY) * %ITEMX;
+    rowA     =     row + offsetY;
+       colB     =     col + offsetX;
+    #ifndef TAIL_RUN
+    bool tailBlock = ((row >= M) || (col >= N));
+    #else
+    bool tailBlock = ((row >= tailStartM) || (col >= tailStartN));
+    #endif
+
+    /* Should be handled with TAIL_PRESENT Macros.
+    if ((rowA >= M) || (colB >= N))
+    {
+        return;
+    }
+    */
+
+    #ifndef TAIL_RUN
+    // Non-tail run: a work-group whose tile origin lies outside C has nothing to do
+    if (tailBlock == true)
+    {
+        return;
+    }
+    #else
+    // Tail run - per the original author, this early exit never happens.
+    if (tailBlock == false)
+    {
+        return;
+    }
+    #endif
+
+    %TYPE%V CVAL[(%ITEMY_BY_V)][%ITEMX];
+    #ifdef COMPLEX
+    %TYPE%HV    CVALEVEN[(%ITEMY_BY_V)][%ITEMX];
+    %TYPE%HV    CVALODD[(%ITEMY_BY_V)][%ITEMX];
+    #endif
+
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i< (%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            CVAL[i][j] = (%TYPE%V) 0;
+            #ifdef COMPLEX
+            CVALEVEN[i][j] = (%TYPE%HV) 0;
+            CVALODD[i][j] = (%TYPE%HV) 0;
+            #endif
+        }
+    }
+
+    uint ACOL;
+    for(ACOL=ACOLSTART; ((ACOL+%V-1) < ACOLEND); ACOL += %V /* %PANEL */)
+    {
+        %TYPE%V AVAL[%V][(%ITEMY_BY_V)];     // [%PANEL][%ITEMY_BY_V]
+        %TYPE%V BVAL[%ITEMX_BY_V][%V];        // [%PANEL][%ITEMX]
+        #ifdef COMPLEX
+        %TYPE%HV    AVALEVEN[%V][(%ITEMY_BY_V)];     // [%PANEL][%ITEMY_BY_V]
+        %TYPE%HV    AVALODD[%V][(%ITEMY_BY_V)];     // [%PANEL][%ITEMY_BY_V]
+        %TYPE%HV    BVALEVEN[%ITEMX_BY_V][%V];        // [%PANEL][%ITEMX]
+        %TYPE%HV    BVALODD[%ITEMX_BY_V][%V];        // [%PANEL][%ITEMX]
+        #endif
+
+        {
+            // Load B values: one vector per (bcol, panel) pair.
+            // With N_TAIL_PRESENT the column index is wrapped modulo NV;
+            // with __SYMM_DIAGONAL__ the SYMM_VECTOR_LOAD_USING_SCALAR helper replaces the plain vector load.
+            %IF(%V) #pragma unroll %V
+            for(uint panel=0; panel < %V; panel++)
+            {
+                %IF(%ITEMX_BY_V) #pragma unroll %ITEMX_BY_V
+                for(uint bcol = 0; bcol < %ITEMX_BY_V; bcol++)
+                {
+                    //
+                    // PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+                    //
+                    #ifndef __SYMM_DIAGONAL__
+                        #ifndef N_TAIL_PRESENT
+                        BVAL[bcol][panel] = %VLOAD(0, (&B[(ACOL + panel)*ldb + (colB + bcol*(V))]));
+                        #else
+                        BVAL[bcol][panel] = %VLOAD(0, (&B[(ACOL + panel)*ldb + ((colB + bcol*V) % NV)]));
+                        #endif
+                    #else
+                        #ifndef N_TAIL_PRESENT
+                        BVAL[bcol][panel] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, (colB + bcol*(V)), (ACOL + panel));
+                        #else
+                        BVAL[bcol][panel] =
+                            SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ((colB + bcol*V) % NV), (ACOL + panel));
+                        #endif
+                    #endif
+
+                    #ifdef CONJUGATE_B
+                        %TYPE%V conjTemp = BVAL[bcol][panel];
+                        %CONJUGATE(1, conjTemp);
+                        BVAL[bcol][panel] = conjTemp;
+                    #endif
+                    #ifdef COMPLEX
+                    {
+                        BVALEVEN[bcol][panel] = BVAL[bcol][panel].even;
+                        BVALODD[bcol][panel]  = BVAL[bcol][panel].odd;
+                    }
+                    #endif
+                }
+            }
+
+            // Load A values: the flat index i is split into panel column c and row block r.
+            // Consecutive row blocks of one work-item are threadsY*V rows apart;
+            // with M_TAIL_PRESENT the row index is wrapped modulo MV.
+            %IF(%ITEMY) #pragma unroll %ITEMY
+            for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+            {
+                const uint yiterations = %ITEMY_BY_V;
+                uint c = (i / yiterations);
+                uint r = (i % yiterations);
+
+                #ifndef M_TAIL_PRESENT
+                AVAL[c][r] = %VLOAD(0, (&A[(rowA + r*threadsY*(V)) + (ACOL + c)*lda]) );
+                #else
+                AVAL[c][r] = %VLOAD(0, (&A[((rowA + r*threadsY*(V)) % MV) + (ACOL + c)*lda]) );
+                #endif
+
+                #ifdef COMPLEX
+                AVALEVEN[c][r] = AVAL[c][r].even;
+                AVALODD[c][r] = AVAL[c][r].odd;
+                #endif
+            }
+        }
+
+        %IF(%V) #pragma unroll %V
+        for(uint panel=0; panel<(%V); panel++)
+        {
+            %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+            for(uint i=0; i<(%ITEMY_BY_V); i++)
+            {
+                %IF(%ITEMX_BY_V) #pragma unroll %ITEMX_BY_V
+                for(uint j=0; j<(%ITEMX_BY_V); j++)
+                {
+                    const int CX = j * (%V);
+
+                    #ifndef COMPLEX
+                    %VFOR_REAL
+                    {
+                        CVAL[i][CX + %VFORINDEX] = mad(AVAL[panel][i], BVAL[j][panel]%VFORSUFFIX,
+                                                        CVAL[i][CX + %VFORINDEX]);
+                    }
+                    #else
+                        // Accumulates even += ae*be - ao*bo and odd += ae*bo + ao*be
+                        // (a complex product, presumably with even/odd lanes = real/imaginary parts).
+                        // PENDING: Replace with %COMPLEX_MAD op
+                        %VFOR_REAL
+                        {
+                            CVALEVEN[i][CX + %VFORINDEX] =
+                                mad(AVALEVEN[panel][i], BVALEVEN[j][panel]%VFORSUFFIX, CVALEVEN[i][CX + %VFORINDEX]);
+                            CVALODD[i][CX + %VFORINDEX]  =
+                                mad(AVALEVEN[panel][i], BVALODD[j][panel]%VFORSUFFIX,  CVALODD[i][CX + %VFORINDEX]);
+                            CVALEVEN[i][CX + %VFORINDEX] =
+                                mad(AVALODD[panel][i], -BVALODD[j][panel]%VFORSUFFIX,  CVALEVEN[i][CX + %VFORINDEX]);
+                            CVALODD[i][CX + %VFORINDEX] =
+                                mad(AVALODD[panel][i], BVALEVEN[j][panel]%VFORSUFFIX,  CVALODD[i][CX + %VFORINDEX]);
+                        }
+                    #endif
+                }
+            }
+        }
+
+        #ifdef GEMM_NEEDS_BARRIER
+        barrier(CLK_LOCAL_MEM_FENCE);
+        #endif
+    }
+
+    #ifdef COMPLEX
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i< (%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            %COMPLEX_JOIN(CVAL[i][j], CVALEVEN[i][j], CVALODD[i][j]);
+        }
+    }
+    #endif
+
+    // Remainder loop: consumes the ACOL values left over by the vectorized loop above, one at a time.
+    // Tail blocks never execute this FOR loop as they execute with Vector Width of 1
+    //
+
+    for(; ACOL < ACOLEND; ACOL ++)
+    {
+        %TYPE%V AVAL[(%ITEMY_BY_V)];    // [%PANEL][%ITEMY_BY_V]
+        %TYPE   BVAL[%ITEMX];               // [%PANEL][%ITEMX]
+
+        //
+        // Load B values (one scalar per output column of this work-item)
+        //
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint bcol = 0; bcol < %ITEMX; bcol++)
+        {
+            %TYPE SCALAR;
+            //
+            // PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+            //
+            {
+                #ifndef __SYMM_DIAGONAL__
+                    #ifndef N_TAIL_PRESENT
+                        SCALAR = B[ACOL*ldb + (colB + bcol)];
+                    #else
+                        SCALAR = B[ACOL*ldb + ((colB + bcol) % NV)];
+                    #endif
+                #else
+                    #ifndef N_TAIL_PRESENT
+                        SCALAR = SYMM_SCALAR_LOAD(B, N, ldb, (colB + bcol), ACOL );
+                    #else
+                        SCALAR = SYMM_SCALAR_LOAD(B, N, ldb, ((colB + bcol) % NV), ACOL);
+                    #endif
+                #endif
+
+                #ifdef CONJUGATE_B
+                    %CONJUGATE(1, SCALAR);
+                #endif
+                BVAL[bcol] = (SCALAR);
+            }
+        }
+
+        //
+        // Load A values (one vector per row block of this work-item)
+        //
+        %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+        for(uint i = 0; i < (%ITEMY_BY_V); i++) // 1 * ITEMY/V
+        {
+            #ifndef M_TAIL_PRESENT
+            AVAL[i] = %VLOAD(0, (&A[(rowA + i*threadsY*(V)) + (ACOL)*lda]) );
+            #else
+            AVAL[i] = %VLOAD(0, (&A[((rowA + i*threadsY*(V)) % MV) + (ACOL)*lda]) );
+            #endif
+        }
+
+        {
+            %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+            for(uint i=0; i<(%ITEMY_BY_V); i++)
+            {
+                %IF(%ITEMX) #pragma unroll %ITEMX
+                for(uint j=0; j<(%ITEMX); j++)
+                {
+                    %VMAD(CVAL[i][j] ,  AVAL[i] , BVAL[j]);
+                }
+            }
+        }
+    }
+
+    // STORE Result in C: the old C value is combined with beta and the accumulated CVAL with alpha,
+    // and the two are added (as the VMUL/ADD template operation names suggest).
+    // With HERK defined, only one triangle is written and diagonal entries get a zero odd component.
+    %TYPE%V reg , betareg, alphareg;
+    %TYPE%V alphav, betav;
+    alphav = %VMAKEVEC(alpha);
+    betav = %VMAKEVEC(beta);
+
+    #ifndef HERK
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i< (%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
+            reg = %VLOAD(0, (&C[rowA + i*threadsY*V +     (colB+j)*ldc]));
+            %VMUL(betareg, betav, reg);
+            %VMUL(alphareg, alphav, CVAL[i][j]);
+            %ADD( reg, betareg, alphareg);
+            %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc]));
+            #else
+                if (((rowA + i*threadsY*V) < MV) && ((colB+j) < NV))
+                {
+                    reg = %VLOAD(0, (&C[rowA + i*threadsY*V +     (colB+j)*ldc]));
+                    %VMUL(betareg, betav, reg);
+                    %VMUL(alphareg, alphav, CVAL[i][j]);
+                    %ADD( reg, betareg, alphareg);
+                    %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc]));
+        }
+            #endif
+    }
+    }
+    #else
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i<(%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            int actualRow = rowA + i*threadsY*V;
+            int actualCol = colB + j;
+            #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
+                {
+                    %VMUL(alphareg, alphav, CVAL[i][j]);
+                    //%TYPE temp[%V];
+                    //*(__private %TYPE%V *)(&temp) = alphareg;
+                    //#pragma unroll %V
+                    //for(uint r = 0; r < %V; r++)
+                    %VFOR
+                    {
+                        #ifdef HERK_LOWER_TRIANGLE
+                        if((actualRow + %VFORINDEX) >= (actualCol))
+                        #else
+                        if((actualRow + %VFORINDEX) <= (actualCol))
+                        #endif
+                        {
+                            %TYPE C_s =  C[%VFORINDEX + actualRow + actualCol * ldc];
+                            %TYPE beta_s;
+                            %MUL(beta_s, beta, C_s);
+                            C_s = alphareg%VFORSUFFIX + beta_s;
+                            if((%VFORINDEX + actualRow) == actualCol)
+                            {
+                                 C_s.odd = 0.0f;    // diagonal element: zero the odd component (presumably the imaginary part)
+                            }
+                            C[%VFORINDEX + actualRow + actualCol * ldc] = C_s;
+                        }
+                    }
+                }
+            #else
+                {
+                    if (((rowA + i*threadsY*V) < MV) && ((colB+j) < NV))
+                    {
+                        %VMUL(alphareg, alphav, CVAL[i][j]);
+                        //%TYPE temp[%V];
+                        //*(__private %TYPE%V *)(&temp) = alphareg;
+                        //#pragma unroll %V
+                        //for(uint r = 0; r < %V; r++)
+                        %VFOR
+                        {
+                            #ifdef HERK_LOWER_TRIANGLE
+                            if((%VFORINDEX + actualRow) >= (actualCol))
+                            #else
+                            if((%VFORINDEX + actualRow) <= (actualCol))
+                            #endif
+                            {
+                                %TYPE C_s =  C[%VFORINDEX + actualRow + actualCol * ldc];
+                                %TYPE beta_s;
+                                %MUL(beta_s, beta, C_s);
+                                C_s = alphareg%VFORSUFFIX + beta_s;
+                                if((%VFORINDEX + actualRow) == actualCol)
+                                {
+                                    C_s.odd = 0.0f;    // diagonal element: zero the odd component (presumably the imaginary part)
+                                }
+                                C[%VFORINDEX + actualRow + actualCol * ldc] = C_s;
+                            }
+                        }
+                    }
+                }
+            #endif
+        }
+    }
+    #endif
+    return;
+}
+";
+
+
+static const char *GEMM_TN_KERNEL = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+__kernel void GEMM_TN__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C,
+                                     uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc,
+                                %TYPE alpha, %TYPE beta
+                                #ifdef TAIL_RUN
+                                , uint tailStartM, uint tailStartN
+                                #endif
+                                )
+{
+    const int V = %V;
+    const int ITEMY = %ITEMY;
+    __global %TYPE const *restrict A;
+    __global %TYPE const *restrict B;
+    __global %TYPE *C = _C + offc;
+    uint K = _K;
+    uint lda, ldb;
+    uint rowA, colA, rowB, colB, rowC, colC;
+    uint numGroupsOnY;
+    uint row, col;
+    uint tid = get_local_id(0);
+    int panel;
+    int ACOLSTART, ACOLEND;
+    uint MV, bidX;
+    uint bidY;
+    uint blockDimX;
+
+    //
+    // %WIDTH - Preferably 16
+    // %ITEMY, %ITEMX - 1 Thread is responsible for %ITEMY * %ITEMX sub-matrix in C
+    //                    %ITEMY must be divisible by %V for NN kernel
+    // The entire workgroup loops-together to complete ITEMY-ITEMX sub-matrix
+    //
+    uint threadsY = %WIDTH;
+    uint threadsX = get_local_size(0)/threadsY;
+
+    //
+    // Row-Major ordering of Workgroups
+    //
+    // %ITEMY - Number of elements , a workitem processes in Y direction.
+    // %ITEMX - Number of elements , a workitem processes in X direction.
+    //
+    // %V     - Vectoring Width
+    // %PANEL(*) - Panel Width to access Rows of A and Columns of B
+    //               Right now, %V is assumed to be the panel width.
+    //               We dont use %PANEL in the current implementation.
+    //
+    MV = M;
+    #ifndef TAIL_RUN
+    {
+
+        blockDimX = ((N-1) / (threadsX * %ITEMX)) + 1;
+        uint blockID = get_group_id(0);
+        getBlockNumber(blockDimX, blockID, &bidY, &bidX, 0);
+
+        //
+        // <row,col> is the left-top of the TILE region
+        // in the output C matrix that will be determined
+        // by this workgroup
+        //
+        row =  (bidY * (threadsY * %ITEMY));
+        col =  (bidX * (threadsX * %ITEMX));
+    }
+    #else
+    #error GEMM_TN_KERNEL: TAIL_RUN is NOT needed for TN Kernel!
+    #endif
+
+    //
+    // ACOLSTART, ACOLEND
+    // SYMM Matrix  multiplication proceeds by multiplying panels on A's block-row
+    // with panels on B's block-column.
+    // However due to symmetric nature of A/B matrix compounded by the fact that
+    // only upper OR lower triangle of the symm matrix is available, vector-loads
+    // are not possible while traversing certain regions of the matrix.
+    // ACOLStart and ACOLEnd - signify what portion of SYMM can be achieved through
+    // this TN kernel. The SYMM handler has to compose the SYMM in-terms of GEMM kernels
+    // SYMMETRIC LOAD routines are used when traversing the diaognal region wherease normal rules
+    // hold good otherwise.
+    //
+#ifdef __SYMM_LEFT__
+    // MxM * MxN
+    A = _A + offa;
+    lda = _lda;
+    B = _B + offb;
+    ldb = _ldb;
+    K = M;
+    #ifndef __SYMM_DIAGONAL__
+    #ifdef __SYMM_LOWER__
+    ACOLSTART = row + (threadsY * %ITEMY);
+    ACOLEND = K;
+        /*
+        if (get_local_id(0) == 0)
+        {
+            printf(\"GEMM_TN_KERNEL: SYMM_LOWER: Setting ACOLSTART to %d, ACOLEND = %d\\n\", ACOLSTART, ACOLEND);
+        }
+        */
+    #elif defined(__SYMM_UPPER__)
+    ACOLSTART = 0;
+    ACOLEND = row;
+    #else
+    #error GEMM_TN_KERNEL
+    #endif
+    #else
+        ACOLSTART = row;
+        ACOLEND = row + (threadsY * %ITEMY);
+    #endif
+    if (ACOLEND > K)
+    {
+        ACOLEND = K;
+    }
+#elif defined(__SYMM_RIGHT__)
+    // MxN * NxN
+    #error GEMM_TN_KERNEL: Internal Error: Should not be called in SYMM_RIGHT case! Right is Wrong!
+#else
+    // GEMM Case
+    A = _A + offa;
+    B = _B + offb;
+    K = _K;
+    lda = _lda;
+    ldb = _ldb;
+    ACOLSTART = 0;
+    ACOLEND = K;
+#endif
+
+    uint offsetX = (tid % threadsX) * %ITEMX;
+    uint offsetY = (tid / threadsX) * %ITEMY;
+    rowA     =     (row + offsetY);
+    colB     =     (col + offsetX);
+    #ifndef TAIL_RUN
+    bool tailBlock = ((row  >= M) || (col >= N));
+    #else
+    #error GEMM_TN_KERNEL: No TAIL_RUN for TN case
+    #endif
+
+    %TYPE%V AVAL[%ITEMY]; // %ITEMY * %PANEL
+    #ifdef COMPLEX
+    %TYPE%HV AVALEVEN[%ITEMY]; // %ITEMY * %PANEL
+    %TYPE%HV AVALODD[%ITEMY]; // %ITEMY * %PANEL
+    #endif
+
+    %TYPE%V BVAL[%ITEMX];
+    #ifdef COMPLEX
+    %TYPE%HV BVALEVEN[%ITEMX]; // %ITEMY * %PANEL
+    %TYPE%HV BVALODD[%ITEMX]; // %ITEMY * %PANEL
+    #endif
+
+    %TYPE   CVAL[%ITEMY][%ITEMX];
+    #ifdef COMPLEX
+    %TYPE%HV CVALEVEN[%ITEMY][%ITEMX]; // %ITEMY * %PANEL
+    %TYPE%HV CVALODD[%ITEMY][%ITEMX]; // %ITEMY * %PANEL
+    #endif
+
+    %IF(%ITEMY) #pragma unroll %ITEMY
+    for(uint i=0; i< (%ITEMY); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            #ifdef COMPLEX
+            CVAL[i][j] = (%TYPE) 0;
+            CVALEVEN[i][j] = (%TYPE%HV) 0;
+            CVALODD[i][j] = (%TYPE%HV) 0;
+            #else
+            CVAL[i][j] = (%TYPE) 0;
+            #endif
+        }
+    }
+
+    int ACOL;
+    uint actualCol;
+    uint actualRow;
+    int ACOLENDV;
+    int numIterations = (ACOLEND - ACOLSTART) / (%V) ;
+
+    if (numIterations >= 0)
+    {
+        ACOLENDV = ACOLSTART + (numIterations * (%V));
+    } else {
+        ACOLENDV = ACOLEND;
+    }
+
+
+    if (ldb % (512) == 0) // PENDING: 512 needs to be a configurable
+    {
+        //
+        // ASSUMPTION(SYMM Variants): \"ACOLSTART\" is perfectly divisble by \"%V\"
+        // ACOLSTART depends on the tile size on Y direction
+        // Since Vector-sizes are hardly 1, 2,4, 8 or 16, we can assume that
+        // this is indeed the case
+        //
+
+        //
+        // Assumption is that 32/16/8 is divisble by any value in %V
+        //
+        int num32Iterations = (ACOLENDV - ACOLSTART) / (32/(sizeof(%TYPE)/sizeof(float)));
+        if (num32Iterations <= 0)
+        {
+            ACOL = ACOLSTART;
+        } else {
+            int startIteration = bidX % num32Iterations;
+            ACOL = ACOLSTART + ( startIteration * (32/(sizeof(%TYPE)/sizeof(float))));
+        }
+    } else {
+        ACOL = ACOLSTART;
+    }
+
+    for(int itr=0; itr<numIterations; itr++)
+    {
+        {
+            //
+            // Load A values
+            //
+            %IF(%ITEMY) #pragma unroll %ITEMY
+            for(int i = 0; i < %ITEMY; i++)
+            {
+                #ifndef __SYMM_DIAGONAL__
+                #ifndef M_TAIL_PRESENT
+                    AVAL[i] = %VLOAD(0, (&A[(rowA + i)*lda + ACOL]) );
+                #else
+                    actualRow = ((rowA + i) >= MV) ? (MV-1) : (rowA + i);
+                    AVAL[i] = %VLOAD(0, (&A[actualRow*lda + ACOL]) );
+                #endif
+                #else
+                    #ifndef M_TAIL_PRESENT
+                        AVAL[i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ACOL, (rowA+i));
+                    #else
+                        actualRow = ((rowA + i) >= MV) ? (MV-1) : (rowA + i);
+                        AVAL[i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ACOL, actualRow);
+                    #endif
+                #endif
+
+                #ifdef CONJUGATE_A
+                    %TYPE%V conjTemp = AVAL[i];
+                    %CONJUGATE(1, conjTemp);
+                    AVAL[i] = conjTemp;
+                #endif
+
+                #ifdef COMPLEX
+                AVALEVEN[i] = AVAL[i].even;
+                AVALODD[i] = AVAL[i].odd;
+                #endif
+            }
+
+            //
+            // Load B values
+            //
+            %IF(%ITEMX) #pragma unroll %ITEMX
+            for(int j=0; j<(%ITEMX); j++)
+            {
+                #ifndef N_TAIL_PRESENT
+                        BVAL[j] = %VLOAD(0, (&B[ACOL + (colB + j)*ldb]));
+                #else
+                        actualCol = ((colB + j) >= N) ? (N-1) : (colB + j);
+                        BVAL[j] = %VLOAD(0, (&B[ACOL + (actualCol)*ldb]));
+                #endif
+
+                #ifdef COMPLEX
+                BVALEVEN[j] = BVAL[j].even;
+                BVALODD[j] = BVAL[j].odd;
+                #endif
+            }
+        } // LOAD A and B Over
+
+
+        // MATH Begin
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(int j=0; j<(%ITEMX); j++)
+        {
+            %IF(%ITEMY) #pragma unroll %ITEMY
+            for(int i=0; i<(%ITEMY); i++)
+            {
+                #ifndef COMPLEX
+                %VMAD_AND_REDUCE(CVAL[i][j] ,  AVAL[i], BVAL[j]);
+                #else
+                CVALEVEN[i][j] = mad(AVALEVEN[i], BVALEVEN[j], CVALEVEN[i][j]);
+                CVALEVEN[i][j] = mad(AVALODD[i], -BVALODD[j], CVALEVEN[i][j]);
+                CVALODD[i][j]  = mad(AVALEVEN[i], BVALODD[j], CVALODD[i][j]);
+                CVALODD[i][j]  = mad(AVALODD[i],  BVALEVEN[j], CVALODD[i][j]);
+                /*
+                EVENSUM = AVALEVEN[i] * BVALEVEN[j];
+                EVENSUM = mad(AVALODD[i], -BVALODD[j], EVENSUM);
+                ODDSUM  = AVALEVEN[i]*BVALODD[j];
+                ODDSUM  = mad(AVALODD[i],  BVALEVEN[j], ODDSUM);
+                CVAL[i][j].S0 += EVENSUM.S0 + EVENSUM.S1;
+                CVAL[i][j].S1 += ODDSUM.S0 + ODDSUM.S1;
+                */
+                #endif
+            }
+        }
+
+        ACOL = ((ACOL + %V) == ACOLENDV) ? ACOLSTART : (ACOL + %V); //%PANEL
+    }
+
+    #ifdef COMPLEX
+    {
+        %IF(%ITEMY) #pragma unroll %ITEMY
+        for(uint i=0; i< (%ITEMY); i++)
+        {
+            %IF(%ITEMX) #pragma unroll %ITEMX
+            for(uint j=0; j<(%ITEMX); j++)
+            {
+                CVAL[i][j].even =   %REDUCE_SUM_REAL_HV(CVALEVEN[i][j]);
+                CVAL[i][j].odd =    %REDUCE_SUM_REAL_HV(CVALODD[i][j]);
+            }
+        }
+    }
+    #endif
+
+    ACOL = ACOLENDV;
+
+    for(; ACOL < ACOLEND; ACOL ++)
+    {
+        //
+        // Load B values
+        //
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint bcol = 0; bcol < %ITEMX; bcol++)
+        {
+            //
+            // PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+            //
+            #ifndef N_TAIL_PRESENT
+            BVAL[bcol] = %VMAKEVEC(B[ACOL + (colB + bcol)*ldb]);
+            #else
+            BVAL[bcol] = %VMAKEVEC(B[ACOL + ((colB + bcol)%(N))*ldb]);
+            #endif
+        }
+
+        //
+        // Load A values
+        //
+        %IF(%ITEMY) #pragma unroll %ITEMY
+        for(uint i = 0; i < (%ITEMY); i++) // 1 * ITEMY/V
+        {
+            #ifndef __SYMM_DIAGONAL__
+            {
+                #ifndef M_TAIL_PRESENT
+                AVAL[i] = %VMAKEVEC(A[(rowA + i)*lda + ACOL]);
+                #else
+                AVAL[i] = %VMAKEVEC(A[((rowA + i) % MV)*lda + ACOL]);
+                #endif
+            }
+            #else
+            {
+                %TYPE t;
+                #ifndef M_TAIL_PRESENT
+                t = SYMM_SCALAR_LOAD(A, M, lda, ACOL, (rowA+i) );
+                #else
+                t = SYMM_SCALAR_LOAD(A, M, lda, ACOL, ((rowA + i) % MV));
+                #endif
+                AVAL[i] = %VMAKEVEC(t);
+            }
+            #endif
+            #ifdef CONJUGATE_A
+                %CONJUGATE(1, AVAL[i]);
+            #endif
+        }
+
+        {
+            %IF(%ITEMY) #pragma unroll %ITEMY
+            for(uint i=0; i<(%ITEMY); i++)
+            {
+                %IF(%ITEMX) #pragma unroll %ITEMX
+                for(uint j=0; j<(%ITEMX); j++)
+                {
+                    %MAD_AND_REDUCE(CVAL[i][j] ,  AVAL[i] , BVAL[j]);
+                }
+            }
+        }
+    }
+
+
+    //
+    // STORE Result in C
+    //
+    %TYPE%V reg , betareg, alphareg;
+    %TYPE reg_s , betareg_s, alphareg_s;
+    %TYPE%V alphav, betav;
+    alphav = %VMAKEVEC(alpha);
+    betav = %VMAKEVEC(beta);
+    //%TYPE CVALV_TEMP[%V];
+    %TYPE%V CVALV;
+
+    #ifndef HERK
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i< (%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+                //#pragma unroll %V
+                //for(uint k=0; k< (%V); k++)
+                %VFOR
+                {
+                    CVALV%VFORSUFFIX = CVAL[i*V + %VFORINDEX][j];
+                }
+                //CVALV = *(__private %TYPE%V *)CVALV_TEMP;
+
+            #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
+                reg = %VLOAD(0, (&C[(rowA + i*V) +     (colB+j)*ldc]));
+                %VMUL(betareg, betav, reg);
+                %VMUL(alphareg, alphav, CVALV);
+                %ADD( reg, betareg, alphareg);
+                %VSTORE(reg, 0, (&C[(rowA + i*V) + (colB+j)*ldc]));
+            #else
+                if (((rowA + i*V + V - 1) < M) && ((colB + j) < N))
+                {
+                    reg = %VLOAD(0, (&C[rowA + i*V +     (colB+j)*ldc]));
+                    %VMUL(betareg, betav, reg);
+                    %VMUL(alphareg, alphav, CVALV);
+                    %ADD( reg, betareg, alphareg);
+                    %VSTORE(reg, 0, (&C[(rowA + i*V) + (colB+j)*ldc]));
+                } else {
+                    if ((colB + j) < N)
+                    {
+                        //%TYPE TEMP[%V];
+                        //*(__private %TYPE%V *) TEMP = CVALV;
+                        //#pragma unroll %V
+                        //for(uint v=0; ((v< %V) && ((rowA + (i * %V) + v) < M) ); v++)
+                        %VFOR
+                        {
+                            if (((rowA + (i * %V) + %VFORINDEX) < M) )
+                            {
+                                %TYPE c;
+
+                                c = C[rowA + i*V + %VFORINDEX + (colB+j)*ldc];
+                                %MUL(betareg_s, c, beta);
+                                c = CVALV%VFORSUFFIX;
+                                %MUL(alphareg_s, c, alpha);
+                                %ADD(c, betareg_s, alphareg_s);
+                                C[rowA + i*V + %VFORINDEX + (colB+j)*ldc] = c;
+                           }
+                        }
+                    }
+                }
+            #endif
+        }
+    }
+    #else
+    %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+    for(uint i=0; i< (%ITEMY_BY_V); i++)
+    {
+        %IF(%ITEMX) #pragma unroll %ITEMX
+        for(uint j=0; j<(%ITEMX); j++)
+        {
+            int actualRow = rowA + i*V;
+            int actualCol = colB + j;
+
+            //#pragma unroll %V
+            //for(uint k=0; k< (%V); k++)
+            %VFOR
+            {
+                CVALV%VFORSUFFIX = CVAL[i*V + %VFORINDEX][j];
+            }
+            //CVALV = *(__private %TYPE%V *)CVALV_TEMP;
+
+            #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
+                %VMUL(alphareg, alphav, CVALV);
+                //%TYPE temp[%V];
+                //*(__private %TYPE%V *)(&temp) = alphareg;
+                //#pragma unroll %V
+                //for(uint r = 0; r < %V; r++)
+                %VFOR
+                {
+                    #ifdef HERK_LOWER_TRIANGLE
+                    if((%VFORINDEX + actualRow) >= (actualCol))
+                    #else
+                    if((%VFORINDEX + actualRow) <= (actualCol))
+                    #endif
+                    {
+                        %TYPE C_s =  C[%VFORINDEX + actualRow + actualCol * ldc];
+                        %TYPE beta_s;
+                        %MUL(beta_s, beta, C_s);
+                        C_s = alphareg%VFORSUFFIX + beta_s;
+                        if((%VFORINDEX + actualRow) == actualCol)
+                        {
+                            C_s.odd = 0.0f;
+                        }
+                        C[%VFORINDEX + actualRow + actualCol * ldc] = C_s;
+                    }
+                }
+            #else
+                if (((rowA + i*V + V - 1) < M) && ((colB + j) < N))
+                {
+                    %VMUL(alphareg, alphav, CVALV);
+                    //%TYPE temp[%V];
+                    //*(__private %TYPE%V *)(&temp) = alphareg;
+                    //#pragma unroll %V
+                    //for(uint r = 0; r < %V; r++)
+                    %VFOR
+                    {
+                        #ifdef HERK_LOWER_TRIANGLE
+                        if((%VFORINDEX + actualRow) >= (actualCol))
+                        #else
+                        if((%VFORINDEX + actualRow) <= (actualCol))
+                        #endif
+                        {
+                            %TYPE C_s =  C[%VFORINDEX + actualRow + actualCol * ldc];
+                            %TYPE beta_s;
+                            %MUL(beta_s, beta, C_s);
+                            C_s = alphareg%VFORSUFFIX + beta_s;
+                            if((%VFORINDEX + actualRow) == actualCol)
+                            {
+                                C_s.odd = 0.0f;
+                            }
+                            C[%VFORINDEX + actualRow + actualCol * ldc] = C_s;
+                        }
+                    }
+                }
+                else
+                {
+                    if ((colB + j) < N)
+                    {
+                        //%TYPE TEMP[%V];
+
+                        //*(__private %TYPE%V *)(&TEMP) = CVALV;
+                        //#pragma unroll %V
+                        //for(uint r=0; ((r< %V) && ((rowA + (i * %V) + r) < M) ); r++)
+                        %VFOR
+                        {
+                            if (((rowA + (i * %V) + %VFORINDEX) < M))
+                            {
+                                #ifdef HERK_LOWER_TRIANGLE
+                                if((%VFORINDEX + actualRow) >= (actualCol))
+                                #else
+                                if((%VFORINDEX + actualRow) <= (actualCol))
+                                #endif
+                                {
+                                    %TYPE c;
+                                    c = C[%VFORINDEX + actualRow + (actualCol)*ldc];
+                                    %MUL(betareg_s, c, beta);
+                                    c = CVALV%VFORSUFFIX;
+                                    %MUL(alphareg_s, c, alpha);
+                                    %ADD(c, betareg_s, alphareg_s);
+                                    if((%VFORINDEX + actualRow) == (actualCol))
+                                    {
+                                        c.odd = 0.0f;
+                                    }
+                                    C[%VFORINDEX + actualRow  + actualCol * ldc] = c;
+                                }
+                            }
+                        }
+                    }
+                }
+            #endif
+        }
+    }
+    #endif
+    return;
+}
+";
+
diff --git a/src/library/blas/gens/clTemplates/gemm_helper.cl b/src/library/blas/gens/clTemplates/gemm_helper.cl
new file mode 100644
index 0000000..0c8c3e9
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/gemm_helper.cl
@@ -0,0 +1,87 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+static const char *GEMM_HELPER = "
+void getBlockNumber(uint nBlocks, uint blockID, uint *bidY, uint *bidX, uint flag)
+{   // Decodes the linear index blockID into tile coordinates, written to *bidY and *bidX.
+    #ifndef HERK
+    {   // Rectangular numbering: nBlocks is the tile count along the fastest-varying axis.
+        if(flag) // Nonzero flag: Y varies fastest (column-major ordering, used for NT kernels)
+        {
+            *bidY = ( blockID % ( nBlocks));
+            *bidX = ( blockID / ( nBlocks));
+        }
+        else // Zero flag: X varies fastest (row-major ordering, used for TN kernels)
+        {
+            *bidX = ( blockID % ( nBlocks));
+            *bidY = ( blockID / ( nBlocks));
+        }
+    }
+    #else
+    {   // HERK: only one triangle of the nBlocks x nBlocks tile grid is numbered; strip _j holds nBlocks - _j tiles.
+        volatile uint _i = 0, _j = 0; // NOTE(review): volatile looks like a compiler workaround - confirm before removing
+        for ( _j = (blockID / nBlocks); _j < nBlocks; _j++) // blockID / nBlocks is a lower bound for the strip index
+        {
+            _i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j; // removes the _j*nBlocks - _j*(_j-1)/2 tiles of earlier strips, then offsets by _j
+            if ( _i < nBlocks && ( _i >= 0) ) // _i is unsigned, so ( _i >= 0 ) is always true; only _i < nBlocks decides
+            {
+                break;
+            }
+        }
+        #ifdef HERK_LOWER_TRIANGLE
+            *bidY = _i;
+            *bidX = _j;
+        #else
+            *bidY = _j;
+            *bidX = _i;
+        #endif
+    }   // If no strip matched, the loop has ended with _j == nBlocks and _i holds the last candidate.
+    #endif
+}
+
+//
+// mapWorkGroupToTileNumber() - Maps a workgroup number to a Tile position in output matrix
+// Groups the full tiles together and half-tiles together and maps the workgroup number
+// such that full tiles are processed wholly by consecutive workgroups and half-tiles are
+// processed by consecutive workgroups
+//
+// ASSUMPTION:
+//  Assumes column major numbering of workgroup
+//
+// Observation:
+//  This grouping performed worse than plain column-major ordering when
+//  tested with the GEMM NN kernel, so this function is not used.
+//  It is kept here only for completeness.
+//
+void mapWorkGroupToTileNumber(uint M, uint N, uint *bidY, uint *bidX)
+{
+    // Tile count along X is rounded up; only whole tiles are counted along Y.
+    const uint tilesAlongX = ((N-1) / ((get_local_size(0) / %WIDTH) * %ITEMX)) + 1;
+    const uint wholeTilesAlongY = (M / (%WIDTH * %ITEMY));
+    const uint wholeTileCount = tilesAlongX * wholeTilesAlongY;
+    const size_t groupId = get_group_id(0);
+    if (groupId >= wholeTileCount)
+    {
+        // Remaining groups walk along X at Y index wholeTilesAlongY (the half-tiles).
+        *bidY = wholeTilesAlongY;
+        *bidX = groupId - wholeTileCount;
+        return;
+    }
+    *bidY = groupId % wholeTilesAlongY;   // whole tiles are numbered down Y first
+    *bidX = groupId / wholeTilesAlongY;
+}
+";
+
diff --git a/src/library/blas/gens/clTemplates/ger.cl b/src/library/blas/gens/clTemplates/ger.cl
new file mode 100644
index 0000000..0006087
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/ger.cl
@@ -0,0 +1,293 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// Column-Major Case
+
+static const char *ger_C_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define BH %BH_DEF
+#define BW %BW_DEF
+// Rank-1 update template for column-major A (element (row, col) is A[col*lda + row]); each work-item updates one vector of consecutive rows in one column.
+__kernel void %PREFIXger_C_kernel( __global %TYPE const* restrict _X, __global %TYPE const* restrict _Y, __global %TYPE* _A,
+				uint M, uint N, uint offx, int incx, uint offy, int incy, uint offa, uint lda,
+				%TYPE alpha, int doConj )
+{
+	__global %TYPE* A;
+	__global %TYPE const* restrict X;
+	__global %TYPE const* restrict Y;
+	// Apply the element offsets of the three buffers.
+	A = _A + offa;
+	X = _X + offx;
+	Y = _Y + offy;
+
+	if ( incx < 0 ) // Negative stride: start at the last element so that idx * incx steps backwards
+	{
+		X = X + ( M - 1) * abs(incx);
+	}
+
+	if ( incy < 0 ) // Negative stride: start at the last element so that idx * incy steps backwards
+	{
+		Y = Y + ( N - 1) * abs(incy);
+	}
+
+	// Local staging buffers for the slices of X and Y used by this work-group
+	__local %TYPE localX[ BH * %V ];
+	__local %TYPE localY[ BW ];
+
+	uint lID = get_local_id( 0 );
+	uint gID = get_group_id( 0 );
+
+	uint tIDy = lID & ( BH-1 );  // y coordinate of the work-item; equals lID modulo BH only if BH is a power of two (assumed - TODO confirm)
+	uint tIDx = lID / BH;        // x coordinate of the work-item in the 1D work-group
+    uint nBlocksX = (( N + BW - 1) / BW );                  // block columns, rounded up
+    uint nBlocksY = (( M + BH * %V - 1 ) / ( BH * %V ));    // block rows, rounded up
+
+	uint gIDy = gID % nBlocksY;	// y coordinate of the work-group; groups are numbered down Y first
+    uint gIDx = gID / nBlocksY;	// x coordinate of the work-group in the 1D grid
+
+    uint row = (( BH * gIDy)+  tIDy) * %V;   // first of the consecutive rows handled by this work-item
+    uint col = (( BW * gIDx)+  tIDx);        // column handled by this work-item
+
+
+    if( (gIDx != (nBlocksX-1)) && (gIDy != (nBlocksY-1)) )       // Work-group is not in the last block row or column: no bounds checks below
+    {
+        // Stage this work-group's slices of X and Y in local memory
+        for( int i = lID; i< ( BH * %V); i+= get_local_size(0) )
+        {
+            int idx = i + ( gIDy * BH * %V);
+            localX[ i ] = *(X + (idx * incx));
+        }
+
+        for( int i = lID; i< BW; i+= get_local_size(0) )
+        {
+            int idx = i + ( gIDx * BW);
+            localY[ i ] = *(Y + (idx * incy));
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);   // all of localX / localY must be written before any work-item reads them
+
+        %TYPE%V prevA, temp;
+        %TYPE yReg = localY[  tIDx ];
+        %TYPE%V xReg = *(__local %TYPE%V*)(&localX[ tIDy * %V]);
+        // NOTE(review): CONJUGATE / VMUL / VMAD are generator macros; presumably prevA += (xReg * alpha) * yReg, with yReg conjugated when doConj is set - confirm
+        prevA = %VLOAD( 0, ( A + col*lda + row ) );
+        %CONJUGATE(doConj, yReg);
+        %VMUL( temp, xReg, alpha );
+        %VMAD( prevA, temp, yReg);
+        %VSTORE( prevA, 0 , ( A + col*lda + row ) );
+
+    }
+    else                            // Work-group lies in the last block row and/or the last block column
+    {
+    	// Stage X and Y in local memory, skipping indices beyond M / N
+        for( int i = lID; i< ( BH * %V); i+= get_local_size(0) )
+        {
+    		int idx = i + ( gIDy * BH * %V);
+           	if ( idx < M )
+    		{
+    			localX[ i ] = *(X + (idx * incx));
+        	}
+    	}
+
+        for( int i = lID; i< BW; i+= get_local_size(0) )
+        {
+    		int idx = i + ( gIDx * BW);
+    		if ( idx < N)
+    		{
+           		localY[ i ] = *(Y + (idx * incy));
+        	}
+    	}
+    	barrier(CLK_LOCAL_MEM_FENCE);   // all of localX / localY must be written before any work-item reads them
+
+    	uint gTIDx = (gIDx * BW) + tIDx;   // same value as col
+        if ( gTIDx < N)  // column index is within the matrix
+    	{
+    		if( (row + %V - 1) < M )  // if the next V rows are still within M, then do vector math
+	    	{
+    			%TYPE%V prevA, temp;
+    			%TYPE yReg = localY[  tIDx ];
+    			%TYPE%V xReg = *(__local %TYPE%V*)(&localX[ tIDy * %V]);
+
+	    		prevA = %VLOAD( 0, ( A + col*lda + row ) );
+		    	%CONJUGATE(doConj, yReg);
+			    %VMUL( temp, xReg, alpha );
+    			%VMAD( prevA, temp, yReg);
+    			%VSTORE( prevA, 0 , ( A + col*lda + row ) );
+
+    		}
+	    	else if( row < M  )  // fewer than V rows remain: update them one element at a time
+		    {
+    			%TYPE xRegS, yReg, prevA, temp;
+    			for( int i=row; i<M; i++ )
+    			{
+    				yReg  = localY[ tIDx ];   // reloaded every iteration because CONJUGATE below modifies it
+    				xRegS = localX[ (tIDy * %V) + (i-row) ];
+    				prevA = A[ col*lda + i];
+	    			%CONJUGATE(doConj, yReg);
+    				%MUL( temp, xRegS, alpha );
+    				%MAD( prevA, temp, yReg );
+	    			A[ col*lda + i ] = prevA;
+		    	}
+		    }
+	    }
+    }
+}
+\n";
+
+
+
+//Row major kernel
+
+static const char *ger_R_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define BH %BH_DEF
+#define BW %BW_DEF
+// Rank-1 update template for row-major A (element (row, col) is A[row*lda + col]); each work-item updates one vector of consecutive columns in one row.
+__kernel void %PREFIXger_R_kernel( __global %TYPE const* restrict _X, __global %TYPE const* restrict _Y, __global %TYPE* _A,
+				uint M, uint N, uint offx, int incx, uint offy, int incy, uint offa, uint lda,
+				%TYPE alpha, int doConj )
+{
+	__global %TYPE* A;
+	__global %TYPE const* restrict X;
+	__global %TYPE const* restrict Y;
+	// Apply the element offsets of the three buffers.
+	A = _A + offa;
+	X = _X + offx;
+	Y = _Y + offy;
+
+	if ( incx < 0 ) // Negative stride: start at the last element so that idx * incx steps backwards
+	{
+		X = X + ( M - 1) * abs(incx);
+	}
+
+	if ( incy < 0 ) // Negative stride: start at the last element so that idx * incy steps backwards
+	{
+		Y = Y + ( N - 1) * abs(incy);
+	}
+	// Local staging buffers for the slices of X and Y used by this work-group
+    __local %TYPE localX[ BH ];
+    __local %TYPE localY[ BW * %V ];
+
+    uint lID = get_local_id( 0 );
+    uint gID = get_group_id( 0 );
+
+    uint tIDy = lID / BW;            // y coordinate of the work-item in the 1D work-group
+    uint tIDx = lID & ( BW - 1);     // x coordinate; equals lID modulo BW only if BW is a power of two (assumed - TODO confirm)
+    uint nBlocksY = (( M + BH - 1) / BH );                  // block rows, rounded up
+    uint nBlocksX = (( N + BW * %V - 1 ) / ( BW * %V ));    // block columns, rounded up
+
+    uint gIDy = gID / nBlocksX;      // y coordinate of the work-group; groups are numbered along X first
+    uint gIDx = gID % nBlocksX;      // x coordinate of the work-group in the 1D grid
+
+    uint row = (( BH * gIDy)+  tIDy);        // row handled by this work-item
+    uint col = (( BW * gIDx)+  tIDx) * %V;   // first of the consecutive columns handled by this work-item
+
+    if( (gIDy != (nBlocksY-1)) && (gIDx != (nBlocksX-1)) )         // Work-group is not in the last block row or column: no bounds checks below
+    {
+        // Stage this work-group's slices of Y and X in local memory
+        for( int i = lID; i< ( BW * %V); i+= get_local_size(0) )
+        {
+            int idx = i + ( gIDx * BW * %V);
+            localY[ i ] = *(Y + (idx * incy));
+        }
+
+        for( int i = lID; i< BH; i+= get_local_size(0) )
+        {
+            int idx = i + ( gIDy * BH);
+            localX[ i ] = *(X + (idx * incx));
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);   // all of localX / localY must be written before any work-item reads them
+
+        %TYPE%V prevA, temp;
+        %TYPE xReg = localX[ tIDy ];
+        %TYPE%V yRegS = *(__local %TYPE%V*)(&localY[ tIDx * %V]);
+        // NOTE(review): CONJUGATE / VMUL / VMAD are generator macros; presumably prevA += (yRegS * alpha) * xReg, with yRegS conjugated when doConj is set - confirm
+        prevA = %VLOAD( 0, ( A + row*lda + col ) );
+        %CONJUGATE(doConj, yRegS);
+        %VMUL( temp, yRegS, alpha );
+        %VMAD( prevA, temp, xReg );
+        %VSTORE( prevA, 0 , ( A + row*lda + col ) );
+    }
+    else    // Work-group lies in the last block row and/or the last block column
+    {
+        // Stage Y and X in local memory, skipping indices beyond N / M
+        for( int i = lID; i< ( BW * %V); i+= get_local_size(0) )
+        {
+            int idx = i + ( gIDx * BW * %V);
+            if ( idx < N)
+            {
+                localY[ i ] = *(Y + (idx * incy));
+            }
+        }
+
+        for( int i = lID; i< BH; i+= get_local_size(0) )
+        {
+            int idx = i + ( gIDy * BH);
+            if ( idx < M)
+            {
+                localX[ i ] = *(X + (idx * incx));
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);   // all of localX / localY must be written before any work-item reads them
+
+        uint gTIDy = gIDy * BH + tIDy;   // same value as row
+        if ( gTIDy < M)                  // row index is within the matrix
+        {
+            if( (col + %V - 1) < N )     // the whole vector of columns is within N: do vector math
+            {
+                %TYPE%V prevA, temp;
+                %TYPE xReg = localX[ tIDy ];
+                %TYPE%V yRegS = *(__local %TYPE%V*)(&localY[ tIDx * %V]);
+
+                prevA = %VLOAD( 0, ( A + row*lda + col ) );
+                %CONJUGATE(doConj, yRegS);
+    			%VMUL( temp, yRegS, alpha );
+    			%VMAD( prevA, temp, xReg );
+                %VSTORE( prevA, 0 , ( A + row*lda + col ) );
+
+            }
+            else if( col < N  )          // fewer than a full vector of columns remain: update them one element at a time
+            {
+                %TYPE xReg, yRegS, prevA, temp;
+                for( int i=col; i<N; i++ )
+                {
+                    yRegS = localY[ (tIDx * %V) + (i-col) ];
+                    xReg  = localX[ tIDy ];
+                    prevA = A[ row*lda + i];
+                    %CONJUGATE(doConj, yRegS);
+    				%MUL( temp, yRegS, alpha );
+    				%MAD( prevA, temp, xReg );
+    				A[ row*lda + i ] = prevA;
+                }
+            }
+        }
+    }
+}
+\n";
diff --git a/src/library/blas/gens/clTemplates/her.cl b/src/library/blas/gens/clTemplates/her.cl
new file mode 100644
index 0000000..87e9747
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/her.cl
@@ -0,0 +1,533 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/***********************************************/
+//NOTE: THIS FILE IS NOT USED. SEE SYR_HER.CLT
+//      THIS FILE IS FOR LEGACY PURPOSES.
+
+//Column-Major Lower
+
+static const char *her_CL_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC 	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+
+// nBlocks 	= (N - 1)/ TR + 1, with TR = TARGET_ROWS (tiles per side of A)
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2 (tiles of one triangle, diagonal included)
+// Hermitian rank-1 update template for the lower triangle of column-major A; each work-group updates one TARGET_ROWS x TARGET_ROWS tile.
+__kernel void %PREFIXher_CL_kernel( __global %TYPE* _A, __global const %TYPE* _X, int N,
+										int offx, int incx, int offa, int lda, %PTYPE alpha )
+{
+	__global const %TYPE* X;
+	__global %TYPE *A;
+	__local %TYPE xShared[%TARGET_ROWS];	// entries of X matching the rows of this tile
+	__local %TYPE yShared[%TARGET_ROWS];	// entries of X matching the columns of this tile
+
+	A = _A + offa;
+	if ( incx < 0 ) // Negative stride: start at the last element so that i * incx steps backwards
+	{
+		X = _X + offx - ( N - 1) * incx;
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+	int nBlocks  = ((N - 1) / %TARGET_ROWS) + 1;	// tiles per side, rounded up
+
+	__local int iShared;
+	__local int jShared;
+
+	// Work-item 0 decodes blockID into tile coordinates (i, j) and publishes them through local memory
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		// The search starts at blockID / nBlocks, a lower bound for _j, rather than at 0
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;	// removes the tiles of the earlier block columns, then offsets by _j
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);	// iShared / jShared must be written before the other work-items read them
+
+	int i = iShared;	// block row of this tile
+	int j = jShared;	// block column of this tile
+
+
+	int ref_x = i * %TARGET_ROWS;	// first row of the tile
+	int ref_y = j * %TARGET_ROWS;	// first column of the tile
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+ 		int ncols = ((ref_y + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_y);
+        int nrows = ((ref_x + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_x);
+        int nElements = ((nrows) * ((ncols) + 1)) >> 1;   // entries of the lower triangle of this tile (nrows == ncols here since ref_x == ref_y)
+        nrows -= 1;   // not read again in this kernel
+        ncols -= 1;   // not read again in this kernel
+        for(i = threadID; i < nElements; i += get_local_size(0))   // i is reused as the linear index into the triangle
+        {
+            int r = -1, c = -1;
+            for(int k = 1; (k <= %TARGET_ROWS); k ++)
+            {
+                int temp = ((k - 1) * k) >> 1;   // linear index of the first entry of triangle row k - 1
+                r = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : r;   // row k - 1 holds indices temp .. temp + k - 1
+            }
+            c = i - (((r + 1) * r) >> 1);   // position inside that row, so c <= r
+
+            r = ref_x + r;   // convert tile-local to global row
+            c = ref_y + c;   // convert tile-local to global column
+
+            %TYPE res1, res2, res;
+            res1 = alpha * X[r * incx];
+            res2 = X[c * incx];
+            #ifdef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, res1);
+			#else
+				%CONJUGATE(1, res2);
+			#endif
+            res = A[r + c * lda];
+            %MAD( res, res1, res2);   // NOTE(review): generator macro, presumably res += res1 * res2 - confirm
+/* HER defn: On input, the imaginary parts of the diagonal elements of the
+	complex Hermitian matrix A are assumed to be zero, so you do not have to set
+	these values. On output, if alpha not equal to 0.0, they are set to zero. */
+
+			res.odd = ((r == c) && (alpha != 0.0)) ? 0.0 : res.odd;
+
+            A[r + c * lda] = res;
+        }
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		for( int i = (ref_x + threadID); i < N; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+			#endif
+			xShared[i - ref_x] = loadedX;
+		}
+
+		// Populating yShared: Always fits well..
+		for( int i = (ref_y  + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifndef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+			#endif
+			yShared[(i - ref_y) ] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);	// xShared / yShared must be complete before they are read below
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;	// rounded up
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;	// column of this work-item within one pass
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;	// row offset in the tile; the mask assumes a power-of-two TARGET_ROWS_BY_VEC (TODO confirm)
+
+
+		int startRow = ref_x + rowShift;
+		%TYPE%V  loadedX;
+
+		if ( startRow  < (N - (%V - 1)) )	// the whole vector of rows lies inside the matrix
+		{
+			loadedX=  *((__local %TYPE%V*)( xShared + rowShift));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH;	// each pass moves TARGET_WIDTH columns to the right
+
+			if ( ( startRow  < N ) && ( startCol  < (ref_y + %TARGET_ROWS ) ) )// threads that fall into target region
+			{
+				if(( startRow + %V) > N )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValue = yShared[ startCol - ref_y];
+
+					for(int row = startRow; row < N; row++)
+					{
+						%TYPE xValue = xShared[ row - ref_x];
+						%TYPE res1, res2;
+						res1 = alpha * xValue;
+						%MUL( res2, res1,  yValue);
+						A[ row + startCol * lda] += res2;
+					}
+				}
+				else	// full vector of rows: loadedX was set above under the same condition
+				{
+					loadedA  	= %VLOAD( 0, (&A[ startRow + startCol * lda]));
+
+					%TYPE 	 loadedY= yShared[ startCol - ref_y];
+					%TYPE 	 res;
+					res =  loadedY * alpha;
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+					%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared
+		for( int i = (ref_x + threadID); (i - ref_x) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+			#endif
+			xShared[i - ref_x] = loadedX;
+		}
+
+		// Populating yShared
+		for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifndef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+			#endif
+			yShared[i - ref_y] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);	// xShared / yShared must be complete before they are read below
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;	// rounded up
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;	// column of this work-item within one pass
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;	// row offset in the tile; the mask assumes a power-of-two TARGET_ROWS_BY_VEC (TODO confirm)
+
+		int startRow = ref_x + rowShift;
+		int startCol = ref_y + colShift;
+		%TYPE%V  loadedX;
+
+		if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX	 =  *((__local %TYPE%V*)( xShared + rowShift));
+		}
+
+		//#pragma unroll
+		for( int i= 1; i <= nLoops; i++)
+		{
+			startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH;	// each pass moves TARGET_WIDTH columns to the right
+
+			if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  	= %VLOAD( 0, (&A[ startRow + startCol * lda]));
+				%TYPE 	 loadedY= yShared[ startCol - ref_y];
+				%TYPE 	 res;
+				res =  loadedY * alpha;
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);	// NOTE(review): generator macro, presumably loadedA += loadedX * resVec - confirm
+				%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+			}
+		}
+	}
+}
+\n";
+
+
+// Column-Major Upper
+
+static const char *her_CU_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC   	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH     		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT        	(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+
+// nBlocks 	= (N - 1)/ TR + 1, with TR = TARGET_ROWS (tiles per side of A)
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2 (tiles of one triangle, diagonal included)
+// Hermitian rank-1 update template for the upper triangle of column-major A; tiles are anchored at element (N-1, N-1) and indexed backwards.
+__kernel void %PREFIXher_CU_kernel( __global %TYPE* _A, __global const %TYPE* _X, int N,
+										int offx, int incx, int offa, int lda, %PTYPE alpha )
+{
+	__global const %TYPE* X;
+	__global %TYPE *A;
+
+	__local %TYPE xShared[%TARGET_ROWS];	// entries of X matching the rows of this tile
+	__local %TYPE yShared[%TARGET_ROWS];	// entries of X matching the columns of this tile
+
+	A = _A + offa;
+	if ( incx < 0 ) // Negative stride: start at the last element so that i * incx steps backwards
+	{
+		X = _X + offx - ( N - 1) * incx;
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+	int nBlocks  = ((N - 1) / %TARGET_ROWS) + 1;	// tiles per side, rounded up
+
+	__local int iShared;
+	__local int jShared;
+
+	// Work-item 0 decodes blockID into tile coordinates (i, j) and publishes them through local memory
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		// The search starts at blockID / nBlocks, a lower bound for _j, rather than at 0
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;	// removes the tiles of the earlier strips, then offsets by _j
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);	// iShared / jShared must be written before the other work-items read them
+
+	int i = iShared;
+	int j = jShared;
+
+	int ref_x = (N- 1) - i * %TARGET_ROWS;	// last (highest) row of the tile
+	int ref_y = (N- 1) - j * %TARGET_ROWS;	// last (highest) column of the tile
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+		int ncols = ((ref_y - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_y+1);
+		int nrows = ((ref_x - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_x+1);
+		int nElements = ((nrows) * ((ncols) + 1)) >> 1;	// entries of the upper triangle of this tile (nrows == ncols here since ref_x == ref_y)
+		nrows -= 1;	// now the distance from the first to the last row of the tile
+		ncols -= 1;	// now the distance from the first to the last column of the tile
+		for(i = threadID; i < nElements; i += get_local_size(0))	// i is reused as the linear index into the triangle
+		{
+			int r, c = -1;
+			for(int k = 1; (k <= %TARGET_ROWS); k ++)
+			{
+				int temp = ((k - 1) * k) >> 1;	// linear index of the first entry of triangle column k - 1
+				c = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : c;	// column k - 1 holds indices temp .. temp + k - 1
+			}
+			r = i - (((c + 1) * c) >> 1);	// position inside that column, so r <= c
+
+			r = ref_x - (nrows) + r;	// convert tile-local to global row
+			c = ref_y - (ncols) + c;	// convert tile-local to global column
+
+			%TYPE res1, res2, res;
+            res1 = alpha * X[r * incx];
+            res2 = X[c * incx];
+            #ifdef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, res1);
+			#else
+				%CONJUGATE(1, res2);
+			#endif
+            res = A[r + c * lda];
+            %MAD( res, res1, res2);   // NOTE(review): generator macro, presumably res += res1 * res2 - confirm
+/* HER defn: On input, the imaginary parts of the diagonal elements of the
+	complex Hermitian matrix A are assumed to be zero, so you do not have to set
+	these values. On output, if alpha not equal to 0.0, they are set to zero. */
+
+            res.odd = ((r == c) && (alpha != 0.0)) ? 0.0 : res.odd;
+
+            A[r + c * lda] = res;
+		}
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		for( int i = (ref_x - threadID); i >= 0; i -= get_local_size(0))
+		{
+			// FIXME: Assumes BLOCKSIZE >= TARGET_ROWS
+			// FIXME: Works correctly only for 1 ITERATION
+			//xShared[(%TARGET_ROWS - 1) - threadID] = X[ i * incx];
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+			#endif
+			xShared[(%TARGET_ROWS - 1) -(ref_x - i)] = loadedX;	// row ref_x is stored in the last slot, lower rows in earlier slots
+		}
+
+		// Populating yShared: Always fits well..
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifndef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+			#endif
+			yShared[(ref_y - i)] = loadedX;	// column ref_y is stored in slot 0, lower columns in later slots
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);	// xShared / yShared must be complete before they are read below
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;	// rounded up
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;	// column of this work-item within one pass
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);	// distance from ref_x up to the lowest row of this vector
+
+		int startRow = ref_x - rowShift;	// lowest row index of the vector; negative when it sticks out above row 0
+		%TYPE%V  loadedX;
+
+		if ( startRow  >= 0 )	// the whole vector of rows lies inside the matrix
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1) - rowShift]));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH;	// each pass moves TARGET_WIDTH columns to the left
+
+			// threads that fall into target region
+			if( ( startRow  > -(%V) ) && (startCol > (ref_y - %TARGET_ROWS)) )
+			{
+				if( startRow  < 0 )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValue = yShared[ ref_y - startCol];
+
+					for(int row = startRow + (%V - 1); row >= 0; row--)	// walk from the highest row of the vector down to row 0
+					{
+						%TYPE xValue = xShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+						%TYPE res1, res2;
+						res1 = alpha * xValue;
+						%MUL( res2, res1,  yValue);
+						A[ row + startCol * lda] += res2;
+					}
+				}
+				else	// full vector of rows: loadedX was set above under the same condition
+				{
+					loadedA  = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+
+					%TYPE 	 loadedY= yShared[ ref_y - startCol];
+					%TYPE 	 res;
+					res =  loadedY * alpha;
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+					%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+		%TYPE%V loadedA;
+
+		// Populating xShared
+		for( int i = (ref_x - threadID); ((ref_x - i) < %TARGET_ROWS); i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+			#endif
+			xShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = loadedX;	// row ref_x is stored in the last slot, lower rows in earlier slots
+		}
+
+		// Populating yShared
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifndef HERMITIAN_ROWMAJOR
+				%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+			#endif
+			yShared[(ref_y - i)] = loadedX;	// column ref_y is stored in slot 0, lower columns in later slots
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);	// xShared / yShared must be complete before they are read below
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;	// rounded up
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;	// column of this work-item within one pass
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);	// distance from ref_x up to the lowest row of this vector
+
+
+		int startRow = ref_x - rowShift;
+		int startCol = ref_y - colShift;
+		%TYPE%V  loadedX;
+		// Not all threads should do this..
+		// Depends on whether blocksize width is > target_rows
+		if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1)- rowShift]));
+		}
+
+		for( int i = 1; i <= nLoops; i++)
+		{
+			startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH;	// each pass moves TARGET_WIDTH columns to the left
+
+			if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+				%TYPE  loadedY = yShared[ ref_y - startCol];
+				%TYPE  res;
+				res = loadedY * alpha;
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);	// NOTE(review): generator macro, presumably loadedA += loadedX * resVec - confirm
+				%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+			}
+		}
+
+	}
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/her2.cl b/src/library/blas/gens/clTemplates/her2.cl
new file mode 100644
index 0000000..ae8f92d
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/her2.cl
@@ -0,0 +1,662 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/************************************************/
+//NOTE: THIS FILE IS NOT USED. SEE SYR2_HER2.CLT
+//      THIS FILE IS FOR LEGACY PURPOSES.
+
+// Column-major, lower-triangle HER2 kernel template (legacy; see the NOTE above).
+static const char *her2_CL_kernel = "
+
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+	#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC 	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+// Column-Major Lower: A += alpha * X * Y(H) + conj(alpha) * Y * X(H), one work-group per TR x TR block
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+__kernel void %PREFIXher2_CL_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+{
+
+	__global const %TYPE* X;
+	__global const %TYPE* Y;
+	__global %TYPE* A;
+
+	__local %TYPE xShared[%TARGET_ROWS];
+	__local %TYPE yShared[%TARGET_ROWS];
+	__local %TYPE xSharedConj[%TARGET_ROWS];
+	__local %TYPE ySharedConj[%TARGET_ROWS];
+	// alpha == 0: return before anything is written to A
+	if( (alpha.even == 0.0) && (alpha.odd == 0.0) )
+		return;
+
+	int nBlocks = ((N - 1) / %TARGET_ROWS) + 1;
+
+	A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		X	 = _X + offx - ( N - 1) * incx;
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	if ( incy < 0 ) // Goto end of vector
+	{
+		Y	 = _Y + offy - ( N - 1) * incy;
+	}
+	else
+	{
+		Y = _Y + offy;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+
+	__local int iShared;
+	__local int jShared;
+
+	// Work-item 0 decodes the linear blockID into block coordinates (i, j) and shares them through local memory
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+	// (ref_x, ref_y): first row and first column of this block
+	int ref_x = i * %TARGET_ROWS;
+	int ref_y = j * %TARGET_ROWS;
+	%TYPE conjAlpha	 = alpha;
+	%CONJUGATE( 1, conjAlpha );
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+		int ncols = ((ref_y + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_y);
+        int nrows = ((ref_x + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_x);
+        int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+        nrows -= 1;
+        ncols -= 1;
+        for(i = threadID; i < nElements; i += get_local_size(0))
+        {
+            int r = -1, c = -1;	// (r, c): position of packed-triangle element i inside the block
+            for(int k = 1; (k <= %TARGET_ROWS); k ++)
+            {
+                int temp = ((k - 1) * k) >> 1;
+                r = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : r;
+            }
+            c = i - (((r + 1) * r) >> 1);
+
+            r = ref_x + r;
+            c = ref_y + c;
+
+            %TYPE res1, res2, res3, res4, res5;
+            res1 = X[r * incx];
+            res2 = X[c * incx];
+            #ifdef HER2_ROWMAJOR
+				%CONJUGATE(1, res1);
+			#endif
+            %MUL( res5, alpha, res1 );
+            res1 = Y[c * incy];	// Y elements are incy apart
+            res3 = Y[r * incy];
+            #ifndef HER2_ROWMAJOR
+				%CONJUGATE(1, res1);
+            #endif
+            %MUL( res4, res5, res1 );
+            #ifdef HER2_ROWMAJOR
+				%CONJUGATE(1, res3);
+			#else
+				%CONJUGATE(1, res2);
+			#endif
+            %MUL( res5, conjAlpha, res3 );
+            %MAD( res4, res5, res2 );
+            res1 = A[r + c * lda];
+            %ADD( res2, res1, res4 );
+ /* HER2 defn: On output, if alpha not equal to 0.0, then imaginary part of A is set to zero. */
+
+			res2.odd = (r == c) ? 0.0 : res2.odd;
+
+
+			A[r + c * lda] = res2;
+        }
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared/yShared (rows of the block): May not fit into target region
+		for( int i = (ref_x + threadID); i < N; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ROWMAJOR
+				%CONJUGATE(1, loadedX);		//taking conjugate while loading
+				%CONJUGATE(1, loadedY);
+			#endif
+			xShared[i - ref_x] = loadedX;
+			yShared[i - ref_x] = loadedY;
+		}
+
+		// Populating xSharedConj/ySharedConj (columns of the block): Always fits well..
+		for( int i = (ref_y  + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifndef HER2_ROWMAJOR
+				%CONJUGATE(1, loadedX);		//taking conjugate while loading
+				%CONJUGATE(1, loadedY);
+			#endif
+			xSharedConj[(i - ref_y) ] = loadedX;
+			ySharedConj[(i - ref_y) ] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+
+		int startRow = ref_x + rowShift;
+		%TYPE%V  loadedX, loadedY;
+		// Vector loads from local memory only when the whole vector lies below row N
+		if ( startRow  < (N - (%V - 1)) )
+		{
+			loadedX=  *((__local %TYPE%V*)( xShared + rowShift));
+			loadedY=  *((__local %TYPE%V*)( yShared + rowShift));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH;
+
+			if ( ( startRow  < N ) && ( startCol  < (ref_y + %TARGET_ROWS ) ) )// threads that fall into target region
+			{
+				if(( startRow + %V) > N )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValueConj = ySharedConj[ startCol - ref_y];
+					%TYPE xValueConj = xSharedConj[ startCol - ref_y];
+
+					for(int row = startRow; row < N; row++)
+					{
+						%TYPE xValue = xShared[ row - ref_x];
+						%TYPE yValue = yShared[ row - ref_x];
+
+						%TYPE res1, res2;
+						// X * Y(H)
+						%MUL(res1, alpha, yValueConj);
+						%MUL( res2, res1,  xValue);
+
+						// Y * X(H)
+						%MUL(res1, conjAlpha, xValueConj);
+						%MAD( res2, res1,  yValue);
+						A[ row + startCol * lda] += res2;
+					}
+				}
+				else
+				{
+					loadedA  	= %VLOAD( 0, (&A[ startRow + startCol * lda]));
+
+					%TYPE 	 loadedYConj = ySharedConj[ startCol - ref_y];
+					%TYPE 	 loadedXConj = xSharedConj[ startCol - ref_y];
+					%TYPE 	 res;
+
+					// X * Y(H)
+					%MUL(res, loadedYConj, alpha);
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+
+					// Y * X(H)
+					%MUL(res, loadedXConj, conjAlpha);
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedY, resVec);
+
+					%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared/yShared (rows of the block)
+		for( int i = (ref_x + threadID); (i - ref_x) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ROWMAJOR
+				%CONJUGATE(1, loadedX);		//taking conjugate while loading
+				%CONJUGATE(1, loadedY);
+			#endif
+			xShared[i - ref_x] = loadedX;
+			yShared[i - ref_x] = loadedY;
+		}
+
+		// Populating xSharedConj/ySharedConj (columns of the block)
+		for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifndef HER2_ROWMAJOR
+				%CONJUGATE(1, loadedX);		//taking conjugate while loading
+				%CONJUGATE(1, loadedY);
+			#endif
+			xSharedConj[i - ref_y] = loadedX;
+			ySharedConj[i - ref_y] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+		int startRow = ref_x + rowShift;
+		int startCol = ref_y + colShift;
+		%TYPE%V  loadedX, loadedY;
+
+		if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX	 =  *((__local %TYPE%V*)( xShared + rowShift));
+			loadedY	 =  *((__local %TYPE%V*)( yShared + rowShift));
+		}
+
+		//#pragma unroll
+		for( int i= 1; i <= nLoops; i++)
+		{
+			startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH;
+
+			if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  	 = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+				%TYPE 	 loadedYConj = ySharedConj[ startCol - ref_y];
+				%TYPE 	 loadedXConj = xSharedConj[ startCol - ref_y];
+
+				// X * Y(H)
+				%TYPE 	 res;
+				%MUL(res, loadedYConj, alpha);
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+
+				// Y * X(H)
+				%MUL(res, loadedXConj, conjAlpha);
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedY, resVec);
+				%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+			}
+		}
+	}
+
+}
+\n";
+
+// Column-major, upper-triangle HER2 kernel template (legacy; see the NOTE at the top of this file).
+static const char *her2_CU_kernel = "
+
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC  (%TARGET_ROWS / %V)
+#define TARGET_WIDTH        (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT       (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+// Column-Major Upper: A += alpha * X * Y(H) + conj(alpha) * Y * X(H), one work-group per TR x TR block
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+__kernel void %PREFIXher2_CU_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+{
+
+    __global const %TYPE* X;
+    __global const %TYPE* Y;
+    __global %TYPE* A;
+
+    __local %TYPE xShared[%TARGET_ROWS];
+    __local %TYPE yShared[%TARGET_ROWS];
+    __local %TYPE xSharedConj[%TARGET_ROWS];
+    __local %TYPE ySharedConj[%TARGET_ROWS];
+	// alpha == 0: return before anything is written to A
+	if( (alpha.even == 0.0) && (alpha.odd == 0.0) )
+		return;
+
+    int nBlocks = ((N - 1) / %TARGET_ROWS) + 1;
+
+    A = _A + offa;
+
+    if ( incx < 0 ) // Goto end of vector
+    {
+        X    = _X + offx - ( N - 1) * incx;
+    }
+    else
+    {
+        X = _X + offx;
+    }
+
+    if ( incy < 0 ) // Goto end of vector
+    {
+        Y    = _Y + offy - ( N - 1) * incy;
+    }
+    else
+    {
+        Y = _Y + offy;
+    }
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+
+	__local int iShared;
+	__local int jShared;
+
+	// Work-item 0 decodes the linear blockID into block coordinates (i, j) and shares them through local memory
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		// The search for _j starts at blockID / nBlocks
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+	%TYPE conjAlpha	 = alpha;
+	%CONJUGATE( 1, conjAlpha );
+	// (ref_x, ref_y): last row and last column of this block; blocks are counted from the bottom-right corner
+	int ref_x = (N- 1) - i * %TARGET_ROWS;
+	int ref_y = (N- 1) - j * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+ 		int ncols = ((ref_y - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_y+1);
+        int nrows = ((ref_x - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_x+1);
+        int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+        nrows -= 1;
+        ncols -= 1;
+        for(i = threadID; i < nElements; i += get_local_size(0))
+        {
+            int r, c = -1;	// (r, c): position of packed-triangle element i inside the block
+            for(int k = 1; (k <= %TARGET_ROWS); k ++)
+            {
+                int temp = ((k - 1) * k) >> 1;
+                c = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : c;
+            }
+            r = i - (((c + 1) * c) >> 1);
+
+            r = ref_x - (nrows) + r;
+            c = ref_y - (ncols) + c;
+
+            %TYPE res1, res2, res3, res4, res5;
+            res1 = X[r * incx];
+            res2 = X[c * incx];
+            #ifdef HER2_ROWMAJOR
+				%CONJUGATE(1, res1);
+			#endif
+            %MUL( res5, alpha, res1 );
+            res1 = Y[c * incy];	// Y elements are incy apart
+            res3 = Y[r * incy];
+            #ifndef HER2_ROWMAJOR
+				%CONJUGATE(1, res1);
+            #endif
+            %MUL( res4, res5, res1 );
+            #ifdef HER2_ROWMAJOR
+				%CONJUGATE(1, res3);
+			#else
+				%CONJUGATE(1, res2);
+			#endif
+            %MUL( res5, conjAlpha, res3 );
+            %MAD( res4, res5, res2 );
+            res1 = A[r + c * lda];
+            %ADD( res2, res1, res4 );
+ /* HER2 defn: On output, if alpha not equal to 0.0, then imaginary part of A is set to zero. */
+
+			res2.odd = (r == c) ? 0.0 : res2.odd;
+
+
+			A[r + c * lda] = res2;
+        }
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared/yShared (rows of the block): May not fit into target region
+		for( int i = (ref_x - threadID); i >= 0; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ROWMAJOR
+				%CONJUGATE(1, loadedX);		//taking conjugate while loading
+				%CONJUGATE(1, loadedY);
+			#endif
+			xShared[(%TARGET_ROWS - 1) - (ref_x - i)] = loadedX;	// slot follows row i, as the readers below expect
+			yShared[(%TARGET_ROWS - 1) - (ref_x - i)] = loadedY;
+		}
+
+		// Populating xSharedConj/ySharedConj (columns of the block): Always fits well..
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifndef HER2_ROWMAJOR
+				%CONJUGATE(1, loadedX);		//taking conjugate while loading
+				%CONJUGATE(1, loadedY);
+			#endif
+			xSharedConj[(ref_y - i)] = loadedX;
+			ySharedConj[(ref_y - i)] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+		int startRow = ref_x - rowShift;
+		%TYPE%V  loadedX, loadedY;
+		// Vector loads from local memory only when the whole vector starts at a valid row
+		if ( startRow  >= 0 )
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1) - rowShift]));
+			loadedY=  *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1) - rowShift]));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH;
+
+			// threads that fall into target region
+			if( ( startRow  > -(%V) ) && (startCol > (ref_y - %TARGET_ROWS)) )
+			{
+				if( startRow  < 0 )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValueConj = ySharedConj[ ref_y - startCol];
+					%TYPE xValueConj = xSharedConj[ ref_y - startCol];
+
+					for(int row = startRow + (%V - 1); row >= 0; row--)
+					{
+						%TYPE xValue = xShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+						%TYPE yValue = yShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+
+						%TYPE res1, res2;
+
+						// X * Y(H)
+						%MUL(res1, alpha, yValueConj);
+						%MUL( res2, res1,  xValue);
+
+						// Y * X(H)
+						%MUL(res1, conjAlpha, xValueConj);
+						%MAD( res2, res1,  yValue);
+						A[ row + startCol * lda] += res2;
+					}
+				}
+				else
+				{
+					loadedA  = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+
+					%TYPE 	 loadedXConj = xSharedConj[ ref_y - startCol];
+					%TYPE 	 loadedYConj = ySharedConj[ ref_y - startCol];
+					%TYPE 	 res;
+					// X * Y(H)
+					%MUL(res, loadedYConj, alpha);
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+
+					// Y * X(H)
+					%MUL(res, loadedXConj, conjAlpha);
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedY, resVec);
+					%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+		%TYPE%V loadedA;
+
+		// Populating xShared/yShared (rows of the block)
+		for( int i = (ref_x - threadID); ((ref_x - i) < %TARGET_ROWS); i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ROWMAJOR
+				%CONJUGATE(1, loadedX);		//taking conjugate while loading
+				%CONJUGATE(1, loadedY);
+			#endif
+			xShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = loadedX;
+			yShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = loadedY;
+		}
+
+		// Populating xSharedConj/ySharedConj (columns of the block)
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifndef HER2_ROWMAJOR
+				%CONJUGATE(1, loadedX);		//taking conjugate while loading
+				%CONJUGATE(1, loadedY);
+			#endif
+			xSharedConj[(ref_y - i)] = loadedX;
+			ySharedConj[(ref_y - i)] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+
+		int startRow = ref_x - rowShift;
+		int startCol = ref_y - colShift;
+		%TYPE%V  loadedX, loadedY;
+		// Not all threads should do this..
+		// Depends on whether blocksize width is > target_rows
+		if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1)- rowShift]));
+			loadedY=  *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1)- rowShift]));
+		}
+
+		for( int i= 1; i <= nLoops; i++)
+		{
+			startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH;
+
+			if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  		 	 = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+				%TYPE 	 loadedYConj = ySharedConj[ ref_y - startCol];
+	 			%TYPE 	 loadedXConj = xSharedConj[ ref_y - startCol];
+				%TYPE 	 res;
+				// X * Y(H)
+				%MUL(res, loadedYConj, alpha);
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+
+				// Y * X(H)
+				%MUL(res, loadedXConj, conjAlpha);
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedY, resVec);
+				%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+			}
+		}
+
+	}
+}
+\n";
diff --git a/src/library/blas/gens/clTemplates/iamax.cl b/src/library/blas/gens/clTemplates/iamax.cl
new file mode 100644
index 0000000..9152ccb
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/iamax.cl
@@ -0,0 +1,108 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+static const char *iamax_kernel = "
+#pragma OPENCL EXTENSION cl_amd_printf:enable
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+    #define MIN 0x1.0p-1022         // Min in case of d/z (values from khronos site)
+#else
+    #define MIN 0x1.0p-126f         // Min in case of s/c
+#endif
+/******************************************************
+ *  Implementations available for REDUCTION_BY_MAX
+     0 - ATOMIC_FLI
+     1 - REG_FLI,
+     2 - ATOMIC_FHI,
+     3 - REG_FHI
+
+    Implementation available for REDUCE_MAX
+    0 - FHI
+    1 - FLI
+ ***************************************************/
+// Per work-group: stores the largest absolute value in _scratchBuf[group] and its 1-based index in the uint array starting at &_scratchBuf[numGrps]
+__kernel void i%PREFIXamax_kernel( __global %TYPE *_X, __global %PTYPE *_scratchBuf,
+                                        uint N, uint offx, int incx)
+{
+	__global %TYPE *X = _X + offx;
+    __global %PTYPE *scratchBufVal = _scratchBuf;
+    int numGrps = get_num_groups(0);
+    __global uint *scratchBufIndex = (__global uint*)(&_scratchBuf[numGrps]);
+
+    #ifdef RETURN_ON_INVALID
+        // In case of incx<1, index will be zero: only slot 0 is written and the kernel returns
+        if( get_global_id(0) == 0 ) {
+            scratchBufVal[0] = (%PTYPE)0.0;
+            scratchBufIndex[0] = 0;
+        }
+        return;
+    #endif
+
+    %PTYPE maxVal = MIN, val = MIN;
+    uint index = 0, maxIndex = 0;
+    %TYPE%V vReg1;
+    %PTYPE%V pReg1;
+
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + (gOffset * incx)) );
+        #endif
+
+        pReg1 = %VABS(vReg1);
+
+        %REDUCE_MAX(pReg1,val,index,1); // Find max within a vector
+
+        if(val > maxVal)
+        {
+            maxVal = val;
+            maxIndex = (gOffset + index);
+        }
+    }
+
+    // Scalar loop over the tail of the vector that does not fill a whole vector load
+    // Continues from the gOffset left behind by the vector loop above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1;
+        sReg1 = X[gOffset * incx];
+        if(%VABS(sReg1) > maxVal)
+        {
+            maxVal = %VABS(sReg1);
+            maxIndex = gOffset;
+        }
+    }
+
+    // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier
+#ifdef REDUCE_MAX_WITH_INDEX_ATOMICS
+    %REDUCTION_BY_MAX(maxVal,maxIndex,0);
+#else
+    %REDUCTION_BY_MAX(maxVal,maxIndex,1);
+#endif
+
+    if(get_local_id(0) == 0)
+    {
+        scratchBufVal[get_group_id(0)] = maxVal;
+        scratchBufIndex[get_group_id(0)] = maxIndex + 1; // because 0 is reserved for error
+    }
+}";
diff --git a/src/library/blas/gens/clTemplates/nrm2.cl b/src/library/blas/gens/clTemplates/nrm2.cl
new file mode 100644
index 0000000..e866233
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/nrm2.cl
@@ -0,0 +1,217 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+static const char *nrm2_hypot_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+    #define MIN 0x1.0p-1022         // Min in case of d/z (values from khronos site)
+#else
+    #define MIN 0x1.0p-126f         // Min in case of s/c
+#endif
+// Per work-group: folds its share of X into a running hypot() value and stores one partial result in scratchBuff[group]
+__kernel void %PREFIXnrm2_hypot_kernel( __global %TYPE *_X, __global %PTYPE *scratchBuff,
+                                        uint N, uint offx, int incx )
+{
+	__global %TYPE *X = _X + offx;
+
+    #ifdef RETURN_ON_INVALID
+        // In case of incx<1, NRM2 will be zero: only scratchBuff[0] is written and the kernel returns
+        if( get_global_id(0) == 0 ) {
+            scratchBuff[0] = (%PTYPE)0.0;
+        }
+        return;
+    #endif
+
+    int gOffset;
+    %TYPE%V res = (%TYPE%V) 0.0;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+
+        res = hypot( res, vReg1 );
+    }
+    %TYPE nrm2 = %REDUCE_HYPOT( res );
+
+    // Scalar loop over the tail of the vector that does not fill a whole vector load
+    // Continues from the gOffset left behind by the vector loop above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1;
+        sReg1 = X[gOffset * incx];
+        nrm2 = hypot( nrm2, sReg1 );
+    }
+
+    // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier
+    // nrm2 of work-item 0 is the value that gets stored for the work-group below
+    %REDUCTION_BY_HYPOT( nrm2 );
+
+    %PTYPE nrm2_ptype;
+    #ifdef COMPLEX
+        nrm2_ptype = hypot( nrm2.even, nrm2.odd );
+    #else
+        nrm2_ptype = nrm2;
+    #endif
+
+
+    if( (get_local_id(0)) == 0 ) {
+        scratchBuff[ get_group_id(0) ] = nrm2_ptype;
+    }
+}
+\n";
+
+static const char *nrm2_ssq_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+    #define MAX 0x1.fffffffffffffp1023      // Max in case of d/z (values from khronos site)
+#else
+    #define MAX 0x1.fffffep127f             // Max in case of s/c
+#endif
+
+#define PZERO (%PTYPE)0.0
+#define ZERO (%TYPE)0.0
+#define VZERO (%TYPE%V)0.0
+
+//
+// The same scratch buffer is used for both scale and ssq,
+// so it must hold 2*get_num_groups(0) elements.
+// scale will be stored in scratch-buffer from [0] to [get_num_groups(0) - 1]
+// ssq will be stored from [get_num_groups(0)] to [2*get_num_groups(0) - 1]
+//
+
+__kernel void %PREFIXnrm2_ssq_kernel( __global %TYPE *_X, __global %PTYPE *scratchBuff,
+                                        uint N, uint offx, int incx )
+{
+	__global %TYPE *X = _X + offx;
+    uint numWGs = get_num_groups(0);
+
+    #ifdef RETURN_ON_INVALID
+        // In case of incx<1, NRM2 will be zero: scale and ssq of slot 0 are zeroed and the kernel returns
+        if( get_global_id(0) == 0 ) {
+            scratchBuff[0] = PZERO;
+            scratchBuff[numWGs] = PZERO;
+        }
+        return;
+    #endif
+
+    // First pass: find the largest absolute element seen by the whole work-group
+    // i.e calculating scale
+    %TYPE maxFound = (%TYPE) -MAX;
+
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+
+        vReg1 = fabs( vReg1 );
+        %TYPE regMax = %REDUCE_MAX( vReg1 );
+        maxFound = fmax( maxFound, regMax );
+    }
+    // Scalar tail, continuing from the gOffset left behind by the vector loop above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1;
+
+        sReg1 = X[gOffset * incx];
+        sReg1 = fabs( sReg1 );
+        maxFound = fmax( maxFound, sReg1 );
+    }
+
+    %REDUCTION_BY_MAX( maxFound );
+
+    __local %PTYPE _scale;
+
+    if( (get_local_id(0)) == 0 ) {
+        #ifdef COMPLEX
+            _scale = fmax( maxFound.even, maxFound.odd );
+        #else
+            _scale = maxFound;
+        #endif
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // At this point we have scale.
+    // Now we calculate ssq by loading the array again and dividing the
+    // elements by scale and squaring it.
+
+    %TYPE ssq = ZERO;
+    %PTYPE scaleOfWG = _scale;
+
+    // If scaleOfWG was zero, that means the whole array encountered before was filled with zeroes
+    // Note: _scale is a local variable, either all enter or none
+    if(isnotequal(scaleOfWG, PZERO))
+    {
+        for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+        {
+            %TYPE%V vReg1;
+
+            #ifdef INCX_NONUNITY
+                %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+            #else
+                vReg1 = %VLOAD( 0, (X + gOffset) );
+            #endif
+
+            vReg1 = fabs( vReg1 );
+            %TYPE%V tempSsq = (vReg1 / scaleOfWG) * (vReg1 / scaleOfWG);
+
+            ssq += %REDUCE_SUM( tempSsq );
+        }
+
+        for( ; gOffset<N; gOffset++ )
+        {
+            %TYPE sReg1;
+
+            sReg1 = X[gOffset * incx];
+            sReg1 = fabs( sReg1 );
+
+            ssq += (sReg1 / scaleOfWG) * (sReg1 / scaleOfWG);
+        }
+
+        %REDUCTION_BY_SUM( ssq );
+    }
+    // Work-item 0 publishes the scale and the scaled sum of squares of this work-group
+    if( (get_local_id(0)) == 0 ) {
+        scratchBuff[ get_group_id(0) ] = scaleOfWG;
+
+        #ifdef COMPLEX
+            scratchBuff[ numWGs + get_group_id(0) ] = ssq.even + ssq.odd;
+        #else
+            scratchBuff[ numWGs + get_group_id(0) ] = ssq;
+        #endif
+    }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/reduction.cl b/src/library/blas/gens/clTemplates/reduction.cl
new file mode 100644
index 0000000..5a79f1f
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/reduction.cl
@@ -0,0 +1,352 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+static const char *red_sum_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+// Sum reduction: adds the N elements of _X that start at offx and stores the total in _res[offRes].
+__kernel void %PREFIXred_sum_kernel( __global %TYPE *_X, __global %TYPE *_res,
+                                                    uint N, uint offx, uint offRes )
+{
+ 	__global %TYPE *X = _X + offx;
+    __global %TYPE *res = _res + offRes;
+    %TYPE redVal = (%TYPE) 0.0;
+    // Vector pass: each work-item adds whole vectors to its private redVal, advancing by the global size each step.
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V reg1;
+        reg1 = %VLOAD( 0, (X + gOffset) );
+        redVal +=  %REDUCE_SUM( reg1 );
+        }
+    // Tail pass: only a work-item whose gOffset is still below N (less than one full vector left) enters this loop
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        redVal += X[gOffset];
+    }
+
+    // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier
+    %REDUCTION_BY_SUM( redVal );
+    // Local id 0 of every work-group writes res[0] - presumably launched as a single work-group (confirm on the host side).
+    if( (get_local_id(0)) == 0 ) {
+        res[0] = redVal;
+    }
+}
+\n";
+
+
+static const char *red_max_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+        #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+    #define MAX 0x1.fffffffffffffp1023      // Max in case of d/z (values from khronos site)
+#else
+    #define MAX 0x1.fffffep127f             // Max in case of s/c
+#endif
+// Max reduction: stores the largest of the N elements of _X (starting at offx) in _res[offRes].
+__kernel void %PREFIXred_max_kernel( __global %TYPE *_X, __global %TYPE *_res,
+                                                    uint N, uint offx, uint offRes )
+{
+ 	__global %TYPE *X = _X + offx;
+    __global %TYPE *res = _res + offRes;
+    %TYPE redVal = (%TYPE) - MAX;
+    // redVal starts at -MAX so that any loaded element replaces it through fmax.
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V reg1;
+        reg1 = %VLOAD( 0, (X + gOffset) );
+        %TYPE scalarMax = %REDUCE_MAX( reg1 );
+        redVal =  fmax( redVal, scalarMax );
+        }
+    // Tail pass: only a work-item whose gOffset is still below N (less than one full vector left) enters this loop
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        redVal = fmax( redVal, X[gOffset] );
+    }
+
+    // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier
+    %REDUCTION_BY_MAX( redVal );
+    // Local id 0 of every work-group writes res[0] - presumably launched as a single work-group (confirm on the host side).
+    if( (get_local_id(0)) == 0 ) {
+        res[0] = redVal;
+    }
+}
+\n";
+
+static const char *red_min_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+        #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+
+    #define MAX 0x1.fffffffffffffp1023      // Max in case of d/z (values from khronos site)
+#else
+    #define MAX 0x1.fffffep127f             // Max in case of s/c
+#endif
+// Min reduction: stores the smallest of the N elements of _X (starting at offx) in _res[offRes].
+__kernel void %PREFIXred_min_kernel( __global %TYPE *_X, __global %TYPE *_res,
+                                                    uint N, uint offx, uint offRes )
+{
+ 	__global %TYPE *X = _X + offx;
+    __global %TYPE *res = _res + offRes;
+    %TYPE redVal = (%TYPE) MAX;
+    // redVal starts at +MAX so that any loaded element replaces it through fmin.
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V reg1;
+        reg1 = %VLOAD( 0, (X + gOffset) );
+        %TYPE scalarMin = %REDUCE_MIN( reg1 );
+        redVal =  fmin( redVal, scalarMin );
+        }
+    // Tail pass: only a work-item whose gOffset is still below N (less than one full vector left) enters this loop
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        redVal = fmin( redVal, X[gOffset] );
+    }
+
+    // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier
+    %REDUCTION_BY_MIN( redVal );
+    // Local id 0 of every work-group writes res[0] - presumably launched as a single work-group (confirm on the host side).
+    if( (get_local_id(0)) == 0 ) {
+        res[0] = redVal;
+    }
+}
+\n";
+
+
+static const char *red_with_index_kernel = "
+
+/******************************************************
+ *  Implementations available for REDUCTION_BY_MAX
+     0 - ATOMIC_FLI
+     1 - REG_FLI,
+     2 - ATOMIC_FHI,
+     3 - REG_FHI
+
+    Implementation available for REDUCE_MAX
+    0 - FHI
+    1 - FLI
+ ***************************************************/
+
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+        #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+
+    #define MIN 0x1.0p-1022         // Min in case of d/z (values from khronos site)
+#else
+    #define MIN 0x1.0p-126f         // Min in case of s/c
+#endif
+
+// Max-with-index reduction: _X holds N values followed in the same buffer by N uint indices (read through XIndex).
+// The index paired with the largest value is stored in _res[offRes].
+__kernel void %PREFIXred_with_index_kernel( __global %TYPE *_X, __global uint *_res,
+                                                    uint N, uint offx, uint offRes )
+{
+ 	__global %TYPE *X = _X + offx;
+    __global uint *XIndex = (__global uint*)(&X[N]);
+    __global uint *res = _res + offRes;
+    %TYPE maxVal = (%TYPE)MIN, val = (%TYPE)MIN;
+    uint maxIndex = 0, index = 0;
+    // The comparisons below are strict, so a work-item that sees no value above MIN keeps maxIndex at 0.
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1 = %VLOAD( 0, (X + gOffset) );
+
+        %REDUCE_MAX(vReg1,val,index,1); // Find max within a vector
+        if(val > maxVal)
+        {
+            maxVal = val;
+            maxIndex = XIndex[(gOffset + index)];
+    }
+    }
+    // Tail pass: only a work-item whose gOffset is still below N (less than one full vector left) enters this loop
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sreg1 = X[gOffset];
+        if(sreg1 > maxVal)
+        {
+            maxVal = sreg1;
+            maxIndex = XIndex[gOffset];
+        }
+    }
+
+    // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier
+#ifdef REDUCE_MAX_WITH_INDEX_ATOMICS
+    %REDUCTION_BY_MAX(maxVal,maxIndex,0);
+#else
+    %REDUCTION_BY_MAX(maxVal,maxIndex,1);
+#endif
+    // The last macro argument picks the implementation listed in the table at the top (0 - ATOMIC_FLI, 1 - REG_FLI).
+    // Local id 0 of every work-group writes res[0] - presumably launched as a single work-group (confirm on the host side).
+    if(get_local_id(0) == 0)
+    {
+        res[0] = maxIndex;
+    }
+}
+\n";
+
+
+static const char *red_hypot_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+        #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+// Hypot reduction: folds the N elements of _X (starting at offx) pairwise with hypot() into _res[offRes].
+__kernel void %PREFIXred_hypot_kernel( __global %TYPE *_X, __global %TYPE *_res,
+                                                    uint N, uint offx, uint offRes )
+{
+ 	__global %TYPE *X = _X + offx;
+    __global %TYPE *res = _res + offRes;
+    %TYPE redVal = (%TYPE) 0.0;
+    // Vector pass: each work-item folds whole vectors into its private redVal, advancing by the global size each step.
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V reg1;
+        reg1 = %VLOAD( 0, (X + gOffset) );
+        %TYPE scalarHypot = %REDUCE_HYPOT( reg1 );
+        redVal =  hypot( redVal, scalarHypot );
+    }
+    // Tail pass: only a work-item whose gOffset is still below N (less than one full vector left) enters this loop
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        redVal = hypot( redVal, X[gOffset] );
+    }
+
+    // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier
+    %REDUCTION_BY_HYPOT( redVal );
+    // Local id 0 of every work-group writes res[0] - presumably launched as a single work-group (confirm on the host side).
+    if( (get_local_id(0)) == 0 ) {
+        res[0] = redVal;
+    }
+}
+\n";
+
+static const char *red_ssq_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+        #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+    #define MAX 0x1.fffffffffffffp1023      // Max in case of d/z (values from khronos site)
+#else
+    #define MAX 0x1.fffffep127f             // Max in case of s/c
+#endif
+// Combines N partial (scale, ssq) pairs: X[0..N) holds the scales and X[N..2N) the matching sums of squares.
+#define ZERO (%TYPE)0.0
+// Result: res[0] = maxScale * sqrt( sum over i of (scale_i / maxScale)^2 * ssq_i ).
+// Since scale & ssq are always of primitive type,
+// This kernel will always be called only for float/double
+
+__kernel void %PREFIXred_ssq_kernel( __global %TYPE *_X, __global %TYPE *_res,
+                                                    uint N, uint offx, uint offRes )
+{
+ 	__global %TYPE *X = _X + offx;
+    __global %TYPE *res = _res + offRes;
+    %TYPE scale = -MAX;
+    // Pass 1: find the largest scale among the N partial results.
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V scale1;
+        scale1 = %VLOAD( 0, (X + gOffset) );
+
+        %TYPE regMax = %REDUCE_MAX( scale1 );
+        scale = fmax( scale, regMax );
+    }
+
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg;
+        sReg = X[gOffset];
+        scale = fmax( scale, sReg );
+    }
+
+    %REDUCTION_BY_MAX( scale );
+    // Local id 0 publishes its reduced scale through local memory so that every work-item reads the same value.
+    __local %TYPE _scaleOfWG;
+
+    if( (get_local_id(0)) == 0 ) {
+        _scaleOfWG = scale;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // At this point we have scale.
+    // Pass 2: load the array again, rescale each partial ssq by (its scale / common scale) squared
+    // and add the rescaled values up.
+
+    %TYPE ssq = (%TYPE) 0.0;
+    %TYPE scaleOfWG = _scaleOfWG;
+
+    // If scale was zero, that means the whole array encountered before was filled with zeroes
+    // Note: scale is a local variable, either all enter or none
+    if(isnotequal(scaleOfWG, ZERO))
+    {
+        for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+        {
+            %TYPE%V scale1, ssq1;
+            scale1 = %VLOAD( 0, (X + gOffset) );
+            ssq1 = %VLOAD( 0, (X + gOffset + N) );
+
+            %TYPE%V tempSsq = (scale1 / scaleOfWG) * (scale1 / scaleOfWG) * ssq1;
+
+            ssq += %REDUCE_SUM( tempSsq );
+        }
+
+        for( ; gOffset<N; gOffset++ )
+        {
+            %TYPE scale1, ssq1;
+            scale1 = X[gOffset];
+            ssq1 = X[gOffset + N];
+
+            ssq += (scale1 / scaleOfWG) * (scale1 / scaleOfWG) * ssq1;
+        }
+
+        %REDUCTION_BY_SUM( ssq );
+    }
+    // When the common scale is zero the block above is skipped and ssq stays 0, so res[0] becomes 0.
+    if( (get_local_id(0)) == 0 ) {
+        res[0] = scaleOfWG * sqrt(ssq);
+    }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/rotg.cl b/src/library/blas/gens/clTemplates/rotg.cl
new file mode 100644
index 0000000..701fe9d
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/rotg.cl
@@ -0,0 +1,112 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+static const char *rotg_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+// Builds a plane rotation (C, S) from the scalars _A[offa] and _B[offb]; only global work-item 0 computes and stores.
+#define ZERO (%TYPE)0.0
+#define PZERO (%PTYPE)0.0
+// Real types store A = R, B = Z, C and S.  Complex types store C, S and A = alpha * norm and leave B untouched.
+// CABS(A) returns SQRT(REALPART(A)**2+IMAGPART(A)**2) -- opencl function length() computes the same
+#define CABS( arg )  length( arg )
+
+__kernel void %PREFIXrotg_kernel( __global %TYPE *_A, __global %TYPE *_B, __global %PTYPE *_C,
+                                __global %TYPE *_S, uint offa, uint offb, uint offc, uint offs )
+{
+    %TYPE Areg, Breg, Sreg;
+    %PTYPE Creg;
+
+	Areg = _A[offa];
+	Breg = _B[offb];
+
+	if(get_global_id(0) == 0)       // Only 1 thread will work
+	{
+	    #ifndef COMPLEX         // Real and complex math for rotg are different according to netlib
+	        %TYPE R, Z, roe, scale, absA, absB;
+
+	        absA = fabs(Areg);
+	        absB = fabs(Breg);
+	        // roe is the input with the larger magnitude (B on a tie); R takes its sign further below
+	        roe = (isgreater(absA, absB))? Areg: Breg;
+	        scale = absA + absB;
+
+	        if(isequal(scale, ZERO))
+	        {
+	            Creg = 1.0;
+	            Sreg = ZERO;
+	            R = ZERO;
+	            Z = ZERO;
+	        }
+	        else
+	        {
+	            // R = scale * sqrt( pown((Areg/scale), 2) + pown((Breg/scale), 2) );
+	            // gentype hypot (gentype x, gentype y) -- Computes the value of the
+	            //          square root of x2+ y2 without undue overflow or underflow.
+	            R = scale * hypot( (Areg/scale), (Breg/scale) );
+	            R = (isless(roe, ZERO))? (-R): R;
+	            Creg = Areg / R;
+	            Sreg = Breg / R;
+	            Z = (isgreater(absA, absB))? Sreg:
+	                    ( (isnotequal(Creg, ZERO))? (1.0/Creg): 1.0 );
+	        }
+	        _A[offa] = R;
+	        _B[offb] = Z;
+	        _C[offc] = Creg;
+	        _S[offs] = Sreg;
+	    #else           // For complex type
+	        %TYPE alpha, temp;
+	        %PTYPE norm, scale, cabsA, cabsB;
+
+	        cabsA = CABS(Areg);
+	        cabsB = CABS(Breg);
+	        // A with zero magnitude: C = 0, S = (1, 0) and A takes the value of B
+	        if(isequal(cabsA, PZERO))
+	        {
+	            Creg = PZERO;
+	            Sreg = (%TYPE)(1.0, 0.0);
+	            Areg = Breg;
+	        }
+	        else
+	        {
+	            scale = cabsA + cabsB;
+	            // norm = scale * sqrt( pown( CABS(Areg/scale), 2 ) + pown( CABS(Breg/scale), 2 ) );
+	            norm = scale * hypot( CABS(Areg/scale), CABS(Breg/scale) );
+	            alpha = Areg / cabsA;
+	            Creg = cabsA / norm;
+	            // S = alpha * conj(B) / norm, assuming CONJUGATE(1, x) conjugates x in place and MUL(d, a, b) sets d = a * b
+	            temp = Breg;
+	            %CONJUGATE(1, temp);
+	            %MUL( Sreg, alpha, temp );
+	            Sreg = Sreg / norm;
+
+	            Areg = alpha * norm;
+	        }
+	        _C[offc] = Creg;
+	        _S[offs] = Sreg;
+	        _A[offa] = Areg;
+	    #endif      // COMPLEX
+    }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/rotm.cl b/src/library/blas/gens/clTemplates/rotm.cl
new file mode 100644
index 0000000..04fb34b
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/rotm.cl
@@ -0,0 +1,120 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+static const char *rotm_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+// Applies a 2x2 matrix H to every pair:  X[i] = H11*X[i] + H12*Y[i]  and  Y[i] = H21*X[i] + H22*Y[i]  (old X on both sides).
+#define ZERO    (%TYPE)0.0
+#define ONE     (%TYPE)1.0
+#define TWO     (%TYPE)2.0
+// Without DO_ROT, H is read from _param as (flag, H11, H21, H12, H22); with DO_ROT, H11 = H22 = C, H12 = S, H21 = -S.
+__kernel void %PREFIXrotm_kernel( __global %TYPE *_X, __global %TYPE *_Y, uint N,
+                                uint offx, int incx, uint offy, int incy
+#ifndef DO_ROT
+                                , __global %TYPE *_param, uint offParam             // Rotm parameters
+#else
+                                , %PTYPE C,  %PTYPE S                               // Rot parameters
+#endif
+                                )
+{
+	__global %TYPE *X = _X + offx;
+	__global %TYPE *Y = _Y + offy;
+    // For a negative increment start at the far end, so that gOffset * inc (which is negative) stays inside the vector.
+    if ( incx < 0 ) {
+        X = X + (N - 1) * abs(incx);
+    }
+    if ( incy < 0 ) {
+        Y = Y + (N - 1) * abs(incy);
+    }
+
+    %PTYPE H11, H21, H12, H22, flag;    // All these are of PTYPE for rot and rotm
+
+    #ifndef DO_ROT
+    // Incase of Rotm
+        flag = _param[offParam];
+        H11 = _param[offParam+1];
+        H21 = _param[offParam+2];
+        H12 = _param[offParam+3];
+        H22 = _param[offParam+4];
+        // flag overrides some entries just read: 0 gives a unit diagonal, 1 gives H21 = -1 and H12 = 1, -2 gives the identity
+        (flag == (ZERO))? (H11 = ONE, H22 = ONE)                            : 1;    // 1 is dummy here to avoid compilation error
+        (flag == (ONE) )? (H21 = -ONE, H12 = ONE)                           : 1;
+        (flag == (-TWO))? (H11 = ONE, H21 = ZERO, H12 = ZERO, H22 = ONE)    : 1;
+    #else   // ROT
+        H11 = C;
+        H12 = S;
+        H21 = -S;
+        H22 = C;
+    #endif
+    // Vector pass: each work-item handles whole vectors, advancing by the global size each step.
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1, vReg2, temp;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+
+        #ifdef INCY_NONUNITY
+            %VLOADWITHINCX( vReg2, (Y + (gOffset*incy)), incy);
+        #else
+            vReg2 = %VLOAD( 0, (Y + gOffset) );
+        #endif
+        // temp keeps the new X so that the new Y is still computed from the old X held in vReg1
+        temp = (vReg1 * H11) + (vReg2 * H12);
+        vReg2 = (vReg1 * H21) + (vReg2 * H22);
+
+        #ifdef INCX_NONUNITY
+            %VSTOREWITHINCX( (X + (gOffset * incx)), temp, incx );
+        #else
+            %VSTORE( temp, 0 ,(X + (gOffset * incx)) );
+        #endif
+
+        #ifdef INCY_NONUNITY
+            %VSTOREWITHINCX( (Y + (gOffset * incy)), vReg2, incy );
+        #else
+            %VSTORE( vReg2, 0 ,(Y + (gOffset * incy)) );
+        #endif
+    }
+
+    // Tail pass: only a work-item whose gOffset is still below N (less than one full vector left) enters this loop
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1, sReg2, temp;
+        sReg1 = X[gOffset * incx];
+        sReg2 = Y[gOffset * incy];
+
+        temp = (sReg1 * H11) + (sReg2 * H12);
+        sReg2 = (sReg1 * H21) + (sReg2 * H22);
+
+        X[gOffset * incx] = temp;
+        Y[gOffset * incy] = sReg2;
+        }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/rotmg.cl b/src/library/blas/gens/clTemplates/rotmg.cl
new file mode 100644
index 0000000..13bb0a7
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/rotmg.cl
@@ -0,0 +1,208 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+static const char *rotmg_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+// Computes the rotmg parameters (flag, H11, H21, H12, H22) from the scalars D1, D2, X1 and Y1.
+// Rotmg exists only for S/D
+#define ZERO    (%TYPE)0.0
+#define ONE     (%TYPE)1.0
+#define TWO     (%TYPE)2.0
+// GAM and its square bound the rescaling of D1 and D2 in the scale check further below.
+#define GAM     (%TYPE)4096.0
+#define GAMSQ   (%TYPE)( GAM * GAM )
+#define RGAMSQ  (%TYPE)( 1.0 / GAMSQ )
+// Only global work-item 0 works: it updates D1, D2 and X1 in place and writes flag and the H entries into param[0..4].
+__kernel void %PREFIXrotmg_kernel( __global %TYPE *_D1, __global %TYPE *_D2, __global %TYPE *_X1,
+                                __global %TYPE *_Y1, __global %TYPE *_param,
+                                uint offD1, uint offD2, uint offX1, uint offY1, uint offParam )
+{
+	%TYPE D1, D2, X1, Y1;
+	%TYPE flag, H11, H12, H21, H22;                 // elements of PARAM
+	__global %TYPE *param = _param + offParam;
+
+    if(get_global_id(0) == 0)       // Only 1 thread will work
+	{
+        %TYPE P1, P2, Q1, Q2, temp, U;
+
+        D1 = _D1[offD1];
+        D2 = _D2[offD2];
+        X1 = _X1[offX1];
+        Y1 = _Y1[offY1];
+
+        if(isless(D1, ZERO))
+        {
+            flag = -ONE;
+            H11 = ZERO;  H12 = ZERO;  H21 = ZERO;  H22 = ZERO;
+            D1 = ZERO;   D2 = ZERO;   X1 = ZERO;
+        }
+        else                                // CASE D1 NONNEGATIVE
+        {
+            P2 = D2 * Y1;
+            if(isequal(P2, ZERO))
+            {
+                flag = -TWO;
+                param[0] = flag;
+                return;
+            }
+            // Regular case
+            P1 = D1 * X1;
+            Q2 = P2 * Y1;
+            Q1 = P1 * X1;
+
+            if(isgreater( fabs(Q1), fabs(Q2) ))
+            {
+                H21 = -Y1 / X1;
+                H12 = P2 / P1;
+                U = ONE - (H12 * H21);
+
+                if(isgreater( U, ZERO ))
+                {
+                    flag = ZERO;
+                    D1 = D1 / U;
+                    D2 = D2 / U;
+                    X1 = X1 * U;
+                }
+                else
+                {
+                    // U <= 0 (or NaN) should only arise from rounding. Without this branch flag and H
+                    // would be read uninitialized below; the netlib reference zeroes H, D1, D2 and X1 here.
+                    flag = -ONE;
+                    H11 = ZERO;  H12 = ZERO;  H21 = ZERO;  H22 = ZERO;
+                    D1 = ZERO;   D2 = ZERO;   X1 = ZERO;
+                }
+            }
+            else
+            {
+                if(isless(Q2, ZERO))
+                {
+                    flag = -ONE;
+                    H11 = ZERO;  H12 = ZERO;  H21 = ZERO;  H22 = ZERO;
+                    D1 = ZERO;   D2 = ZERO;   X1 = ZERO;
+                }
+                else
+                {
+                    flag = ONE;
+                    H11 = P1 / P2;
+                    H22 = X1 / Y1;
+                    U = ONE + (H11 * H22);
+                    temp = D2 / U;
+                    D2 = D1 / U;
+                    D1 = temp;
+                    X1 = Y1 * U;
+                }
+            }
+            // Scale check: rescale D1 (with X1, H11, H12) and D2 (with H21, H22) by factors of GAMSQ until
+            // D1 and |D2| are within [RGAMSQ, GAMSQ]; any rescaling switches flag to -1 (full H).
+            if(isnotequal(D1, ZERO))
+            {
+                while(isless(D1, RGAMSQ) || isgreater(D1, GAMSQ))
+                {
+                    if(isequal(flag, ZERO))
+                    {
+                        H11 = ONE;
+                        H22 = ONE;
+                        flag = -ONE;
+                    }
+                    else
+                    {
+                        H21 = -ONE;
+                        H12 = ONE;
+                        flag = -ONE;
+                    }
+                    if(isless(D1, RGAMSQ))
+                    {
+                        D1 = D1 * GAMSQ;
+                        X1 = X1 / GAM;
+                        H11 = H11 / GAM;
+                        H12 = H12 / GAM;
+                    }
+                    else
+                    {
+                        D1 = D1 / GAMSQ;
+                        X1 = X1 * GAM;
+                        H11 = H11 * GAM;
+                        H12 = H12 * GAM;
+                    }
+                }   // End of while
+            }
+
+            if(isnotequal(D2, ZERO))
+            {
+                while(isless( fabs(D2), RGAMSQ ) || isgreater( fabs(D2), GAMSQ ))
+                {
+                    if(isequal(flag, ZERO))
+                    {
+                        H11 = ONE;
+                        H22 = ONE;
+                        flag = -ONE;
+                    }
+                    else
+                    {
+                        H21 = -ONE;
+                        H12 = ONE;
+                        flag = -ONE;
+                    }
+                    if(isless( fabs(D2), RGAMSQ ))
+                    {
+                        D2 = D2 * GAMSQ;
+                        H21 = H21 / GAM;
+                        H22 = H22 / GAM;
+                    }
+                    else
+                    {
+                        D2 = D2 / GAMSQ;
+                        H21 = H21 * GAM;
+                        H22 = H22 * GAM;
+                    }
+                }   // End of while
+            }
+        }
+
+        if(isless(flag, ZERO))
+        {
+            param[1] = H11;
+            param[2] = H21;
+            param[3] = H12;
+            param[4] = H22;
+        }
+        else if(isequal(flag, ZERO))
+        {
+            param[2] = H21;
+            param[3] = H12;
+        }
+        else
+        {
+            param[1] = H11;
+            param[4] = H22;
+        }
+
+        param[0] = flag;
+        _D1[offD1] = D1;
+        _D2[offD2] = D2;
+        _X1[offX1] = X1;
+    }   // global_id(0) == 0
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/scal.cl b/src/library/blas/gens/clTemplates/scal.cl
new file mode 100644
index 0000000..19348fd
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/scal.cl
@@ -0,0 +1,69 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+static const char *scal_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+// Scales X in place by alpha through the generator MUL/VMUL macros; a negative incx makes the kernel return at once.
+__kernel void %PREFIXscal_kernel( %TYPE alpha, __global %TYPE *_X, uint N, uint offx, int incx )
+{
+    if(incx < 0) {
+        return;
+    }
+
+	__global %TYPE *X = _X + offx;
+    // Vector pass: each work-item scales whole vectors, advancing by the global size each step,
+    // until less than one full vector is left for it.
+
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1, temp;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+
+        %VMUL( temp, vReg1, alpha );
+
+        #ifdef INCX_NONUNITY
+            %VSTOREWITHINCX( (X + (gOffset * incx)), temp, incx );
+        #else
+            %VSTORE( temp, 0 ,(X + (gOffset * incx)) );
+        #endif
+    }
+
+    // Tail pass: only a work-item whose gOffset is still below N (less than one full vector left) enters this loop
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1, temp;
+        sReg1 = X[gOffset * incx];
+        %MUL( temp, sReg1, alpha );
+        X[gOffset * incx] = temp;
+    }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/swap.cl b/src/library/blas/gens/clTemplates/swap.cl
new file mode 100644
index 0000000..91c86c6
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/swap.cl
@@ -0,0 +1,83 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+static const char *swap_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+// Exchanges the N elements of X and Y, honoring the offsets and the (possibly negative) increments.
+__kernel void %PREFIXswap_kernel( __global %TYPE *_X, __global %TYPE *_Y, uint N, uint offx, int incx, uint offy, int incy )
+{
+	__global %TYPE *X = _X + offx;
+	__global %TYPE *Y = _Y + offy;
+    // For a negative increment start at the far end, so that gOffset * inc (which is negative) stays inside the vector.
+    if ( incx < 0 ) {
+        X = X + (N - 1) * abs(incx);
+    }
+    if ( incy < 0 ) {
+        Y = Y + (N - 1) * abs(incy);
+    }
+    // Vector pass: each work-item swaps whole vectors, advancing by the global size each step.
+    int gOffset;
+    for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
+    {
+        %TYPE%V vReg1, vReg2;
+
+        #ifdef INCX_NONUNITY
+            %VLOADWITHINCX( vReg1, (X + (gOffset*incx)), incx);
+        #else
+            vReg1 = %VLOAD( 0, (X + gOffset) );
+        #endif
+
+        #ifdef INCY_NONUNITY
+            %VLOADWITHINCX( vReg2, (Y + (gOffset*incy)), incy);
+        #else
+            vReg2 = %VLOAD( 0, (Y + gOffset) );
+        #endif
+        // Cross-store: X receives what was read from Y (vReg2) and Y receives what was read from X (vReg1).
+        #ifdef INCX_NONUNITY
+            %VSTOREWITHINCX( (X + (gOffset * incx)), vReg2, incx );
+        #else
+            %VSTORE( vReg2, 0 ,(X + (gOffset * incx)) );
+        #endif
+
+        #ifdef INCY_NONUNITY
+            %VSTOREWITHINCX( (Y + (gOffset * incy)), vReg1, incy );
+        #else
+            %VSTORE( vReg1, 0 ,(Y + (gOffset * incy)) );
+        #endif
+    }
+
+    // Tail pass: only a work-item whose gOffset is still below N (less than one full vector left) enters this loop
+    // Using the same gOffset used above
+    for( ; gOffset<N; gOffset++ )
+    {
+        %TYPE sReg1, sReg2;
+        sReg1 = X[gOffset * incx];
+        sReg2 = Y[gOffset * incy];
+
+        X[gOffset * incx] = sReg2;
+        Y[gOffset * incy] = sReg1;
+        }
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/symm.cl b/src/library/blas/gens/clTemplates/symm.cl
new file mode 100644
index 0000000..597fa8b
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/symm.cl
@@ -0,0 +1,1020 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+//
+// NOTE:                                  OUTDATED FILE - NOT USED
+//
+// BUG NOTE:
+// The SYMM_C_KERNEL() suffers from TAIL BUG. Does not handle TAILS properly on the M and N side.
+// Needs to be treated like GEMM2 - Having a separate TAIL_RUN and trimming M and N on Non-tail Runs.
+// However, SYMM is now composed from GEMM. Only handling the diagonal portion depends on this kernel.
+// So, we will fix LOADA_SECOND and LOADB_SECOND appropriately and use this kernel.
+// This kernel should not be used at all.
+// In essence, one should review this kernel only for the __SYMM_DIAGONAL__ portion.
+//
+
+const char *SYMM_C_KERNEL= "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#if !defined(__SYMM_UPPER__) && !defined(__SYMM_LOWER__)
+	#error Upper or Lower must be defined
+#endif
+
+#if defined(__SYMM_UPPER__) && defined(__SYMM_LOWER__)
+	#error Both Upper and Lower cannot be defined together
+#endif
+
+#if !defined(__SYMM_LEFT__) && !defined(__SYMM_RIGHT__)
+	#error Neither Left nor Right defined
+#endif
+
+#if defined(__SYMM_LEFT__) && defined(__SYMM_RIGHT__)
+	#error Both LEFT and RIGHT cannot be defined together
+#endif
+
+#if defined(__SYMM_COLMAJOR__)
+	void VECTOR_STORE(%TYPE%V data, __global %TYPE *A, uint M, uint N, uint lda, uint row, uint col)
+	{	// Stores data at A[col*lda + row] onwards; lanes on rows >= M are dropped, and nothing is written when row >= M or col >= N.
+		if ( ((row + %V -1) < M) && (col < N) )
+		{
+			%VSTORE( data, 0, (&A[col*lda + row]));
+			return;
+		}
+
+		// Partial store: the vector straddles row M. Spill it to a private
+		// array and write back, one scalar at a time, only the (M - row)
+		// lanes whose row index is still in range.
+		if ((row < M) && (col < N))
+		{
+			int i=0;
+			%TYPE temp[%V];
+
+			//
+			// FIXME: ENDIAN ISSUES - Currently for Little endian
+			//		  Needs fixes for Big Endian
+			//
+			*(__private %TYPE%V *)temp = data; // views the private array as one vector so a single assignment fills every lane
+
+			for(; i< (M-row); i++) // M-row is below the vector width here, because the full-vector test above failed
+			{
+				A[col*lda + row + i] = temp[i];
+			}
+		}
+		return;
+	}
+
+	%TYPE%V VECTOR_LOAD(__global %TYPE const *A, uint M, uint N, uint lda, uint row, uint col)
+	{	// Loads one vector starting at A[col*lda + row]; lanes on rows >= M are zero, and the result is all zero when row >= M or col >= N.
+		%TYPE temp[%V]; // A is const-qualified because the kernel hands its read-only (const restrict) operand pointers to this helper
+		%TYPE%V retval = (%TYPE%V) 0;
+
+		if ( ((row + %V -1) < M) && (col < N) )
+		{
+			retval = %VLOAD(0, (&A[col*lda + row]));
+			return retval;
+		}
+
+		// Partial load: the vector straddles row M. Gather the (M - row)
+		// in-range elements one scalar at a time, zero the remaining lanes,
+		// then assemble the vector from the private array.
+		if ((row < M) && (col < N))
+		{
+			int i=0;
+
+			for(; i< (M-row); i++) // M-row is below the vector width here, because the full-vector test above failed
+			{
+				temp[i] = A[col*lda + row + i];
+			}
+			for(; i< (%V);  i++)
+			{
+				temp[i] = 0;
+			}
+			%VLOADWITHINCX(retval, temp, 1);
+		}
+		return retval;
+	}
+
+	%TYPE%V SYMM_VECTOR_LOAD_USING_SCALAR(__global %TYPE const *A, uint M, uint lda, uint row, uint col)
+	{	// Builds, lane by lane, the vector of elements (row+i, col) of the MxM symmetric matrix A of which only one triangle is stored.
+		%TYPE temp[%V]; // A is const-qualified because the kernel hands its read-only (const restrict) operand pointer to this helper
+		%TYPE%V retval;
+
+		for(uint i=0; i< (%V); i++)
+		{
+			if (((row + i) < M) && (col < M))
+			{
+				#ifdef __SYMM_UPPER__
+				if ((row + i) <= col)
+				#else
+				if ((row + i) >= col)
+				#endif
+				{
+					temp[i] = A[col*lda + row + i]; // element lies in the stored triangle: read it directly
+				} else {
+					temp[i] = A[(row+i)*lda + col]; // otherwise read the mirrored element (col, row+i)
+				}
+			} else {
+				temp[i] = (%TYPE) 0; // lanes outside the matrix contribute zero
+			}
+		}
+		%VLOADWITHINCX(retval, temp, 1 );
+		return retval;
+	}
+
+	%TYPE%V SYMM_VECTOR_LOAD(__global %TYPE const *A, uint M, uint lda, uint row, uint col)
+	{	// Loads the vector of elements (row.., col) of the symmetric MxM matrix A, picking the cheapest access that stays inside the stored triangle.
+		%TYPE%V retval = (%TYPE%V) 0; // returned unchanged (all zero) when row or col is outside the matrix
+
+		bool validAddress = ((row >= M) || (col >=M)) ? false : true;
+		bool fullyWithinUpperTriangle = validAddress && ((row + %V -1) <= col); // every lane has row index <= col
+		bool fullyWithinLowerTriangle = validAddress && (row > col) && ((row + %V -1) < M); // every lane is strictly below the diagonal and inside the matrix
+		bool protrudingLowerTriangle  = validAddress && ((row + %V -1) >= M); // the vector runs past the last row
+		bool inBetweenDiagonal  	  = validAddress && (!fullyWithinUpperTriangle) && (!fullyWithinLowerTriangle) && (!protrudingLowerTriangle);
+		if (fullyWithinLowerTriangle || fullyWithinUpperTriangle)
+		{
+			#ifdef __SYMM_UPPER__
+			if (fullyWithinUpperTriangle)
+			#else
+			if (fullyWithinLowerTriangle)
+			#endif
+			{
+				retval = %VLOAD(0, (&A[(col)*lda + (row)])); // whole vector is in the stored triangle: one contiguous load
+			} else {
+				retval = %VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda); // whole vector is in the other triangle: load the mirrored elements, passing lda as the increment
+			}
+		} else {
+			if (protrudingLowerTriangle || inBetweenDiagonal)
+			{
+				retval = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, row, col); // mixed case: decide lane by lane
+			}
+		}
+		return retval;
+	}
+
+	#ifdef __SYMM_LEFT__
+	// (A) MxM * (B) MxN - A is the symmetric operand. The kernel uses the _FIRST, _SECOND and _THIRD loaders for panels before, inside and after its diagonal (RED) column range, and _TAIL in its clamped tail loop.
+		%TYPE%V LOADA(__global %TYPE *A, uint M, uint K, uint lda, uint row, uint col)
+		{
+			return SYMM_VECTOR_LOAD(A, M, lda, row, col);
+		}
+		#ifdef __SYMM_LOWER__
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADA_FIRST(A,M,K,lda,row,col)	%VLOAD(0, (&A[(col)*lda + (row)]))
+		#elif defined(__SYMM_UPPER__)
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADA_FIRST(A,M,K,lda,row,col) 	%VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda)
+		#endif
+		#define LOADA_SECOND(A,M,K,lda,row,col)		SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, row, col)
+		#ifdef __SYMM_LOWER__
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADA_THIRD(A,M,K,lda,row, col)	%VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda)
+		#elif defined(__SYMM_UPPER__)
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADA_THIRD(A,M,K,lda,row, col)	%VLOAD(0, (&A[(col)*lda + (row)]))
+		#endif
+		#define LOADA_TAIL(A,M,K,lda,row,col) 		SYMM_VECTOR_LOAD_USING_SCALAR(A,M,lda,row,col)
+
+		%TYPE%V LOADB(__global %TYPE *B, uint K, uint N, uint ldb, uint row, uint col)
+		{
+			return VECTOR_LOAD(B, K, N, ldb, row, col );
+		}
+		#define LOADB_FIRST(B,K,N,ldb,row,col) 	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#define LOADB_SECOND(B,K,N,ldb,row,col) 	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#define LOADB_THIRD(B,K,N,ldb,row,col) 	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#define LOADB_TAIL(B,K,N,ldb,row,col)	VECTOR_LOAD(B, K, N, ldb, row, col)
+
+	#elif defined(__SYMM_RIGHT__)
+		// (A)MxN * (B)NxN - here B is the symmetric operand, so the triangle-aware loaders are attached to B and the plain loaders to A.
+		%TYPE%V LOADA(__global %TYPE *A, uint M, uint K, uint lda, uint row, uint col)
+		{
+			return VECTOR_LOAD(A, M, K, lda, row, col );
+		}
+		#define LOADA_FIRST(A,M,K,lda,row,col)	%VLOAD(0, (&A[(col)*(lda) + (row)]))
+		#define LOADA_SECOND(A,M,K,lda,row,col)	%VLOAD(0, (&A[(col)*(lda) + (row)]))
+		#define LOADA_THIRD(A,M,K,lda,row,col)	%VLOAD(0, (&A[(col)*(lda) + (row)]))
+		#define LOADA_TAIL(A,M,K,lda,row,col)	VECTOR_LOAD(A, M, K, lda, row, col)
+
+		%TYPE%V LOADB(__global %TYPE *B, uint K, uint N, uint ldb, uint row, uint col)
+		{
+			return SYMM_VECTOR_LOAD(B, N, ldb, row, col);
+		}
+		#ifdef __SYMM_UPPER__
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADB_FIRST(B,K,N,ldb,row,col)	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#elif defined(__SYMM_LOWER__)
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADB_FIRST(B,K,N,ldb,row,col)	%VLOADWITHINCXV2(0, (&B[(row)*(ldb)  + (col)]), ldb)
+		#endif
+		#define LOADB_SECOND(B,K,N,ldb,row,col)		SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, row, col)
+		#ifdef __SYMM_UPPER__
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADB_THIRD(B,K,N,ldb,row,col)	%VLOADWITHINCXV2(0, (&B[(row)*(ldb) + (col)]), ldb)
+		#elif defined(__SYMM_LOWER__)
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADB_THIRD(B,K,N,ldb,row,col)	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#endif
+		#define LOADB_TAIL(B,K,N,ldb,row,col)		SYMM_VECTOR_LOAD_USING_SCALAR(B, N,ldb,row,col)
+	#endif // Left, Right
+
+	__kernel void symm_C_kernel( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C,
+			       			  	uint M, uint N, uint _lda, uint _ldb, int ldc, uint offa, uint offb, uint offc, %TYPE alpha, %TYPE beta)
+	{	// Each work-item accumulates its share of the operand product in CVAL, then folds it into the MxN matrix C using alpha and beta.
+		__global %TYPE const *restrict A;
+		__global %TYPE const *restrict B;
+		__global %TYPE *C;
+		uint K; // length of the accumulation loop: M for LEFT, N for RIGHT
+		uint lda, ldb;
+		uint indexA, indexB, indexC; // NOTE(review): indexA and indexB are never used; indexC is assigned below but never read
+		uint rowA, colA, rowB, colB, rowC, colC; // NOTE(review): only rowA and colB are used
+		uint numGroupsOnY; // NOTE(review): unused
+		uint bidX, bidY;
+		uint row, col;
+		uint REDColStart, REDColEnd; // As the panel traverses these columns, it will slow down - Hence RED.
+		uint tid = get_local_id(0);
+		int panel; // NOTE(review): never used; the accumulation loops below declare their own uint panel
+		uint blockDimY;
+		C = _C + offc;
+	#ifdef __SYMM_LEFT__
+		// MxM * MxN
+		A = _A + offa;
+		lda = _lda;
+		B = _B + offb;
+		ldb = _ldb;
+		K = M;
+	#elif defined(__SYMM_RIGHT__)
+		// MxN * NxN - the two inputs trade places, so the local A is taken from _B and the local B from _A
+		A = _B + offb;
+		lda = _ldb;
+		B = _A + offa;
+		ldb = _lda;
+		K = N;
+	#endif
+
+		//
+		// %WIDTH - Preferably 16
+		// %ITEMY, %ITEMX - 1 Thread is responsible for %ITEMY * %ITEMX sub-matrix in C
+		//					%ITEMY must be divisible by %V
+		// The entire workgroup loops-together to complete ITEMY-ITEMX sub-matrix
+		//
+		uint threadsY = %WIDTH;
+		uint threadsX = get_local_size(0)/threadsY;
+		uint offsetY = (tid % threadsY) * %V;
+		uint offsetX = (tid / threadsY);
+
+		//
+		// Column-Major ordering of Workgroups
+		//
+		// %ITEMY - Number of elements , a workitem processes in Y direction.
+		// %ITEMX - Number of elements , a workitem processes in X direction.
+		//
+		// %V 	- Vectoring Width
+		// %PANEL(*) - Panel Width to access Rows of A and Columns of B
+		//		   Right now, %V is assumed to be the panel width.
+		//		   We dont use %PANEL in the current implementation.
+		//
+		blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1;
+		bidY = ( get_group_id(0) % ( blockDimY));
+		bidX = ( get_group_id(0) / ( blockDimY));
+
+		//
+		// <row,col> is the left-top of the TILE region
+		// in the output C matrix that will be determined
+		// by this workgroup
+		//
+		row =  (bidY * (threadsY * %ITEMY));
+		col =  (bidX * (threadsX * %ITEMX));
+
+		//
+		// REDColStart, REDColEnd:
+		// SYMM Matrix  multiplication proceeds by multiplying panels on A's block-row
+		// with panels on B's block-column.
+		// However due to symmetric nature of A/B matrix compounded by the fact that
+		// only upper OR lower triangle of the symm matrix is available, vector-loads
+		// are not possible while traversing certain regions of the matrix.
+		// REDColStart, REDColEnd identifies that region in which the panel crosses
+		// the diagonal. This region will be the slowest portion of the kernel next to
+		// processing the TAIL part.
+		//
+		#ifdef __SYMM_LEFT__
+			REDColStart = row;
+			REDColEnd = row  + (threadsY*(%ITEMY));
+		#elif defined(__SYMM_RIGHT__)
+			REDColStart = col;
+			REDColEnd = col + (threadsX*(%ITEMX));
+		#endif
+		rowA 	= 	row + offsetY;
+	   	colB 	= 	(col+offsetX);
+		indexC 	= 	(col+offsetX)*ldc + (row + offsetY);
+		bool tailBlock = ((row + threadsY*(%ITEMY)) > M) || ((col + threadsX*(%ITEMX)) > N); // tile overhangs M or N: skip the fast loop and do all the work in the clamped tail loop
+
+		%TYPE%V AVAL[%V][(%ITEMY_BY_V)]; // 8
+		%TYPE BVAL[%ITEMX][%V];
+		%TYPE%V CVAL[(%ITEMY_BY_V)][%ITEMX];
+
+		%IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+		for(uint i=0; i< (%ITEMY_BY_V); i++)
+		{
+			%IF(%ITEMX) #pragma unroll %ITEMX
+			for(uint j=0; j<(%ITEMX); j++)
+			{
+				CVAL[i][j] = (%TYPE%V) 0;
+			}
+		}
+
+		uint ACOL=0;
+		//
+		// 		SYMM
+		//
+		for(ACOL=0; ((tailBlock == false) && ((ACOL+%V-1) < K)); ACOL += %V /* %PANEL */)
+		{
+
+			if ((ACOL+%V-1) < REDColStart)
+			{
+				//
+				// Panel ends before the RED column range: load B values
+				//
+				%IF(%ITEMX) #pragma unroll %ITEMX
+				for(uint bcol = 0; bcol < %ITEMX; bcol++)
+				{
+					//
+					// PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+					//
+					*(__private %TYPE%V *)(&BVAL[bcol]) = LOADB_FIRST(B, K, N , ldb, ACOL, colB + (threadsX*bcol));
+				}
+
+				//
+				// Load A values
+				//
+				%IF(%ITEMY) #pragma unroll %ITEMY
+				for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+				{
+					const uint yiterations = %ITEMY_BY_V;
+					uint c = (i / yiterations);
+					uint r = (i % yiterations);
+
+					AVAL[c][r] = LOADA_FIRST(A, M, K, lda, rowA + r*threadsY*(%V), ACOL + c );
+				}
+			} else if (ACOL < REDColEnd)
+			{
+				//
+				// Panel overlaps the RED column range: load B values
+				//
+				%IF(%ITEMX) #pragma unroll %ITEMX
+				for(uint bcol = 0; bcol < %ITEMX; bcol++)
+				{
+					//
+					// PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+					//
+					*(__private %TYPE%V *)(&BVAL[bcol]) = LOADB_SECOND(B, K, N , ldb, ACOL, colB + (threadsX*bcol));
+				}
+
+				//
+				// Load A values
+				//
+				%IF(%ITEMY) #pragma unroll %ITEMY
+				for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+				{
+					const uint yiterations = %ITEMY_BY_V;
+					uint c = (i / yiterations);
+					uint r = (i % yiterations);
+
+					AVAL[c][r] = LOADA_SECOND(A, M, K, lda, rowA + r*threadsY*(%V), ACOL + c );
+				}
+			} else {
+				//
+				// Panel starts at or after the end of the RED column range: load B values
+				//
+				%IF(%ITEMX) #pragma unroll %ITEMX
+				for(uint bcol = 0; bcol < %ITEMX; bcol++)
+				{
+					//
+					// PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+					//
+					*(__private %TYPE%V *)(&BVAL[bcol]) = LOADB_THIRD(B, K, N , ldb, ACOL, colB + (threadsX*bcol));
+				}
+
+				//
+				// Load A values
+				//
+				%IF(%ITEMY) #pragma unroll %ITEMY
+				for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+				{
+					const uint yiterations = %ITEMY_BY_V;
+					uint c = (i / yiterations);
+					uint r = (i % yiterations);
+
+					AVAL[c][r] = LOADA_THIRD(A, M, K, lda, rowA + r*threadsY*(%V), ACOL + c );
+				}
+			}
+
+			%IF(%V) #pragma unroll %V
+			for(uint panel=0; panel < %V; panel++)
+			{
+				%IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+				for(uint i=0; i<(%ITEMY_BY_V); i++)
+				{
+					%IF(%ITEMX) #pragma unroll %ITEMX
+					for(uint j=0; j<(%ITEMX); j++)
+					{
+						%VMAD(CVAL[i][j] ,  AVAL[panel][i] , BVAL[j][panel]);
+					}
+				}
+			}
+
+			#ifdef SYMM_NEEDS_BARRIER
+			barrier(CLK_LOCAL_MEM_FENCE);
+			#endif
+		}
+
+		//
+		//  SYMM - 	The Tail....
+		//		The tail can wag past M and N. The LOAD routines clamp those accesses
+		//
+		for(; ACOL < K; ACOL += %V /* %PANEL */)
+		{
+			//
+			// Load B values
+			//
+			%IF(%ITEMX) #pragma unroll %ITEMX
+			for(uint bcol = 0; bcol < %ITEMX; bcol++)
+			{
+				//
+				// PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+				//
+				*(__private %TYPE%V *)(&BVAL[bcol]) = LOADB_TAIL(B, K, N , ldb, ACOL, colB + (threadsX*bcol));
+			}
+
+			//
+			// Load A values
+			//
+			%IF(%ITEMY) #pragma unroll %ITEMY
+			for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+			{
+				const uint yiterations = %ITEMY_BY_V;
+				uint c = (i / yiterations);
+				uint r = (i % yiterations);
+
+				AVAL[c][r] = LOADA_TAIL(A, M, K, lda, rowA + r*threadsY*(%V), ACOL + c );
+			}
+
+			%IF(%V) #pragma unroll %V
+			for(uint panel=0; panel < %V; panel++)
+			{
+				%IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+				for(uint i=0; i<(%ITEMY_BY_V); i++)
+				{
+					%IF(%ITEMX) #pragma unroll %ITEMX
+					for(uint j=0; j<(%ITEMX); j++)
+					{
+						%VMAD(CVAL[i][j] ,  AVAL[panel][i] , BVAL[j][panel]);
+					}
+				}
+			}
+
+			#ifdef SYMM_NEEDS_BARRIER
+			barrier(CLK_LOCAL_MEM_FENCE);
+			#endif
+		}
+
+
+		// STORE Result in C
+		// NOTE(review): C is loaded and scaled by beta even when beta is zero, so NaN/Inf
+		// already present in C could reach the result - confirm callers handle that case.
+		%TYPE%V reg , betareg, alphareg;
+		%TYPE%V alphav, betav;
+		alphav = %VMAKEVEC(alpha);
+		betav = %VMAKEVEC(beta);
+
+		%IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+		for(uint i=0; i< (%ITEMY_BY_V); i++)
+		{
+			%IF(%ITEMX) #pragma unroll %ITEMX
+			for(uint j=0; j<(%ITEMX); j++)
+			{
+				reg = VECTOR_LOAD(C, M, N, ldc, rowA + i*threadsY*%V, colB+(j*threadsX));
+				%VMUL(betareg, betav, reg);
+				%VMUL(alphareg, alphav, CVAL[i][j]);
+				%ADD( reg, betareg, alphareg);
+				VECTOR_STORE(reg, C, M, N, ldc, rowA + i*threadsY*%V, colB+(j*threadsX));
+			}
+		}
+		return;
+	}
+#else
+#error COLMAJOR Not Defined while compiling SYMM_C_KERNEL
+#endif
+";
+
+const char *SYMM_C_KERNEL_WORKING_EXCEPT_CSYMM_PROBLEM = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#if !defined(__SYMM_UPPER__) && !defined(__SYMM_LOWER__)
+	#error Upper or Lower must be defined
+#endif
+
+#if defined(__SYMM_UPPER__) && defined(__SYMM_LOWER__)
+	#error Both Upper and Lower cannot be defined together
+#endif
+
+#if !defined(__SYMM_LEFT__) && !defined(__SYMM_RIGHT__)
+	#error Neither Left nor Right defined
+#endif
+
+#if defined(__SYMM_LEFT__) && defined(__SYMM_RIGHT__)
+	#error Both LEFT and RIGHT cannot be defined together
+#endif
+
+#if defined(__SYMM_COLMAJOR__)
+	void VECTOR_STORE(%TYPE%V data, __global %TYPE *A, uint M, uint N, uint lda, uint row, uint col)
+	{	// Stores data at A[col*lda + row] onwards; lanes on rows >= M are dropped, and nothing is written when row >= M or col >= N.
+		if ( ((row + %V -1) < M) && (col < N) )
+		{
+			%VSTORE( data, 0, (&A[col*lda + row]));
+			return;
+		}
+
+		// Partial store: the vector straddles row M. Spill it to a private
+		// array and write back, one scalar at a time, only the (M - row)
+		// lanes whose row index is still in range.
+		if ((row < M) && (col < N))
+		{
+			int i=0;
+			%TYPE temp[%V];
+
+			//
+			// FIXME: ENDIAN ISSUES - Currently for Little endian
+			//		  Needs fixes for Big Endian
+			//
+			*(__private %TYPE%V *)temp = data; // views the private array as one vector so a single assignment fills every lane
+
+			for(; i< (M-row); i++) // M-row is below the vector width here, because the full-vector test above failed
+			{
+				A[col*lda + row + i] = temp[i];
+			}
+		}
+		return;
+	}
+
+	%TYPE%V VECTOR_LOAD(__global %TYPE const *A, uint M, uint N, uint lda, uint row, uint col)
+	{	// Loads one vector starting at A[col*lda + row]; lanes on rows >= M are zero, and the result is all zero when row >= M or col >= N.
+		%TYPE temp[%V]; // A is const-qualified because the kernel hands its read-only (const restrict) operand pointers to this helper
+		%TYPE%V retval = (%TYPE%V) 0;
+
+		if ( ((row + %V -1) < M) && (col < N) )
+		{
+			retval = %VLOAD(0, (&A[col*lda + row]));
+			return retval;
+		}
+
+		// Partial load: the vector straddles row M. Gather the (M - row)
+		// in-range elements one scalar at a time, zero the remaining lanes,
+		// then assemble the vector from the private array.
+		if ((row < M) && (col < N))
+		{
+			int i=0;
+
+			for(; i< (M-row); i++) // M-row is below the vector width here, because the full-vector test above failed
+			{
+				temp[i] = A[col*lda + row + i];
+			}
+			for(; i< (%V);  i++)
+			{
+				temp[i] = 0;
+			}
+			%VLOADWITHINCX(retval, temp, 1);
+		}
+		return retval;
+	}
+
+	%TYPE%V SYMM_VECTOR_LOAD_USING_SCALAR(__global %TYPE const *A, uint M, uint lda, uint row, uint col)
+	{	// Builds, lane by lane, the vector of elements (row+i, col) of the MxM symmetric matrix A of which only one triangle is stored.
+		%TYPE temp[%V]; // A is const-qualified because the kernel hands its read-only (const restrict) operand pointer to this helper
+		%TYPE%V retval;
+
+		for(uint i=0; i< (%V); i++)
+		{
+			if (((row + i) < M) && (col < M))
+			{
+				#ifdef __SYMM_UPPER__
+				if ((row + i) <= col)
+				#else
+				if ((row + i) >= col)
+				#endif
+				{
+					temp[i] = A[col*lda + row + i]; // element lies in the stored triangle: read it directly
+				} else {
+					temp[i] = A[(row+i)*lda + col]; // otherwise read the mirrored element (col, row+i)
+				}
+			} else {
+				temp[i] = (%TYPE) 0; // lanes outside the matrix contribute zero
+			}
+		}
+		%VLOADWITHINCX(retval, temp, 1 );
+		return retval;
+	}
+
+	%TYPE%V SYMM_VECTOR_LOAD(__global %TYPE const *A, uint M, uint lda, uint row, uint col)
+	{	// Loads the vector of elements (row.., col) of the symmetric MxM matrix A, picking the cheapest access that stays inside the stored triangle.
+		%TYPE%V retval = (%TYPE%V) 0; // returned unchanged (all zero) when row or col is outside the matrix
+
+		bool validAddress = ((row >= M) || (col >=M)) ? false : true;
+		bool fullyWithinUpperTriangle = validAddress && ((row + %V -1) <= col); // every lane has row index <= col
+		bool fullyWithinLowerTriangle = validAddress && (row > col) && ((row + %V -1) < M); // every lane is strictly below the diagonal and inside the matrix
+		bool protrudingLowerTriangle  = validAddress && ((row + %V -1) >= M); // the vector runs past the last row
+		bool inBetweenDiagonal  	  = validAddress && (!fullyWithinUpperTriangle) && (!fullyWithinLowerTriangle) && (!protrudingLowerTriangle);
+		if (fullyWithinLowerTriangle || fullyWithinUpperTriangle)
+		{
+			#ifdef __SYMM_UPPER__
+			if (fullyWithinUpperTriangle)
+			#else
+			if (fullyWithinLowerTriangle)
+			#endif
+			{
+				retval = %VLOAD(0, (&A[(col)*lda + (row)])); // whole vector is in the stored triangle: one contiguous load
+			} else {
+				retval = %VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda); // whole vector is in the other triangle: load the mirrored elements, passing lda as the increment
+			}
+		} else {
+			if (protrudingLowerTriangle || inBetweenDiagonal)
+			{
+				retval = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, row, col); // mixed case: decide lane by lane
+			}
+		}
+		return retval;
+	}
+
+	#ifdef __SYMM_LEFT__
+	// (A) MxM * (B) MxN - A is the symmetric operand. The kernel uses the _FIRST, _SECOND and _THIRD loaders for panels before, inside and after its diagonal (RED) column range, and _TAIL in its clamped tail loop.
+		%TYPE%V LOADA(__global %TYPE *A, uint M, uint K, uint lda, uint row, uint col)
+		{
+			return SYMM_VECTOR_LOAD(A, M, lda, row, col);
+		}
+		#ifdef __SYMM_LOWER__
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADA_FIRST(A,M,K,lda,row,col)	%VLOAD(0, (&A[(col)*lda + (row)]))
+		#elif defined(__SYMM_UPPER__)
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADA_FIRST(A,M,K,lda,row,col) 	%VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda)
+		#endif
+		#define LOADA_SECOND(A,M,K,lda,row,col)		SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, row, col)
+		#ifdef __SYMM_LOWER__
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADA_THIRD(A,M,K,lda,row, col)	%VLOADWITHINCXV2(0, (&A[(row)*lda + (col)]), lda)
+		#elif defined(__SYMM_UPPER__)
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADA_THIRD(A,M,K,lda,row, col)	%VLOAD(0, (&A[(col)*lda + (row)]))
+		#endif
+		#define LOADA_TAIL(A,M,K,lda,row,col) 		SYMM_VECTOR_LOAD_USING_SCALAR(A,M,lda,row,col)
+
+		%TYPE%V LOADB(__global %TYPE *B, uint K, uint N, uint ldb, uint row, uint col)
+		{
+			return VECTOR_LOAD(B, K, N, ldb, row, col );
+		}
+		#define LOADB_FIRST(B,K,N,ldb,row,col) 	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#define LOADB_SECOND(B,K,N,ldb,row,col) 	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#define LOADB_THIRD(B,K,N,ldb,row,col) 	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#define LOADB_TAIL(B,K,N,ldb,row,col)	VECTOR_LOAD(B, K, N, ldb, row, col)
+
+	#elif defined(__SYMM_RIGHT__)
+		// (A)MxN * (B)NxN - here B is the symmetric operand, so the triangle-aware loaders are attached to B and the plain loaders to A.
+		%TYPE%V LOADA(__global %TYPE *A, uint M, uint K, uint lda, uint row, uint col)
+		{
+			return VECTOR_LOAD(A, M, K, lda, row, col );
+		}
+		#define LOADA_FIRST(A,M,K,lda,row,col)	%VLOAD(0, (&A[(col)*(lda) + (row)]))
+		#define LOADA_SECOND(A,M,K,lda,row,col)	%VLOAD(0, (&A[(col)*(lda) + (row)]))
+		#define LOADA_THIRD(A,M,K,lda,row,col)	%VLOAD(0, (&A[(col)*(lda) + (row)]))
+		#define LOADA_TAIL(A,M,K,lda,row,col)	VECTOR_LOAD(A, M, K, lda, row, col)
+
+		%TYPE%V LOADB(__global %TYPE *B, uint K, uint N, uint ldb, uint row, uint col)
+		{
+			return SYMM_VECTOR_LOAD(B, N, ldb, row, col);
+		}
+		#ifdef __SYMM_UPPER__
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADB_FIRST(B,K,N,ldb,row,col)	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#elif defined(__SYMM_LOWER__)
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADB_FIRST(B,K,N,ldb,row,col)	%VLOADWITHINCXV2(0, (&B[(row)*(ldb)  + (col)]), ldb)
+		#endif
+		#define LOADB_SECOND(B,K,N,ldb,row,col)		SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, row, col)
+		#ifdef __SYMM_UPPER__
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADB_THIRD(B,K,N,ldb,row,col)	%VLOADWITHINCXV2(0, (&B[(row)*(ldb) + (col)]), ldb)
+		#elif defined(__SYMM_LOWER__)
+			// CHECK: KPRINTF behaviour with so many parentheses - if it fails, use parentheses in the caller
+			#define LOADB_THIRD(B,K,N,ldb,row,col)	%VLOAD(0, (&B[(col)*(ldb) + (row)]))
+		#endif
+		#define LOADB_TAIL(B,K,N,ldb,row,col)		SYMM_VECTOR_LOAD_USING_SCALAR(B, N,ldb,row,col)
+	#endif // Left, Right
+
+	__kernel void symm_C_kernel( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *C,
+			       			  	uint M, uint N, uint _lda, uint _ldb, int ldc, %TYPE alpha, %TYPE beta)
+	{	// Each work-item accumulates its share of the operand product in CVAL, then folds it into the MxN matrix C using alpha and beta. This variant takes no buffer offsets.
+		__global %TYPE const *restrict A;
+		__global %TYPE const *restrict B;
+		uint K; // length of the accumulation loop: M for LEFT, N for RIGHT
+		uint lda, ldb;
+		uint indexA, indexB, indexC; // NOTE(review): indexA and indexB are never used; indexC is assigned below but never read
+		uint rowA, colA, rowB, colB, rowC, colC; // NOTE(review): only rowA and colB are used
+		uint numGroupsOnY; // NOTE(review): unused
+		uint bidX, bidY;
+		uint row, col;
+		uint REDColStart, REDColEnd; // As the panel traverses these columns, it will slow down - Hence RED.
+		uint tid = get_local_id(0);
+		int panel; // NOTE(review): never used; the accumulation loops below declare their own uint panel
+		uint blockDimY;
+	#ifdef __SYMM_LEFT__
+		// MxM * MxN
+		A = _A;
+		lda = _lda;
+		B = _B;
+		ldb = _ldb;
+		K = M;
+	#elif defined(__SYMM_RIGHT__)
+		// MxN * NxN - the two inputs trade places, so the local A is taken from _B and the local B from _A
+		A = _B;
+		lda = _ldb;
+		B = _A;
+		ldb = _lda;
+		K = N;
+	#endif
+
+		//
+		// %WIDTH - Preferably 16
+		// %ITEMY, %ITEMX - 1 Thread is responsible for %ITEMY * %ITEMX sub-matrix in C
+		//					%ITEMY must be divisible by %V
+		// The entire workgroup loops-together to complete ITEMY-ITEMX sub-matrix
+		//
+		uint threadsY = %WIDTH;
+		uint threadsX = get_local_size(0)/threadsY;
+		uint offsetY = (tid % threadsY) * %V;
+		uint offsetX = (tid / threadsY);
+
+		//
+		// Column-Major ordering of Workgroups
+		//
+		// %ITEMY - Number of elements , a workitem processes in Y direction.
+		// %ITEMX - Number of elements , a workitem processes in X direction.
+		//
+		// %V 	- Vectoring Width
+		// %PANEL(*) - Panel Width to access Rows of A and Columns of B
+		//		   Right now, %V is assumed to be the panel width.
+		//		   We dont use %PANEL in the current implementation.
+		//
+		blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1;
+		bidY = ( get_group_id(0) % ( blockDimY));
+		bidX = ( get_group_id(0) / ( blockDimY));
+
+		//
+		// <row,col> is the left-top of the TILE region
+		// in the output C matrix that will be determined
+		// by this workgroup
+		//
+		row =  (bidY * (threadsY * %ITEMY));
+		col =  (bidX * (threadsX * %ITEMX));
+
+		//
+		// REDColStart, REDColEnd:
+		// SYMM Matrix  multiplication proceeds by multiplying panels on A's block-row
+		// with panels on B's block-column.
+		// However due to symmetric nature of A/B matrix compounded by the fact that
+		// only upper OR lower triangle of the symm matrix is available, vector-loads
+		// are not possible while traversing certain regions of the matrix.
+		// REDColStart, REDColEnd identifies that region in which the panel crosses
+		// the diagonal. This region will be the slowest portion of the kernel next to
+		// processing the TAIL part.
+		//
+		#ifdef __SYMM_LEFT__
+			REDColStart = row;
+			REDColEnd = row  + (threadsY*(%ITEMY));
+		#elif defined(__SYMM_RIGHT__)
+			REDColStart = col;
+			REDColEnd = col + (threadsX*(%ITEMX));
+		#endif
+		rowA 	= 	row + offsetY;
+	   	colB 	= 	(col+offsetX);
+		indexC 	= 	(col+offsetX)*ldc + (row + offsetY);
+		bool tailBlock = ((row + threadsY*(%ITEMY)) > M) || ((col + threadsX*(%ITEMX)) > N); // tile overhangs M or N: skip the fast loop and do all the work in the clamped tail loop
+
+		%TYPE%V AVAL[%V][(%ITEMY_BY_V)]; // 8
+		%TYPE BVAL[%ITEMX][%V];
+		%TYPE%V CVAL[(%ITEMY_BY_V)][%ITEMX];
+
+		%IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+		for(uint i=0; i< (%ITEMY_BY_V); i++)
+		{
+			%IF(%ITEMX) #pragma unroll %ITEMX
+			for(uint j=0; j<(%ITEMX); j++)
+			{
+				CVAL[i][j] = (%TYPE%V) 0;
+			}
+		}
+
+		uint ACOL=0;
+		//
+		// 		SYMM
+		//
+		for(ACOL=0; ((tailBlock == false) && ((ACOL+%V-1) < K)); ACOL += %V /* %PANEL */)
+		{
+
+			if ((ACOL+%V-1) < REDColStart)
+			{
+				//
+				// Panel ends before the RED column range: load B values
+				//
+				%IF(%ITEMX) #pragma unroll %ITEMX
+				for(uint bcol = 0; bcol < %ITEMX; bcol++)
+				{
+					//
+					// PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+					//
+					*(__private %TYPE%V *)(&BVAL[bcol]) = LOADB_FIRST(B, K, N , ldb, ACOL, colB + (threadsX*bcol));
+				}
+
+				//
+				// Load A values
+				//
+				%IF(%ITEMY) #pragma unroll %ITEMY
+				for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+				{
+					const uint yiterations = %ITEMY_BY_V;
+					uint c = (i / yiterations);
+					uint r = (i % yiterations);
+
+					AVAL[c][r] = LOADA_FIRST(A, M, K, lda, rowA + r*threadsY*(%V), ACOL + c );
+				}
+			} else if (ACOL < REDColEnd)
+			{
+				//
+				// Panel overlaps the RED column range: load B values
+				//
+				%IF(%ITEMX) #pragma unroll %ITEMX
+				for(uint bcol = 0; bcol < %ITEMX; bcol++)
+				{
+					//
+					// PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+					//
+					*(__private %TYPE%V *)(&BVAL[bcol]) = LOADB_SECOND(B, K, N , ldb, ACOL, colB + (threadsX*bcol));
+				}
+
+				//
+				// Load A values
+				//
+				%IF(%ITEMY) #pragma unroll %ITEMY
+				for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+				{
+					const uint yiterations = %ITEMY_BY_V;
+					uint c = (i / yiterations);
+					uint r = (i % yiterations);
+
+					AVAL[c][r] = LOADA_SECOND(A, M, K, lda, rowA + r*threadsY*(%V), ACOL + c );
+				}
+			} else {
+				//
+				// Panel starts at or after the end of the RED column range: load B values
+				//
+				%IF(%ITEMX) #pragma unroll %ITEMX
+				for(uint bcol = 0; bcol < %ITEMX; bcol++)
+				{
+					//
+					// PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+					//
+					*(__private %TYPE%V *)(&BVAL[bcol]) = LOADB_THIRD(B, K, N , ldb, ACOL, colB + (threadsX*bcol));
+				}
+
+				//
+				// Load A values
+				//
+				%IF(%ITEMY) #pragma unroll %ITEMY
+				for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+				{
+					const uint yiterations = %ITEMY_BY_V;
+					uint c = (i / yiterations);
+					uint r = (i % yiterations);
+
+					AVAL[c][r] = LOADA_THIRD(A, M, K, lda, rowA + r*threadsY*(%V), ACOL + c );
+				}
+			}
+
+			%IF(%V) #pragma unroll %V
+			for(uint panel=0; panel < %V; panel++)
+			{
+				%IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+				for(uint i=0; i<(%ITEMY_BY_V); i++)
+				{
+					%IF(%ITEMX) #pragma unroll %ITEMX
+					for(uint j=0; j<(%ITEMX); j++)
+					{
+						%VMAD(CVAL[i][j] ,  AVAL[panel][i] , BVAL[j][panel]);
+					}
+				}
+			}
+
+			#ifdef SYMM_NEEDS_BARRIER
+			barrier(CLK_LOCAL_MEM_FENCE);
+			#endif
+		}
+
+		//
+		//  SYMM - 	The Tail....
+		//		The tail can wag past M and N. The LOAD routines clamp those accesses
+		//
+		for(; ACOL < K; ACOL += %V /* %PANEL */)
+		{
+			//
+			// Load B values
+			//
+			%IF(%ITEMX) #pragma unroll %ITEMX
+			for(uint bcol = 0; bcol < %ITEMX; bcol++)
+			{
+				//
+				// PENDING: PANEL iteration to Load the Panel Depth iterating by %V
+				//
+				*(__private %TYPE%V *)(&BVAL[bcol]) = LOADB_TAIL(B, K, N , ldb, ACOL, colB + (threadsX*bcol));
+			}
+
+			//
+			// Load A values
+			//
+			%IF(%ITEMY) #pragma unroll %ITEMY
+			for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
+			{
+				const uint yiterations = %ITEMY_BY_V;
+				uint c = (i / yiterations);
+				uint r = (i % yiterations);
+
+				AVAL[c][r] = LOADA_TAIL(A, M, K, lda, rowA + r*threadsY*(%V), ACOL + c );
+			}
+
+			%IF(%V) #pragma unroll %V
+			for(uint panel=0; panel < %V; panel++)
+			{
+				%IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+				for(uint i=0; i<(%ITEMY_BY_V); i++)
+				{
+					%IF(%ITEMX) #pragma unroll %ITEMX
+					for(uint j=0; j<(%ITEMX); j++)
+					{
+						%VMAD(CVAL[i][j] ,  AVAL[panel][i] , BVAL[j][panel]);
+					}
+				}
+			}
+
+			#ifdef SYMM_NEEDS_BARRIER
+			barrier(CLK_LOCAL_MEM_FENCE);
+			#endif
+		}
+
+
+		// STORE Result in C
+		// NOTE(review): C is loaded and scaled by beta even when beta is zero, so NaN/Inf
+		// already present in C could reach the result - confirm callers handle that case.
+		%TYPE%V reg , betareg, alphareg;
+		%TYPE%V alphav, betav;
+		alphav = %VMAKEVEC(alpha);
+		betav = %VMAKEVEC(beta);
+
+		%IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
+		for(uint i=0; i< (%ITEMY_BY_V); i++)
+		{
+			%IF(%ITEMX) #pragma unroll %ITEMX
+			for(uint j=0; j<(%ITEMX); j++)
+			{
+				reg = VECTOR_LOAD(C, M, N, ldc, rowA + i*threadsY*%V, colB+(j*threadsX));
+				%VMUL(betareg, betav, reg);
+				%VMUL(alphareg, alphav, CVAL[i][j]);
+				%ADD( reg, betareg, alphareg);
+				VECTOR_STORE(reg, C, M, N, ldc, rowA + i*threadsY*%V, colB+(j*threadsX));
+			}
+		}
+		return;
+	}
+#else
+#error COLMAJOR Not Defined while compiling SYMM_C_KERNEL
+#endif
+";
+
diff --git a/src/library/blas/gens/clTemplates/symm_helper.cl b/src/library/blas/gens/clTemplates/symm_helper.cl
new file mode 100644
index 0000000..909787b
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/symm_helper.cl
@@ -0,0 +1,102 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+static const char *SYMM_HEMM_HELPER = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+// Fetch element (row, col) of the M x M symmetric (or, under __HEMM__, hermitian)
+// matrix A, of which only one triangle is read directly:
+//   - with __SYMM_UPPER__ the entries with row <= col are read as A[col*lda + row],
+//   - otherwise the entries with row >= col are read as A[col*lda + row].
+// Any other in-range entry is read from the mirrored position A[row*lda + col]
+// (and conjugated under __HEMM__). Out-of-range coordinates yield zero.
+%TYPE SYMM_SCALAR_LOAD(__global %TYPE const * restrict A, uint M, uint lda, uint row, uint col)
+{
+	//PENDING: Remove this Check for M. This will never happen
+	if ((row >= M) || (col >= M))
+	{
+		return (%TYPE) 0;
+	}
+
+	#ifdef __SYMM_UPPER__
+	const int inReadTriangle = (row <= col);
+	#else
+	const int inReadTriangle = (row >= col);
+	#endif
+
+	%TYPE element;
+
+	if (inReadTriangle)
+	{
+		element = A[col*lda + row];
+		#ifdef __HEMM__
+		// Diagonal entry: the .odd component is forced to zero
+		if (row == col) { element.odd = 0; }
+		#endif
+		return element;
+	}
+
+	// Mirrored entry: read the transposed position instead
+	element = A[row*lda + col];
+	#ifdef __HEMM__
+	%CONJUGATE(1, element);
+	#endif
+	return element;
+}
+
+// Assemble a %TYPE%V value one component at a time from the symmetric
+// (or, under __HEMM__, hermitian) matrix A.
+// Component k corresponds to matrix coordinate (row + k, col):
+//   - if the coordinate lies on the directly-read side (row + k <= col with
+//     __SYMM_UPPER__, row + k >= col otherwise) it is read from A[col*lda + row + k];
+//   - otherwise it is read from the mirrored position A[(row + k)*lda + col]
+//     and conjugated when __HEMM__ is defined;
+//   - if row + k or col is not below M the component is set to zero.
+// NOTE(review): the per-component iteration comes from the template keywords
+// VFOR / VFORINDEX / VFORSUFFIX, whose expansion is not visible in this file;
+// presumably one copy of the block per vector component - confirm against the generator.
+%TYPE%V SYMM_VECTOR_LOAD_USING_SCALAR(__global %TYPE const * restrict A, uint M, uint lda, uint row, uint col)
+{
+	//%TYPE symm_vec_load_temp[%V];
+	%TYPE%V symm_vec_retval;
+
+    //#pragma unroll %V
+	//for(uint index_i=0; index_i< (%V); index_i++)
+    %VFOR
+	{
+        //PENDING: Remove this Check for M. This will never happen
+		if (((row + %VFORINDEX) < M) && (col < M))
+		{
+			// Select between the direct read and the mirrored read
+			#ifdef __SYMM_UPPER__
+			if ((row + %VFORINDEX) <= col)
+			#else
+			if ((row + %VFORINDEX) >= col)
+			#endif
+			{
+				//symm_vec_load_temp[index_i] = A[(col)*(lda) + ((row) + index_i)];
+				symm_vec_retval%VFORSUFFIX = A[(col)*(lda) + ((row) + %VFORINDEX)];
+                #ifdef __HEMM__
+                //if ((row + index_i) == col) { symm_vec_load_temp[index_i].odd = 0; }
+                // Diagonal entry: the .odd component is forced to zero
+                if ((row + %VFORINDEX) == col) { (symm_vec_retval%VFORSUFFIX).odd = 0; }
+                #endif
+			} else {
+				//symm_vec_load_temp[index_i] = A[((row)+index_i)*(lda) + (col)];
+				symm_vec_retval%VFORSUFFIX = A[((row)+ %VFORINDEX )*(lda) + (col)];
+                #ifdef __HEMM__
+                //CONJUGATE(1, (symm_vec_load_temp[index_i]));
+                // Conjugate through a scalar temporary, then write it back into the component
+                {
+                    %TYPE SCALAR;
+
+                    SCALAR = symm_vec_retval%VFORSUFFIX;
+                    %CONJUGATE(1, SCALAR);
+                    symm_vec_retval%VFORSUFFIX = SCALAR;
+                }
+                #endif
+			}
+		} else {
+			//symm_vec_load_temp[index_i] = (%TYPE) 0;
+			// Outside the M x M matrix: contribute zero
+			symm_vec_retval%VFORSUFFIX = (%TYPE) 0;
+		}
+	}
+	//%VLOADWITHINCX(symm_vec_retval, symm_vec_load_temp, 1 );
+    //symm_vec_retval = *(__private %TYPE%V *)symm_vec_load_temp;
+	return symm_vec_retval;
+}
+\n";
diff --git a/src/library/blas/gens/clTemplates/syr.cl b/src/library/blas/gens/clTemplates/syr.cl
new file mode 100644
index 0000000..6593ab1
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/syr.cl
@@ -0,0 +1,474 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/***********************************************/
+//NOTE: THIS FILE IS NOT USED. SEE SYR_HER.CLT
+//      THIS FILE IS FOR LEGACY PURPOSES.
+
+//Column Major Lower
+static const char *syr_CL_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC 	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+
+// Column-Major Lower rank-1 update:
+//     A[r + c*lda] += alpha * X[r] * X[c]      for r >= c
+//
+// The lower triangle is cut into square tiles of TARGET_ROWS x TARGET_ROWS and
+// every work-group (identified by get_group_id(0)) updates exactly one tile.
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+//
+// Arguments:
+//   _A, offa, lda : matrix base pointer, element offset added to it, and the
+//                   stride between consecutive columns
+//   _X, offx, incx: vector base pointer, element offset and stride; for a
+//                   negative incx the base is moved by -(N-1)*incx so that
+//                   X[k*incx] stays inside the vector
+//   N             : order of the matrix / number of vector elements
+//   alpha         : scalar multiplier
+__kernel void %PREFIXsyr_CL_kernel( __global %TYPE* _A, __global const %TYPE* _X, int N, int offx, int incx, int offa, int lda, %PTYPE alpha )
+{
+	// X is only ever read, so it keeps the const qualifier of _X
+	__global const %TYPE* X;
+	__global %TYPE *A;
+	__local %TYPE xShared[%TARGET_ROWS];
+	__local %TYPE yShared[%TARGET_ROWS];
+
+	A = _A + offa;
+	if ( incx < 0 ) // Goto end of vector
+	{
+		X = _X + offx - ( N - 1) * incx;
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+	int nBlocks  = ((N - 1) / %TARGET_ROWS) + 1;
+
+	__local int iShared;
+	__local int jShared;
+
+	// Get (i,j) of Block
+	// Thread 0 converts the linear tile number into (row tile i, column tile j)
+	// and publishes the result through local memory.
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+	// First row / first column covered by this tile
+	int ref_x = i * %TARGET_ROWS;
+	int ref_y = j * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+		int ncols = ((ref_y + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_y);
+		int nrows = ((ref_x + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_x);
+		// Number of elements in the lower triangle of the tile, diagonal included
+		int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+
+		for(int e = threadID; e < nElements; e += get_local_size(0))
+		{
+			// Recover (r, c) from the packed triangular index e:
+			// local row r owns the indices r*(r+1)/2 .. r*(r+1)/2 + r
+			int r = -1, c = -1;
+			for(int k = 1; (k <= %TARGET_ROWS); k ++)
+			{
+				int temp = ((k - 1) * k) >> 1;
+				r = ((e >= temp) && (e <= (temp + k - 1))) ? k - 1 : r;
+			}
+			c = e - (((r + 1) * r) >> 1);
+
+			r = ref_x + r;
+			c = ref_y + c;
+
+			%TYPE res;
+			res = alpha * X[c * incx];
+			res = res * X[r * incx];
+			A[r + c * lda] += res;
+		}
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		// (row-side slice of X, clipped at N)
+		for( int idx = (ref_x + threadID); idx < N; idx += get_local_size(0))
+		{
+			xShared[idx - ref_x] = X[ idx * incx];
+		}
+
+		// Populating yShared: Always fits well..
+		// (column-side slice of X)
+		for( int idx = (ref_y  + threadID); (idx - ref_y) < %TARGET_ROWS; idx += get_local_size(0))
+		{
+			%TYPE loadedX = X[ idx * incx];
+			yShared[(idx - ref_y) ] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		// Each thread owns one vector of consecutive rows in one column per pass.
+		// The mask equals a modulo only when TARGET_ROWS_BY_VEC is a power of two.
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+
+		int startRow = ref_x + rowShift;
+		%TYPE%V  loadedX;
+
+		// Vector read of xShared only when the whole vector lies below N;
+		// loadedX is consumed solely on the vector path below, which has the same bound
+		if ( startRow  < (N - (%V - 1)) )
+		{
+			loadedX=  *((__local %TYPE%V*)( xShared + rowShift));
+		}
+
+		for( int pass = 0; pass < nLoops; pass++)
+		{
+			int startCol = ref_y + colShift + pass * TARGET_WIDTH;
+
+			if ( ( startRow  < N ) && ( startCol  < (ref_y + %TARGET_ROWS ) ) )// threads that fall into target region
+			{
+				if(( startRow + %V) > N )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValue = yShared[ startCol - ref_y];
+
+					for(int row = startRow; row < N; row++)
+					{
+						%TYPE xValue = xShared[ row - ref_x];
+						%TYPE res1, res2;
+						res1 = alpha * yValue;
+						%MUL( res2, res1,  xValue);
+						A[ row + startCol * lda] += res2;
+					}
+				}
+				else
+				{
+					loadedA  	= %VLOAD( 0, (&A[ startRow + startCol * lda]));
+
+					%TYPE 	 loadedY= yShared[ startCol - ref_y];
+					%TYPE 	 res;
+					res =  loadedY * alpha;
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+					%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared (row-side slice of X)
+		for( int idx = (ref_x + threadID); (idx - ref_x) < %TARGET_ROWS; idx += get_local_size(0))
+		{
+			xShared[idx - ref_x] = X[ idx * incx];
+		}
+
+		// Populating yShared (column-side slice of X)
+		for( int idx = (ref_y + threadID); (idx - ref_y) < %TARGET_ROWS; idx += get_local_size(0))
+		{
+			%TYPE loadedX = X[ idx * incx];
+			yShared[idx - ref_y] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		// The mask equals a modulo only when TARGET_ROWS_BY_VEC is a power of two.
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+		int startRow = ref_x + rowShift;
+		int startCol = ref_y + colShift;
+		%TYPE%V  loadedX;
+
+		if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX	 =  *((__local %TYPE%V*)( xShared + rowShift));
+		}
+
+		for( int pass = 0; pass < nLoops; pass++)
+		{
+			startCol = ref_y + colShift + pass * TARGET_WIDTH;
+
+			if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  	= %VLOAD( 0, (&A[ startRow + startCol * lda]));
+				%TYPE 	 loadedY= yShared[ startCol - ref_y];
+				%TYPE 	 res;
+				res =  loadedY * alpha;
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+				%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+			}
+		}
+	}
+}
+\n";
+
+// Column-Major Upper
+static const char *syr_CU_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC   	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH     		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT        	(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+
+// Column-Major Upper rank-1 update:
+//     A[r + c*lda] += alpha * X[r] * X[c]      for r <= c
+//
+// The upper triangle is cut into square tiles of TARGET_ROWS x TARGET_ROWS that
+// are anchored at element (N-1, N-1) and counted towards row/column 0, so the
+// partially filled tiles are the ones touching row 0. Every work-group
+// (identified by get_group_id(0)) updates exactly one tile.
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+//
+// Arguments:
+//   _A, offa, lda : matrix base pointer, element offset added to it, and the
+//                   stride between consecutive columns
+//   _X, offx, incx: vector base pointer, element offset and stride; for a
+//                   negative incx the base is moved by -(N-1)*incx so that
+//                   X[k*incx] stays inside the vector
+//   N             : order of the matrix / number of vector elements
+//   alpha         : scalar multiplier
+__kernel void %PREFIXsyr_CU_kernel( __global %TYPE* _A, __global const %TYPE* _X, int N, int offx, int incx, int offa, int lda, %PTYPE alpha )
+{
+	// X is only ever read, so it keeps the const qualifier of _X
+	__global const %TYPE* X;
+	__global %TYPE *A;
+
+	__local %TYPE xShared[%TARGET_ROWS];
+	__local %TYPE yShared[%TARGET_ROWS];
+
+	A = _A + offa;
+	if ( incx < 0 ) // Goto end of vector
+	{
+		X = _X + offx - ( N - 1) * incx;
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+	int nBlocks  = ((N - 1) / %TARGET_ROWS) + 1;
+
+	__local int iShared;
+	__local int jShared;
+
+	// Get (i,j) of Block
+	// Thread 0 converts the linear tile number into (row tile i, column tile j)
+	// and publishes the result through local memory.
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+	// Last row / last column covered by this tile (tiles grow towards index 0)
+	int ref_x = (N- 1) - i * %TARGET_ROWS;
+	int ref_y = (N- 1) - j * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+		int ncols = ((ref_y - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_y+1);
+		int nrows = ((ref_x - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_x+1);
+		// Number of elements in the upper triangle of the tile, diagonal included
+		int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+		// From here on nrows / ncols hold the offset from the first row / column
+		// of the tile to ref_x / ref_y
+		nrows -= 1;
+		ncols -= 1;
+		for(int e = threadID; e < nElements; e += get_local_size(0))
+		{
+			// Recover (r, c) from the packed triangular index e:
+			// local column c owns the indices c*(c+1)/2 .. c*(c+1)/2 + c
+			int r, c = -1;
+			for(int k = 1; (k <= %TARGET_ROWS); k ++)
+			{
+				int temp = ((k - 1) * k) >> 1;
+				c = ((e >= temp) && (e <= (temp + k - 1))) ? k - 1 : c;
+			}
+			r = e - (((c + 1) * c) >> 1);
+
+			r = ref_x - (nrows) + r;
+			c = ref_y - (ncols) + c;
+
+			%TYPE res;
+			res = alpha * X[c * incx];
+			res = res * X[r * incx];
+			A[r + c * lda] += res;
+		}
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		// The slot is derived from the element index (ref_x - idx) rather than
+		// from threadID, so it stays correct when a thread runs this loop
+		// more than once.
+		for( int idx = (ref_x - threadID); idx >= 0; idx -= get_local_size(0))
+		{
+			xShared[(%TARGET_ROWS - 1) -(ref_x - idx)] = X[ idx * incx];
+		}
+
+		// Populating yShared: Always fits well..
+		// (column-side slice of X, stored in descending index order)
+		for( int idx = (ref_y - threadID); (ref_y - idx) < %TARGET_ROWS; idx -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ idx * incx];
+			yShared[(ref_y - idx)] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		// Each thread owns one vector of consecutive rows in one column per pass.
+		// The mask equals a modulo only when TARGET_ROWS_BY_VEC is a power of two.
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+		int startRow = ref_x - rowShift;
+		%TYPE%V  loadedX;
+
+		// Vector read of xShared only when the whole vector starts at row >= 0;
+		// loadedX is consumed solely on the vector path below, which has the same bound
+		if ( startRow  >= 0 )
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1) - rowShift]));
+		}
+
+		for( int pass = 0; pass < nLoops; pass++)
+		{
+			int startCol = ref_y - colShift - pass * TARGET_WIDTH;
+
+			// threads that fall into target region
+			if( ( startRow  > -(%V) ) && (startCol > (ref_y - %TARGET_ROWS)) )
+			{
+				if( startRow  < 0 )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValue = yShared[ ref_y - startCol];
+
+					for(int row = startRow + (%V - 1); row >= 0; row--)
+					{
+						%TYPE xValue = xShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+						%TYPE res1, res2;
+						res1 = alpha * yValue;
+						%MUL( res2, res1,  xValue);
+						A[ row + startCol * lda] += res2;
+					}
+				}
+				else
+				{
+					loadedA  = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+
+					%TYPE 	 loadedY= yShared[ ref_y - startCol];
+					%TYPE 	 res;
+					res =  loadedY * alpha;
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+					%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+		%TYPE%V loadedA;
+
+		// Populating xShared (row-side slice of X, ascending row order)
+		for( int idx = (ref_x - threadID); ((ref_x - idx) < %TARGET_ROWS); idx -= get_local_size(0))
+		{
+			xShared[ (%TARGET_ROWS - 1) - (ref_x - idx)] = X[ idx * incx];
+		}
+
+		// Populating yShared (column-side slice of X, descending index order)
+		for( int idx = (ref_y - threadID); (ref_y - idx) < %TARGET_ROWS; idx -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ idx * incx];
+			yShared[(ref_y - idx)] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		// The mask equals a modulo only when TARGET_ROWS_BY_VEC is a power of two.
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+
+		int startRow = ref_x - rowShift;
+		int startCol = ref_y - colShift;
+		%TYPE%V  loadedX;
+		// Not all threads should do this..
+		// Depends on whether blocksize width is > target_rows
+		if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1)- rowShift]));
+		}
+
+		for( int pass = 0; pass < nLoops; pass++)
+		{
+			startCol = ref_y - colShift - pass * TARGET_WIDTH;
+
+			if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+				%TYPE  loadedY = yShared[ ref_y - startCol];
+				%TYPE  res;
+				res = loadedY * alpha;
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+				%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+			}
+		}
+
+	}
+}
+\n";
+
diff --git a/src/library/blas/gens/clTemplates/syr2.cl b/src/library/blas/gens/clTemplates/syr2.cl
new file mode 100644
index 0000000..0846b2d
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/syr2.cl
@@ -0,0 +1,1209 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/************************************************/
+//NOTE: THIS FILE IS NOT USED. SEE SYR2_HER2.CLT
+//      THIS FILE IS FOR LEGACY PURPOSES.
+
+//Column Major Lower
+static const char *syr2_CL_kernel = "
+
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+	#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC 	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+// Column-Major Lower rank-2 update:
+//     A[r + c*lda] += alpha * X[r] * Y[c] + alpha * Y[r] * X[c]      for r >= c
+//
+// The lower triangle is cut into square tiles of TARGET_ROWS x TARGET_ROWS and
+// every work-group (identified by get_group_id(0)) updates exactly one tile.
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+//
+// Arguments:
+//   _A, offa, lda : matrix base pointer, element offset added to it, and the
+//                   stride between consecutive columns. A is written, so the
+//                   pointer is not const-qualified.
+//   _X, offx, incx: first vector: base pointer, element offset and stride
+//   _Y, offy, incy: second vector: base pointer, element offset and stride
+//                   (for a negative stride the base is moved by -(N-1)*stride
+//                   so that V[k*stride] stays inside the vector)
+//   N             : order of the matrix / number of vector elements
+//   alpha         : scalar multiplier
+__kernel void %PREFIXsyr2_CL_kernel( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+{
+
+	__global const %TYPE* X;
+	__global const %TYPE* Y;
+	__global %TYPE* A;
+
+	// xShared / yShared hold the row-side slices of X and Y,
+	// xSharedConj / ySharedConj hold the column-side slices.
+	__local %TYPE xShared[%TARGET_ROWS];
+	__local %TYPE yShared[%TARGET_ROWS];
+	__local %TYPE xSharedConj[%TARGET_ROWS];
+	__local %TYPE ySharedConj[%TARGET_ROWS];
+
+	int nBlocks = ((N - 1) / %TARGET_ROWS) + 1;
+
+	A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		X	 = _X + offx - ( N - 1) * incx;
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	if ( incy < 0 ) // Goto end of vector
+	{
+		Y	 = _Y + offy - ( N - 1) * incy;
+	}
+	else
+	{
+		Y = _Y + offy;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+
+	__local int iShared;
+	__local int jShared;
+
+	// Get (i,j) of Block
+	// Thread 0 converts the linear tile number into (row tile i, column tile j)
+	// and publishes the result through local memory.
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+
+	// First row / first column covered by this tile
+	int ref_x = i * %TARGET_ROWS;
+	int ref_y = j * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+		int ncols = ((ref_y + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_y);
+		int nrows = ((ref_x + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_x);
+		// Number of elements in the lower triangle of the tile, diagonal included
+		int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+
+		for(int e = threadID; e < nElements; e += get_local_size(0))
+		{
+			// Recover (r, c) from the packed triangular index e:
+			// local row r owns the indices r*(r+1)/2 .. r*(r+1)/2 + r
+			int r = -1, c = -1;
+			for(int k = 1; (k <= %TARGET_ROWS); k ++)
+			{
+				int temp = ((k - 1) * k) >> 1;
+				r = ((e >= temp) && (e <= (temp + k - 1))) ? k - 1 : r;
+			}
+			c = e - (((r + 1) * r) >> 1);
+
+			r = ref_x + r;
+			c = ref_y + c;
+
+			// Y must be strided by incy (not incx); the two strides are
+			// independent kernel arguments.
+			%TYPE res1, res2;
+			res1 = alpha * X[c * incx];
+			res2 = alpha * X[r * incx];
+			res1 = res1 * Y[r * incy];
+			res2 = res2 * Y[c * incy];
+
+			A[r + c * lda] += (res1 + res2);
+		}
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared / yShared (row side): May not fit into target region
+		for( int idx = (ref_x + threadID); idx < N; idx += get_local_size(0))
+		{
+			xShared[idx - ref_x] = X[ idx * incx];
+			yShared[idx - ref_x] = Y[ idx * incy];
+		}
+
+		// Populating xSharedConj / ySharedConj (column side): Always fits well..
+		for( int idx = (ref_y  + threadID); (idx - ref_y) < %TARGET_ROWS; idx += get_local_size(0))
+		{
+			%TYPE loadedX = X[ idx * incx];
+			%TYPE loadedY = Y[ idx * incy];
+			xSharedConj[(idx - ref_y) ] = loadedX;
+			ySharedConj[(idx - ref_y) ] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		// Each thread owns one vector of consecutive rows in one column per pass.
+		// The mask equals a modulo only when TARGET_ROWS_BY_VEC is a power of two.
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+
+		int startRow = ref_x + rowShift;
+		%TYPE%V  loadedX, loadedY;
+
+		// Vector reads only when the whole vector lies below N; loadedX / loadedY
+		// are consumed solely on the vector path below, which has the same bound
+		if ( startRow  < (N - (%V - 1)) )
+		{
+			loadedX=  *((__local %TYPE%V*)( xShared + rowShift));
+			loadedY=  *((__local %TYPE%V*)( yShared + rowShift));
+		}
+
+		for( int pass = 0; pass < nLoops; pass++)
+		{
+			int startCol = ref_y + colShift + pass * TARGET_WIDTH;
+
+			if ( ( startRow  < N ) && ( startCol  < (ref_y + %TARGET_ROWS ) ) )// threads that fall into target region
+			{
+				if(( startRow + %V) > N )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValueConj = ySharedConj[ startCol - ref_y];
+					%TYPE xValueConj = xSharedConj[ startCol - ref_y];
+
+					for(int row = startRow; row < N; row++)
+					{
+						%TYPE xValue = xShared[ row - ref_x];
+						%TYPE yValue = yShared[ row - ref_x];
+
+						%TYPE res1, res2;
+						// X * Y(H)
+						%MUL(res1, alpha, yValueConj);
+						%MUL( res2, res1,  xValue);
+
+						// Y * X(H)
+						%MUL(res1, alpha, xValueConj);
+						%MAD( res2, res1,  yValue);
+						A[ row + startCol * lda] += res2;
+					}
+				}
+				else
+				{
+					loadedA  	= %VLOAD( 0, (&A[ startRow + startCol * lda]));
+
+					%TYPE 	 loadedYConj = ySharedConj[ startCol - ref_y];
+					%TYPE 	 loadedXConj = xSharedConj[ startCol - ref_y];
+					%TYPE 	 res;
+
+					// X * Y(H)
+					%MUL(res, loadedYConj, alpha);
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+
+					// Y * X(H)
+					%MUL(res, loadedXConj, alpha);
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedY, resVec);
+
+					%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared / yShared (row side)
+		for( int idx = (ref_x + threadID); (idx - ref_x) < %TARGET_ROWS; idx += get_local_size(0))
+		{
+			xShared[idx - ref_x] = X[ idx * incx];
+			yShared[idx - ref_x] = Y[ idx * incy];
+		}
+
+		// Populating xSharedConj / ySharedConj (column side)
+		for( int idx = (ref_y + threadID); (idx - ref_y) < %TARGET_ROWS; idx += get_local_size(0))
+		{
+			%TYPE loadedX = X[ idx * incx];
+			%TYPE loadedY = Y[ idx * incy];
+			xSharedConj[idx - ref_y] = loadedX;
+			ySharedConj[idx - ref_y] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		// The mask equals a modulo only when TARGET_ROWS_BY_VEC is a power of two.
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+		int startRow = ref_x + rowShift;
+		int startCol = ref_y + colShift;
+		%TYPE%V  loadedX, loadedY;
+
+		if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX	 =  *((__local %TYPE%V*)( xShared + rowShift));
+			loadedY	 =  *((__local %TYPE%V*)( yShared + rowShift));
+		}
+
+		for( int pass = 0; pass < nLoops; pass++)
+		{
+			startCol = ref_y + colShift + pass * TARGET_WIDTH;
+
+			if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  	 = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+				%TYPE 	 loadedYConj = ySharedConj[ startCol - ref_y];
+				%TYPE 	 loadedXConj = xSharedConj[ startCol - ref_y];
+
+				// X * Y(H)
+				%TYPE 	 res;
+				%MUL(res, loadedYConj, alpha);
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+
+				// Y * X(H)
+				%MUL(res, loadedXConj, alpha);
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedY, resVec);
+				%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+			}
+		}
+	}
+
+}
+\n";
+
+//Column Major Upper
+static const char *syr2_CU_kernel = "
+
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC  (%TARGET_ROWS / %V)
+#define TARGET_WIDTH        (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT       (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+// Column-Major Upper rank-2 update:
+//     A[r + c*lda] += alpha * X[r] * Y[c] + alpha * Y[r] * X[c]      for r <= c
+//
+// The upper triangle is cut into square tiles of TARGET_ROWS x TARGET_ROWS that
+// are anchored at element (N-1, N-1) and counted towards row/column 0, so the
+// partially filled tiles are the ones touching row 0. Every work-group
+// (identified by get_group_id(0)) updates exactly one tile.
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+//
+// Arguments:
+//   _A, offa, lda : matrix base pointer, element offset added to it, and the
+//                   stride between consecutive columns. A is written, so the
+//                   pointer is not const-qualified.
+//   _X, offx, incx: first vector: base pointer, element offset and stride
+//   _Y, offy, incy: second vector: base pointer, element offset and stride
+//                   (for a negative stride the base is moved by -(N-1)*stride
+//                   so that V[k*stride] stays inside the vector)
+//   N             : order of the matrix / number of vector elements
+//   alpha         : scalar multiplier
+__kernel void %PREFIXsyr2_CU_kernel( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+{
+
+	__global const %TYPE* X;
+	__global const %TYPE* Y;
+	__global %TYPE* A;
+
+	// xShared / yShared hold the row-side slices of X and Y,
+	// xSharedConj / ySharedConj hold the column-side slices.
+	__local %TYPE xShared[%TARGET_ROWS];
+	__local %TYPE yShared[%TARGET_ROWS];
+	__local %TYPE xSharedConj[%TARGET_ROWS];
+	__local %TYPE ySharedConj[%TARGET_ROWS];
+
+	int nBlocks = ((N - 1) / %TARGET_ROWS) + 1;
+
+	A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		X    = _X + offx - ( N - 1) * incx;
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	if ( incy < 0 ) // Goto end of vector
+	{
+		Y    = _Y + offy - ( N - 1) * incy;
+	}
+	else
+	{
+		Y = _Y + offy;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+
+	__local int iShared;
+	__local int jShared;
+
+	// Get (i,j) of Block
+	// Thread 0 converts the linear tile number into (row tile i, column tile j)
+	// and publishes the result through local memory.
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+	// Last row / last column covered by this tile (tiles grow towards index 0)
+	int ref_x = (N- 1) - i * %TARGET_ROWS;
+	int ref_y = (N- 1) - j * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+		int ncols = ((ref_y - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_y+1);
+		int nrows = ((ref_x - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_x+1);
+		// Number of elements in the upper triangle of the tile, diagonal included
+		int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+		// From here on nrows / ncols hold the offset from the first row / column
+		// of the tile to ref_x / ref_y
+		nrows -= 1;
+		ncols -= 1;
+		for(int e = threadID; e < nElements; e += get_local_size(0))
+		{
+			// Recover (r, c) from the packed triangular index e:
+			// local column c owns the indices c*(c+1)/2 .. c*(c+1)/2 + c
+			int r, c = -1;
+			for(int k = 1; (k <= %TARGET_ROWS); k ++)
+			{
+				int temp = ((k - 1) * k) >> 1;
+				c = ((e >= temp) && (e <= (temp + k - 1))) ? k - 1 : c;
+			}
+			r = e - (((c + 1) * c) >> 1);
+
+			r = ref_x - (nrows) + r;
+			c = ref_y - (ncols) + c;
+
+			%TYPE res1, res2;
+			res1 = alpha * X[c * incx];
+			res2 = alpha * X[r * incx];
+			res1 = res1 * Y[r * incy];
+			res2 = res2 * Y[c * incy];
+			A[r + c * lda] += (res1 + res2);
+		}
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared / yShared (row side): May not fit into target region
+		// The slot is derived from the element index (ref_x - idx) rather than
+		// from threadID: indexing by threadID would write every pass of this
+		// loop into the same slot when a thread iterates more than once.
+		for( int idx = (ref_x - threadID); idx >= 0; idx -= get_local_size(0))
+		{
+			xShared[(%TARGET_ROWS - 1) - (ref_x - idx)] = X[ idx * incx];
+			yShared[(%TARGET_ROWS - 1) - (ref_x - idx)] = Y[ idx * incy];
+		}
+
+		// Populating xSharedConj / ySharedConj (column side): Always fits well..
+		for( int idx = (ref_y - threadID); (ref_y - idx) < %TARGET_ROWS; idx -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ idx * incx];
+			%TYPE loadedY = Y[ idx * incy];
+			xSharedConj[(ref_y - idx)] = loadedX;
+			ySharedConj[(ref_y - idx)] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		// Each thread owns one vector of consecutive rows in one column per pass.
+		// The mask equals a modulo only when TARGET_ROWS_BY_VEC is a power of two.
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+		int startRow = ref_x - rowShift;
+		%TYPE%V  loadedX, loadedY;
+
+		// Vector reads only when the whole vector starts at row >= 0; loadedX /
+		// loadedY are consumed solely on the vector path below, which has the same bound
+		if ( startRow  >= 0 )
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1) - rowShift]));
+			loadedY=  *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1) - rowShift]));
+		}
+
+		for( int pass = 0; pass < nLoops; pass++)
+		{
+			int startCol = ref_y - colShift - pass * TARGET_WIDTH;
+
+			// threads that fall into target region
+			if( ( startRow  > -(%V) ) && (startCol > (ref_y - %TARGET_ROWS)) )
+			{
+				if( startRow  < 0 )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValueConj = ySharedConj[ ref_y - startCol];
+					%TYPE xValueConj = xSharedConj[ ref_y - startCol];
+
+					for(int row = startRow + (%V - 1); row >= 0; row--)
+					{
+						%TYPE xValue = xShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+						%TYPE yValue = yShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+
+						%TYPE res1, res2;
+
+						// X * Y(H)
+						%MUL(res1, alpha, yValueConj);
+						%MUL( res2, res1,  xValue);
+
+						// Y * X(H)
+						%MUL(res1, alpha, xValueConj);
+						%MAD( res2, res1,  yValue);
+						A[ row + startCol * lda] += res2;
+					}
+				}
+				else
+				{
+					loadedA  = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+
+					%TYPE 	 loadedXConj = xSharedConj[ ref_y - startCol];
+					%TYPE 	 loadedYConj = ySharedConj[ ref_y - startCol];
+					%TYPE 	 res;
+					// X * Y(H)
+					%MUL(res, loadedYConj, alpha);
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+
+					// Y * X(H)
+					%MUL(res, loadedXConj, alpha);
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedY, resVec);
+					%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+		%TYPE%V loadedA;
+
+		// Populating xShared / yShared (row side, ascending row order)
+		for( int idx = (ref_x - threadID); ((ref_x - idx) < %TARGET_ROWS); idx -= get_local_size(0))
+		{
+			xShared[ (%TARGET_ROWS - 1) - (ref_x - idx)] = X[ idx * incx];
+			yShared[ (%TARGET_ROWS - 1) - (ref_x - idx)] = Y[ idx * incy];
+		}
+
+		// Populating xSharedConj / ySharedConj (column side, descending index order)
+		for( int idx = (ref_y - threadID); (ref_y - idx) < %TARGET_ROWS; idx -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ idx * incx];
+			%TYPE loadedY = Y[ idx * incy];
+			xSharedConj[(ref_y - idx)] = loadedX;
+			ySharedConj[(ref_y - idx)] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		// The mask equals a modulo only when TARGET_ROWS_BY_VEC is a power of two.
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+
+		int startRow = ref_x - rowShift;
+		int startCol = ref_y - colShift;
+		%TYPE%V  loadedX, loadedY;
+		// Not all threads should do this..
+		// Depends on whether blocksize width is > target_rows
+		if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1)- rowShift]));
+			loadedY=  *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1)- rowShift]));
+		}
+
+		for( int pass = 0; pass < nLoops; pass++)
+		{
+			startCol = ref_y - colShift - pass * TARGET_WIDTH;
+
+			if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  		 	 = %VLOAD( 0, (&A[ startRow + startCol * lda]));
+				%TYPE 	 loadedYConj = ySharedConj[ ref_y - startCol];
+				%TYPE 	 loadedXConj = xSharedConj[ ref_y - startCol];
+				%TYPE 	 res;
+				// X * Y(H)
+				%MUL(res, loadedYConj, alpha);
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+
+				// Y * X(H)
+				%MUL(res, loadedXConj, alpha);
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedY, resVec);
+				%VSTORE(  loadedA, 0, (&A[ startRow + startCol * lda]));
+			}
+		}
+
+	}
+}
+";
+
+/*
+//Row Major Lower
+static const char *syr2_RL_kernel = "
+
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC  (%TARGET_ROWS / %V)
+#define TARGET_WIDTH        (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT       (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+//
+//nBlocks = 4
+//
+//Blocks:	9
+//	7 8
+//	4 5 6
+//	0 1 2 3
+//
+
+// Row-Major Lower
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+__kernel void %PREFIXsyr2_RL_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+{
+
+    __global const %TYPE* X;
+    __global const %TYPE* Y;
+    __global %TYPE* A;
+
+    __local %TYPE xShared[%TARGET_ROWS];
+    __local %TYPE yShared[%TARGET_ROWS];
+    __local %TYPE xSharedConj[%TARGET_ROWS];
+    __local %TYPE ySharedConj[%TARGET_ROWS];
+
+    int nBlocks = ((N - 1) / %TARGET_ROWS) + 1;
+
+    A = _A + offa;
+
+    if ( incx < 0 ) // Goto end of vector
+    {
+        X    = _X + offx - ( N - 1) * incx;
+    }
+    else
+    {
+        X = _X + offx;
+    }
+
+    if ( incy < 0 ) // Goto end of vector
+    {
+        Y    = _Y + offy - ( N - 1) * incy;
+    }
+    else
+    {
+        Y = _Y + offy;
+    }
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+
+	__local int iShared;
+	__local int jShared;
+
+	// Get (i,j) of Block
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+
+	int ref_x = ((nBlocks - 1) * %TARGET_ROWS) - (j * %TARGET_ROWS);
+	int ref_y = (i -j) * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( ref_x == ref_y )
+	{
+		// Need only xShared, not using yShared
+		for( int i = (ref_x + threadID); (i < N && (i - ref_x) < %TARGET_ROWS); i += get_local_size(0))
+		{
+			xShared[(i - ref_x)] = X[ i * incx];
+			yShared[(i - ref_x)] = Y[ i * incy];
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		int rowShift = threadID / %TARGET_ROWS;
+		int colShift = (threadID & ( %TARGET_ROWS - 1));
+
+		int target_height = %BLOCKSIZE / %TARGET_ROWS;
+
+		int nLoops = ((%TARGET_ROWS - 1)/ target_height)  + 1;
+		int startRow = ref_x + rowShift;
+
+		%TYPE yValue, xValue, yValueRead, xValueRead, yValueConj, xValueConj;
+
+		// startCol remains constant on looping
+		// Therefore, following code is based on startCol
+		int startCol = ref_y + colShift;
+
+		bool readXY		  = false;
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			startRow = ref_x + rowShift + ( i - 1 ) * target_height;
+
+			bool activeThread =  ( startCol < N) && ( startRow < N) && ((startRow -  ref_x) < %TARGET_ROWS);
+
+			if (activeThread)
+			{
+				// Avoid reading Y again if already read using readXY
+				if (( startRow >= startCol) && (!readXY))
+				{
+					yValueRead = yShared[ startCol - ref_y];
+					xValueRead = xShared[ startCol - ref_y];
+					readXY 	   = true;
+				}
+
+				if ( startRow > startCol )
+				{
+					%TYPE res1, res2;
+					yValueConj = yValueRead;
+					xValueConj = xValueRead;
+
+					xValue = xShared[ startRow - ref_x ];
+					yValue = yShared[ startRow - ref_x ];
+
+					// X * Y(H)
+					%MUL(res1, alpha, yValueConj);
+					%MUL( res2, res1,  xValue);
+
+					// Y * X(H)
+					%MUL(res1, alpha, xValueConj);
+					%MAD( res2, res1, yValue);
+					A[ startRow * lda + startCol] += res2;
+				}
+				else if ( startRow == startCol) // Diagonal
+				{
+					yValueConj = yValueRead;
+					xValueConj = xValueRead;
+					yValue 	   = yValueRead;
+					xValue     = xValueRead;
+
+					%TYPE res1, res2;
+					// X * Y(H)
+					%MUL(res1, alpha, yValueConj);
+					%MUL( res2, res1,  xValue);
+
+					// Y * X(H)
+					%MUL(res1, alpha, xValueConj);
+					%MAD( res2, res1, yValue);
+
+					// Discard the imaginary component of A
+					%ADD(A[ startRow * lda + startCol], A[ startRow * lda + startCol], res2);
+				}
+			}
+		}
+	}
+	else if ( ref_x == ((nBlocks - 1) * %TARGET_ROWS)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		for( int i = (ref_x + threadID); i < N; i += get_local_size(0))
+		{
+			xShared[i - ref_x] = X[ i * incx];
+			yShared[i - ref_x] = Y[ i * incy];
+		}
+
+		// Populating yShared: Always fits well..
+		for( int i = (ref_y  + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			xSharedConj[(i - ref_y) ] = loadedX;
+			ySharedConj[(i - ref_y) ] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_HEIGHT)  + 1;
+
+		int rowShift = threadID / TARGET_ROWS_BY_VEC;
+		int colShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+
+		int startRow = ref_x + rowShift;
+		int startCol = ref_y + colShift; // Remains fixed
+		%TYPE%V  loadedYConj, loadedXConj;
+
+		if ( startRow  < N )
+		{
+			loadedYConj =  *((__local %TYPE%V*)( ySharedConj + colShift));
+			loadedXConj =  *((__local %TYPE%V*)( xSharedConj + colShift));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startRow = ref_x + rowShift + ( i - 1 ) * TARGET_HEIGHT;
+
+			if (  startRow  < N  )// threads that fall into target region
+			{
+					loadedA  	= %VLOAD( 0, (&A[ startRow * lda + startCol]));
+
+					%TYPE 	 loadedX= xShared[ startRow - ref_x];
+					%TYPE 	 loadedY= yShared[ startRow - ref_x];
+					%TYPE 	 res;
+					// X * Y(H)
+					%MUL(res, loadedX, alpha);
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedYConj, resVec);
+
+					// Y * X(H)
+					%MUL(res, loadedY, alpha);
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedXConj, resVec);
+					%VSTORE(  loadedA, 0, (&A[ startRow * lda + startCol]));
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared
+		for( int i = (ref_x + threadID); (i - ref_x) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			xShared[i - ref_x] = X[ i * incx];
+			yShared[i - ref_x] = Y[ i * incy];
+		}
+
+		// Populating yShared
+		for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			xSharedConj[i - ref_y] = loadedX;
+			ySharedConj[i - ref_y] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_HEIGHT)  + 1;
+
+		int rowShift = threadID / TARGET_ROWS_BY_VEC;
+		int colShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+		int startRow = ref_x + rowShift;
+		int startCol = ref_y + colShift;
+		%TYPE%V  loadedYConj, loadedXConj;
+
+		if ( startRow < ( ref_x + %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedYConj	 =  *((__local %TYPE%V*)( ySharedConj + colShift));
+			loadedXConj	 =  *((__local %TYPE%V*)( xSharedConj + colShift));
+		}
+
+		//#pragma unroll
+		for( int i= 1; i <= nLoops; i++)
+		{
+			startRow = ref_x + rowShift + ( i - 1 ) * TARGET_HEIGHT;
+
+			if ( startRow < ( ref_x + %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  	= %VLOAD( 0, (&A[ startRow * lda + startCol]));
+
+				%TYPE 	 loadedX = xShared[ startRow - ref_x];
+				%TYPE 	 loadedY = yShared[ startRow - ref_x];
+
+				%TYPE 	 res;
+				// X * Y(H)
+				%MUL(res, loadedX, alpha);
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedYConj, resVec);
+
+				// Y * X(H)
+				%MUL(res, loadedY, alpha);
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedXConj, resVec);
+
+				%VSTORE(  loadedA, 0, (&A[ startRow * lda + startCol]));
+			}
+		}
+	}
+}
+\n";
+
+//Row Major Upper
+static const char *syr2_RU_kernel = "
+
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#define TARGET_ROWS_BY_VEC  (%TARGET_ROWS / %V)
+#define TARGET_WIDTH        (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT       (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+//
+//nBlocks = 4
+//
+//Blocks:	9
+//
+//	3 2 1 0
+//	  6 5 4
+//	    7 8
+//		  9
+//
+//	-----------
+//	|<-threadID|
+//	| 3 2 1 0  |
+//	----------(ref_x, ref_y)
+//
+
+// Row-Major Upper
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+__kernel void %PREFIXher2_RU_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+{
+
+    __global const %TYPE* X;
+    __global const %TYPE* Y;
+    __global %TYPE* A;
+
+    __local %TYPE xShared[%TARGET_ROWS];
+    __local %TYPE yShared[%TARGET_ROWS];
+    __local %TYPE xSharedConj[%TARGET_ROWS];
+    __local %TYPE ySharedConj[%TARGET_ROWS];
+
+    int nBlocks = ((N - 1) / %TARGET_ROWS) + 1;
+
+    A = _A + offa;
+
+    if ( incx < 0 ) // Goto end of vector
+    {
+        X    = _X + offx - ( N - 1) * incx;
+    }
+    else
+    {
+        X = _X + offx;
+    }
+
+    if ( incy < 0 ) // Goto end of vector
+    {
+        Y    = _Y + offy - ( N - 1) * incy;
+    }
+    else
+    {
+        Y = _Y + offy;
+    }
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+
+	__local int iShared;
+	__local int jShared;
+
+	// Get (i,j) of Block
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+
+	int ref_x = (N - 1) - ((nBlocks - 1 - j) * %TARGET_ROWS);
+	int ref_y = (N - 1) - (i -j) * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( ref_x == ref_y )
+	{
+		// Need only xShared, not using yShared
+		// Can use ref_x or ref_y
+		for( int i =  (ref_x - threadID); (i >= 0 && ((ref_x - i) < %TARGET_ROWS)); i -= get_local_size(0))
+		{
+			xShared[ (ref_x - i)] = X[ i * incx];
+			yShared[ (ref_x - i)] = Y[ i * incy];
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		int rowShift = threadID / %TARGET_ROWS;
+		int colShift = (threadID & ( %TARGET_ROWS - 1));
+
+		int target_height = %BLOCKSIZE / %TARGET_ROWS;
+
+		int nLoops = ((%TARGET_ROWS - 1)/ target_height)  + 1;
+		int startRow = ref_x - rowShift;
+
+		%TYPE yValue, xValue, yValueRead, xValueRead, xValueConj, yValueConj;
+
+		int startCol = ref_y - colShift; // remains fixed on looping
+
+		bool readXY = false;
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			startRow = ref_x - rowShift - ( i - 1 ) * target_height;
+			bool activeThread = ( startRow >= 0) && ( startCol >= 0) && (( ref_x - startRow) < %TARGET_ROWS);
+
+			if ( activeThread)
+			{
+				// Avoid reading yValue again for threads that have it already while looping
+				if (( startCol >= startRow ) && (!readXY))
+				{
+					xValueRead = xShared[ ref_y - startCol];
+					yValueRead = yShared[ ref_y - startCol];
+					readXY = true;
+				}
+
+				if ( startCol > startRow )
+				{
+					%TYPE res1, res2;
+					yValueConj = yValueRead;
+					xValueConj = xValueRead;
+
+					xValue = xShared[ ref_x - startRow];
+					yValue = yShared[ ref_x - startRow];
+
+					// X * Y(H)
+					%MUL(res1, alpha, yValueConj);
+					%MUL(res2, res1, xValue);
+
+					// Y * X(H)
+					%MUL(res1, alpha, xValueConj);
+					%MAD( res2, res1,  yValue);
+
+					A[ startRow * lda + startCol] += res2;
+				}
+				else if (startRow == startCol) // Diagonal
+				{
+					// The y Values can be obtained from  xValues
+					xValue     = xValueRead;
+					yValue 	   = yValueRead;
+					xValueConj = xValueRead;
+					yValueConj = yValueRead;
+
+					%TYPE res1, res2;
+					// X * Y(H)
+					%MUL(res1, alpha, yValueConj);
+					%MUL(res2, res1,  xValue);
+					// Y * X(H)
+					%MUL(res1, alpha, xValueConj);
+					%MAD(res2, res1,  yValue);
+
+					// Discard the imaginary component of A
+					%ADD(A[ startRow * lda + startCol], A[ startRow * lda + startCol], res2);
+				}
+			}
+		}
+	}
+	else if ( ref_x == (( N - 1) - ((nBlocks - 1) * %TARGET_ROWS))) // First Row Strip blocks ( May not fit into target region)
+	{
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		for( int i = (ref_x - threadID); i >= 0; i -= get_local_size(0))
+		{
+			xShared[ref_x - i] = X[ i * incx];
+			yShared[ref_x - i] = Y[ i * incy];
+		}
+
+		// Populating yShared: Always fits well..
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			xSharedConj[(%TARGET_ROWS - 1) - (ref_y - i)] = loadedX;
+			ySharedConj[(%TARGET_ROWS - 1) - (ref_y - i)] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_HEIGHT)  + 1;
+
+		int rowShift = threadID / TARGET_ROWS_BY_VEC;
+		int colShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+		int startRow = ref_x - rowShift;
+		int startCol = ref_y - colShift;
+
+		%TYPE%V  loadedYConj, loadedXConj;
+
+		if ( startRow  >= 0 )
+		{
+			loadedYConj =  *((__local %TYPE%V*)( &ySharedConj[ (%TARGET_ROWS - 1) - colShift]));
+			loadedXConj =  *((__local %TYPE%V*)( &xSharedConj[ (%TARGET_ROWS - 1) - colShift]));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startRow = ref_x - rowShift - ( i - 1 ) * TARGET_HEIGHT;
+
+			if ( startRow  >= 0 )
+			{
+					loadedA  = %VLOAD( 0, (&A[ startRow * lda + startCol]));
+
+					%TYPE 	 loadedX = xShared[ ref_x - startRow];
+					%TYPE 	 loadedY = yShared[ ref_x - startRow];
+					%TYPE 	 res;
+					// X * Y(H)
+					%MUL(res, loadedX, alpha);
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedYConj, resVec);
+
+					// Y * X(H)
+					%MUL(res, loadedY, alpha);
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedXConj, resVec);
+
+					%VSTORE(  loadedA, 0, (&A[ startRow * lda + startCol]));
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+		%TYPE%V loadedA;
+
+		// Populating xShared
+		for( int i = (ref_x - threadID); ((ref_x - i) < %TARGET_ROWS); i -= get_local_size(0))
+		{
+			xShared[(ref_x - i)] = X[ i * incx];
+			yShared[(ref_x - i)] = Y[ i * incy];
+		}
+
+		// Populating yShared
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			xSharedConj[(%TARGET_ROWS - 1) - (ref_y - i)] = loadedX;
+			ySharedConj[(%TARGET_ROWS - 1) - (ref_y - i)] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_HEIGHT)  + 1;
+
+		int rowShift = threadID / TARGET_ROWS_BY_VEC;
+		int colShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+
+		int startRow = ref_x - rowShift;
+		int startCol = ref_y - colShift;
+
+		%TYPE%V  loadedYConj, loadedXConj;
+		// Not all threads should do this..
+		// Depends on whether blocksize width is > target_rows
+		if ( startRow > ( ref_x - %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedYConj =  *((__local %TYPE%V*)( &ySharedConj[ (%TARGET_ROWS - 1) - colShift]));
+			loadedXConj =  *((__local %TYPE%V*)( &xSharedConj[ (%TARGET_ROWS - 1) - colShift]));
+		}
+
+		for( int i= 1; i <= nLoops; i++)
+		{
+			startRow = ref_x - rowShift - ( i - 1 ) * TARGET_HEIGHT;
+
+			if ( startRow > ( ref_x - %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  = %VLOAD( 0, (&A[ startRow * lda + startCol]));
+
+				%TYPE 	 loadedX = xShared[ ref_x - startRow];
+				%TYPE 	 loadedY = yShared[ ref_x - startRow];
+				%TYPE 	 res;
+				// X * Y(H)
+				%MUL(res, loadedX, alpha);
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedYConj, resVec);
+
+				// Y * X(H)
+				%MUL(res, loadedY, alpha);
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedXConj, resVec);
+
+				%VSTORE(  loadedA, 0, (&A[ startRow * lda + startCol]));
+
+			}
+		}
+
+	}
+}
+\n";
+*/
diff --git a/src/library/blas/gens/clTemplates/syr2_her2.cl b/src/library/blas/gens/clTemplates/syr2_her2.cl
new file mode 100644
index 0000000..b335e8c
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/syr2_her2.cl
@@ -0,0 +1,743 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+// Column-major, lower-triangle SYR2/HER2 rank-2 update kernel template (percent-prefixed tokens are placeholders, presumably substituted before the OpenCL build)
+static const char *syr2_her2_CL_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+	#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+// A(row, col) addresses one matrix element: packed lower-triangular columns when PACKED is defined, lda-strided columns otherwise
+#ifdef PACKED
+	#define A( row, col) (*( A + ((( col *((2*N) + 1 - col)) / 2) + (row - col))))
+#else
+	#define A( row, col) A[ row + col * lda]
+#endif
+
+#define TARGET_ROWS_BY_VEC 	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+// Column-Major Lower: A(r,c) += alpha*X[r]*Y[c] + alpha*Y[r]*X[c]; with HER2_ONLY one operand of each product is conjugated and the second term uses conj(alpha)
+// nBlocks 	= (N - 1)/ TR + 1   (TR is the tile edge TARGET_ROWS; each work-group updates one tile of the lower triangle)
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+__kernel void %PREFIXsyr2_her2_CL_kernel( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+{
+
+	__global const %TYPE* X;
+	__global const %TYPE* Y;
+	__global %TYPE* A;
+
+	__local %TYPE xShared[%TARGET_ROWS];
+	__local %TYPE yShared[%TARGET_ROWS];
+	__local %TYPE xSharedConj[%TARGET_ROWS];
+	__local %TYPE ySharedConj[%TARGET_ROWS];
+	// Nothing to update when alpha is zero
+	#ifdef HER2_ONLY
+		if( (alpha.even == 0.0f) && (alpha.odd == 0.0f) )
+			return;
+	#else
+		if(alpha == 0.0f)
+			return;
+	#endif
+
+	int nBlocks = ((N - 1) / %TARGET_ROWS) + 1;
+
+	A = _A + offa;
+
+	if ( incx < 0 ) // Negative stride: start from the far end so that X[ i * incx] walks backwards
+	{
+		X	 = _X + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	if ( incy < 0 ) // Negative stride: start from the far end so that Y[ i * incy] walks backwards
+	{
+		Y	 = _Y + offy + ( N - 1) * abs(incy);
+	}
+	else
+	{
+		Y = _Y + offy;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+
+	__local int iShared;
+	__local int jShared;
+
+	// Thread 0 maps the linear group id to tile coordinates (i = block row, j = block column, i >= j) and shares them through local memory
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+	// ref_x / ref_y: first row / first column of this tile
+	int ref_x = i * %TARGET_ROWS;
+	int ref_y = j * %TARGET_ROWS;
+	#ifdef HER2_ONLY
+		%TYPE conjAlpha	 = alpha;
+		%CONJUGATE( 1, conjAlpha );
+	#endif
+
+	// Three kinds of tile are handled separately below:
+	// diagonal tiles (i == j), tiles of the last block row (may extend past N) and interior tiles that fit exactly.
+	// Diagonal tiles: only the lower triangle of the tile is updated, one element per loop iteration (scalar code),
+	// reading X and Y straight from global memory; the last diagonal tile may be smaller than the tile edge.
+	// For HER2 the imaginary part of every diagonal element is forced to zero.
+	if ( i == j)
+	{
+		int ncols = ((ref_y + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_y);
+        int nrows = ((ref_x + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_x);
+        int nElements = ((nrows) * ((ncols) + 1)) >> 1; // number of elements in the lower triangle of this tile
+        nrows -= 1;
+        ncols -= 1;
+        for(i = threadID; i < nElements; i += get_local_size(0))
+        {
+            int r = -1, c = -1; // (r, c): position of linear element i inside the lower triangle of the tile
+            for(int k = 1; (k <= %TARGET_ROWS); k ++)
+            {
+                int temp = ((k - 1) * k) >> 1;
+                r = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : r;
+            }
+            c = i - (((r + 1) * r) >> 1);
+
+            r = ref_x + r;
+            c = ref_y + c;
+
+            %TYPE res1, res2, res3, res4, res5;
+            res1 = X[r * incx];
+            res2 = X[c * incx];
+            #ifdef HER2_ONLY
+				#ifdef HER2_ROWMAJOR
+					%CONJUGATE(1, res1);
+				#endif
+			#endif
+            %MUL( res5, alpha, res1 );
+            res1 = Y[c * incy]; // Y is strided by incy (was incx, which read the wrong elements whenever incx != incy)
+            res3 = Y[r * incy];
+            #ifdef HER2_ONLY
+				#ifndef HER2_ROWMAJOR
+					%CONJUGATE(1, res1);
+				#endif
+			#endif
+            %MUL( res4, res5, res1 );
+            #ifdef HER2_ONLY
+				#ifdef HER2_ROWMAJOR
+					%CONJUGATE(1, res3);
+				#else
+					%CONJUGATE(1, res2);
+				#endif
+				%MUL( res5, conjAlpha, res3 );
+            #else
+				%MUL( res5, alpha, res3 );
+            #endif
+            %MAD( res4, res5, res2 );
+            res1 = A( r , c );
+            %ADD( res2, res1, res4 );
+            #ifdef HER2_ONLY
+			 /* HER2 defn: On output, if alpha not equal to 0.0, then imaginary part of A is set to zero. */
+
+				res2.odd = (r == c) ? 0.0 : res2.odd;
+
+			#endif
+
+			A( r , c ) = res2;
+        }
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared/yShared with the row-side X and Y values: the tile may extend past N, so stop at N
+		for( int i = (ref_x + threadID); i < N; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ONLY
+				#ifdef HER2_ROWMAJOR
+					%CONJUGATE(1, loadedX);		//taking conjugate while loading
+					%CONJUGATE(1, loadedY);
+				#endif
+			#endif
+			xShared[i - ref_x] = loadedX;
+			yShared[i - ref_x] = loadedY;
+		}
+
+		// Populating xSharedConj/ySharedConj with the column-side X and Y values: a full tile always fits here
+		for( int i = (ref_y  + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ONLY
+				#ifndef HER2_ROWMAJOR
+					%CONJUGATE(1, loadedX);		//taking conjugate while loading
+					%CONJUGATE(1, loadedY);
+				#endif
+			#endif
+			xSharedConj[(i - ref_y) ] = loadedX;
+			ySharedConj[(i - ref_y) ] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Each pass covers TARGET_WIDTH columns of the tile, so loop ceil(%TARGET_ROWS / TARGET_WIDTH) times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+		// startRow stays fixed per thread; only the column advances in the loop below
+		int startRow = ref_x + rowShift;
+		%TYPE%V  loadedX, loadedY;
+		// Vector-load the row-side values only when all %V rows lie inside the matrix
+		if ( startRow  < (N - (%V - 1)) )
+		{
+			loadedX=  *((__local %TYPE%V*)( xShared + rowShift));
+			loadedY=  *((__local %TYPE%V*)( yShared + rowShift));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH;
+
+			if ( ( startRow  < N ) && ( startCol  < (ref_y + %TARGET_ROWS ) ) )// threads that fall into target region
+			{
+				if(( startRow + %V) > N )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValueConj = ySharedConj[ startCol - ref_y];
+					%TYPE xValueConj = xSharedConj[ startCol - ref_y];
+
+					for(int row = startRow; row < N; row++)
+					{
+						%TYPE xValue = xShared[ row - ref_x];
+						%TYPE yValue = yShared[ row - ref_x];
+
+						%TYPE res1, res2;
+						// X * Y(H)
+						%MUL(res1, alpha, yValueConj);
+						%MUL( res2, res1,  xValue);
+
+						// Y * X(H)
+						#ifdef HER2_ONLY
+							%MUL(res1, conjAlpha, xValueConj);
+						#else
+							%MUL(res1, alpha, xValueConj);
+						#endif
+						%MAD( res2, res1,  yValue);
+						A( row , startCol ) += res2;
+					}
+				}
+				else
+				{
+					loadedA  	= %VLOAD( 0, (&A( startRow , startCol )));
+
+					%TYPE 	 loadedYConj = ySharedConj[ startCol - ref_y];
+					%TYPE 	 loadedXConj = xSharedConj[ startCol - ref_y];
+					%TYPE 	 res;
+
+					// X * Y(H)
+					%MUL(res, loadedYConj, alpha);
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+
+					// Y * X(H)
+					#ifdef HER2_ONLY
+						%MUL(res, loadedXConj, conjAlpha);
+					#else
+						%MUL(res, loadedXConj, alpha);
+					#endif
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedY, resVec);
+
+					%VSTORE(  loadedA, 0, (&A( startRow , startCol )));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared/yShared with the row-side X and Y values of this tile
+		for( int i = (ref_x + threadID); (i - ref_x) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ONLY
+				#ifdef HER2_ROWMAJOR
+					%CONJUGATE(1, loadedX);		//taking conjugate while loading
+					%CONJUGATE(1, loadedY);
+				#endif
+			#endif
+			xShared[i - ref_x] = loadedX;
+			yShared[i - ref_x] = loadedY;
+		}
+
+		// Populating xSharedConj/ySharedConj with the column-side X and Y values of this tile
+		for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ONLY
+				#ifndef HER2_ROWMAJOR
+					%CONJUGATE(1, loadedX);		//taking conjugate while loading
+					%CONJUGATE(1, loadedY);
+				#endif
+			#endif
+			xSharedConj[i - ref_y] = loadedX;
+			ySharedConj[i - ref_y] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Each pass covers TARGET_WIDTH columns of the tile, so loop ceil(%TARGET_ROWS / TARGET_WIDTH) times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+		int startRow = ref_x + rowShift;
+		int startCol = ref_y + colShift;
+		%TYPE%V  loadedX, loadedY;
+
+		if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX	 =  *((__local %TYPE%V*)( xShared + rowShift));
+			loadedY	 =  *((__local %TYPE%V*)( yShared + rowShift));
+		}
+
+		//#pragma unroll
+		for( int i= 1; i <= nLoops; i++)
+		{
+			startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH;
+
+			if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  	 = %VLOAD( 0, (&A( startRow , startCol )));
+				%TYPE 	 loadedYConj = ySharedConj[ startCol - ref_y];
+				%TYPE 	 loadedXConj = xSharedConj[ startCol - ref_y];
+
+				// X * Y(H)
+				%TYPE 	 res;
+				%MUL(res, loadedYConj, alpha);
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+
+				// Y * X(H)
+				#ifdef HER2_ONLY
+					%MUL(res, loadedXConj, conjAlpha);
+				#else
+					%MUL(res, loadedXConj, alpha);
+				#endif
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedY, resVec);
+				%VSTORE(  loadedA, 0, (&A( startRow , startCol )));
+			}
+		}
+	}
+
+}
+\n";
+
+//Column Major Upper
+static const char *syr2_her2_CU_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#ifdef PACKED
+	#define A( row, col) (*( A + ((col*(col+1))/2 + row)))
+#else
+	#define A( row, col) A[ row + col * lda]
+#endif
+
+#define TARGET_ROWS_BY_VEC  (%TARGET_ROWS / %V)
+#define TARGET_WIDTH        (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT       (%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+// Column-Major Upper
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+__kernel void %PREFIXsyr2_her2_CU_kernel( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+{
+
+    __global const %TYPE* X;
+    __global const %TYPE* Y;
+    __global %TYPE* A;
+
+    __local %TYPE xShared[%TARGET_ROWS];
+    __local %TYPE yShared[%TARGET_ROWS];
+    __local %TYPE xSharedConj[%TARGET_ROWS];
+    __local %TYPE ySharedConj[%TARGET_ROWS];
+
+	#ifdef HER2_ONLY
+		if( (alpha.even == 0.0f) && (alpha.odd == 0.0f) )
+			return;
+	#else
+		if(alpha == 0.0f)
+			return;
+	#endif
+
+    int nBlocks = ((N - 1) / %TARGET_ROWS) + 1;
+
+    A = _A + offa;
+
+    if ( incx < 0 ) // Goto end of vector
+    {
+        X    = _X + offx + ( N - 1) * abs(incx);
+    }
+    else
+    {
+        X = _X + offx;
+    }
+
+    if ( incy < 0 ) // Goto end of vector
+    {
+        Y    = _Y + offy + ( N - 1) * abs(incy);
+    }
+    else
+    {
+        Y = _Y + offy;
+    }
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+
+	__local int iShared;
+	__local int jShared;
+
+	// Get (i,j) of Block
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		//for ( _j = 0; _j < nBlocks; _j++)
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+	#ifdef HER2_ONLY
+		%TYPE conjAlpha	 = alpha;
+		%CONJUGATE( 1, conjAlpha );
+	#endif
+
+	int ref_x = (N- 1) - i * %TARGET_ROWS;
+	int ref_y = (N- 1) - j * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+ 		int ncols = ((ref_y - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_y+1);
+        int nrows = ((ref_x - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_x+1);
+        int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+        nrows -= 1;
+        ncols -= 1;
+        for(i = threadID; i < nElements; i += get_local_size(0))
+        {
+            int r, c = -1;
+            for(int k = 1; (k <= %TARGET_ROWS); k ++)
+            {
+                int temp = ((k - 1) * k) >> 1;
+                c = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : c;
+            }
+            r = i - (((c + 1) * c) >> 1);
+
+            r = ref_x - (nrows) + r;
+            c = ref_y - (ncols) + c;
+
+            %TYPE res1, res2, res3, res4, res5;
+            res1 = X[r * incx];
+            res2 = X[c * incx];
+            #ifdef HER2_ONLY
+				#ifdef HER2_ROWMAJOR
+					%CONJUGATE(1, res1);
+				#endif
+			#endif
+            %MUL( res5, alpha, res1 );
+            res1 = Y[c * incx];
+            res3 = Y[r * incx];
+            #ifdef HER2_ONLY
+				#ifndef HER2_ROWMAJOR
+					%CONJUGATE(1, res1);
+				#endif
+			#endif
+            %MUL( res4, res5, res1 );
+            #ifdef HER2_ONLY
+				#ifdef HER2_ROWMAJOR
+					%CONJUGATE(1, res3);
+				#else
+					%CONJUGATE(1, res2);
+				#endif
+				%MUL( res5, conjAlpha, res3 );
+			#else
+				%MUL( res5, alpha, res3 );
+            #endif
+            %MAD( res4, res5, res2 );
+            res1 = A( r , c );
+            %ADD( res2, res1, res4 );
+            #ifdef HER2_ONLY
+			 /* HER2 defn: On output, if alpha not equal to 0.0, then imaginary part of A is set to zero. */
+
+				res2.odd = (r == c) ? 0.0 : res2.odd;
+
+			#endif
+
+			A( r , c ) = res2;
+        }
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		for( int i = (ref_x - threadID); i >= 0; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ONLY
+				#ifdef HER2_ROWMAJOR
+					%CONJUGATE(1, loadedX);		//taking conjugate while loading
+					%CONJUGATE(1, loadedY);
+				#endif
+			#endif
+			xShared[(%TARGET_ROWS - 1) - threadID] = loadedX;
+			yShared[(%TARGET_ROWS - 1) - threadID] = loadedY;
+		}
+
+		// Populating yShared: Always fits well..
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ONLY
+				#ifndef HER2_ROWMAJOR
+					%CONJUGATE(1, loadedX);		//taking conjugate while loading
+					%CONJUGATE(1, loadedY);
+				#endif
+			#endif
+			xSharedConj[(ref_y - i)] = loadedX;
+			ySharedConj[(ref_y - i)] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+		int startRow = ref_x - rowShift;
+		%TYPE%V  loadedX, loadedY;
+
+		if ( startRow  >= 0 )
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1) - rowShift]));
+			loadedY=  *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1) - rowShift]));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH;
+
+			// threads that fall into target region
+			if( ( startRow  > -(%V) ) && (startCol > (ref_y - %TARGET_ROWS)) )
+			{
+				if( startRow  < 0 )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValueConj = ySharedConj[ ref_y - startCol];
+					%TYPE xValueConj = xSharedConj[ ref_y - startCol];
+
+					for(int row = startRow + (%V - 1); row >= 0; row--)
+					{
+						%TYPE xValue = xShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+						%TYPE yValue = yShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+
+						%TYPE res1, res2;
+
+						// X * Y(H)
+						%MUL(res1, alpha, yValueConj);
+						%MUL( res2, res1,  xValue);
+
+						// Y * X(H)
+						#ifdef HER2_ONLY
+							%MUL(res1, conjAlpha, xValueConj);
+						#else
+							%MUL(res1, alpha, xValueConj);
+						#endif
+						%MAD( res2, res1,  yValue);
+						A( row , startCol ) += res2;
+					}
+				}
+				else
+				{
+					loadedA  = %VLOAD( 0, (&A( startRow , startCol )));
+
+					%TYPE 	 loadedXConj = xSharedConj[ ref_y - startCol];
+					%TYPE 	 loadedYConj = ySharedConj[ ref_y - startCol];
+					%TYPE 	 res;
+					// X * Y(H)
+					%MUL(res, loadedYConj, alpha);
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+
+					// Y * X(H)
+					#ifdef HER2_ONLY
+						%MUL(res, loadedXConj, conjAlpha);
+					#else
+						%MUL(res, loadedXConj, alpha);
+					#endif
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedY, resVec);
+					%VSTORE(  loadedA, 0, (&A( startRow , startCol )));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+		%TYPE%V loadedA;
+
+		// Populating xShared
+		for( int i = (ref_x - threadID); ((ref_x - i) < %TARGET_ROWS); i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ONLY
+				#ifdef HER2_ROWMAJOR
+					%CONJUGATE(1, loadedX);		//taking conjugate while loading
+					%CONJUGATE(1, loadedY);
+				#endif
+			#endif
+			xShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = loadedX;
+			yShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = loadedY;
+		}
+
+		// Populating yShared
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			%TYPE loadedY = Y[ i * incy];
+			#ifdef HER2_ONLY
+				#ifndef HER2_ROWMAJOR
+					%CONJUGATE(1, loadedX);		//taking conjugate while loading
+					%CONJUGATE(1, loadedY);
+				#endif
+			#endif
+			xSharedConj[(ref_y - i)] = loadedX;
+			ySharedConj[(ref_y - i)] = loadedY;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+
+		int startRow = ref_x - rowShift;
+		int startCol = ref_y - colShift;
+		%TYPE%V  loadedX, loadedY;
+		// Not all threads should do this..
+		// Depends on whether blocksize width is > target_rows
+		if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1)- rowShift]));
+			loadedY=  *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1)- rowShift]));
+		}
+
+		for( int i= 1; i <= nLoops; i++)
+		{
+			startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH;
+
+			if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  		 	 = %VLOAD( 0, (&A( startRow , startCol )));
+				%TYPE 	 loadedYConj = ySharedConj[ ref_y - startCol];
+	 			%TYPE 	 loadedXConj = xSharedConj[ ref_y - startCol];
+				%TYPE 	 res;
+				// X * Y(H)
+				%MUL(res, loadedYConj, alpha);
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+
+				// Y * X(H)
+				#ifdef HER2_ONLY
+					%MUL(res, loadedXConj, conjAlpha);
+				#else
+					%MUL(res, loadedXConj, alpha);
+				#endif
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedY, resVec);
+				%VSTORE(  loadedA, 0, (&A( startRow , startCol )));
+			}
+		}
+
+	}
+}
+";
diff --git a/src/library/blas/gens/clTemplates/syr_her.cl b/src/library/blas/gens/clTemplates/syr_her.cl
new file mode 100644
index 0000000..f58e468
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/syr_her.cl
@@ -0,0 +1,577 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+// Column-Major Lower SYR and HER kernels
+static const char *syr_her_CL_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#ifdef PACKED
+	#define A( row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
+#else
+	#define A( row, col) A[ (row) + (col) * lda]
+#endif
+
+#define TARGET_ROWS_BY_VEC 	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+
+// Column-Major Lower
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+__kernel void %PREFIXsyr_her_CL_kernel( __global %TYPE* _A, __global const %TYPE* _X, int N, int offx, int incx, int offa, int lda, %PTYPE alpha )
+{
+	__global const %TYPE* X;	// X is only read, so keep the const qualifier of _X
+	__global %TYPE *A;
+	__local %TYPE xShared[%TARGET_ROWS];
+	__local %TYPE yShared[%TARGET_ROWS];
+
+	// If alpha is zero the update term is zero, so A is left untouched.
+	if(alpha == 0.0)
+		return;
+
+	A = _A + offa;
+	if ( incx < 0 ) // Goto end of vector
+	{
+		X = _X + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+	int nBlocks  = ((N - 1) / %TARGET_ROWS) + 1;
+
+	__local int iShared;
+	__local int jShared;
+
+	// Work-item 0 maps the linear group id to block coordinates (i,j) and shares them via local memory
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		//for ( _j = 0; _j < nBlocks; _j++)
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+
+	int ref_x = i * %TARGET_ROWS;
+	int ref_y = j * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+ 		int ncols = ((ref_y + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_y);
+        int nrows = ((ref_x + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_x);
+        int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+        nrows -= 1;
+        ncols -= 1;
+        for(i = threadID; i < nElements; i += get_local_size(0))
+        {
+            int r = -1, c = -1;
+            for(int k = 1; (k <= %TARGET_ROWS); k ++)
+            {
+                int temp = ((k - 1) * k) >> 1;
+                r = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : r;
+            }
+            c = i - (((r + 1) * r) >> 1);
+
+            r = ref_x + r;
+            c = ref_y + c;
+
+			%TYPE res1, res2, res;
+            res1 = alpha * X[r * incx];
+			res2 = X[c * incx];
+
+			#ifdef HER_ONLY
+				#ifdef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, res1);
+				#else
+					%CONJUGATE(1, res2);
+				#endif
+			#endif
+
+			res = A( r , c );
+		    %MAD( res, res1, res2);
+
+			#ifdef HER_ONLY
+				/*
+			      HER defn: On input, the imaginary parts of the diagonal elements of the
+			      complex Hermitian matrix A are assumed to be zero, so you do not have to set
+			      these values. On output, the imaginary parts of diagonal elements are set to zero.
+				*/
+
+				res.odd = (r == c) ? 0.0 : res.odd;
+			#endif
+
+			A( r, c ) = res;
+        }
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		for( int i = (ref_x + threadID); i < N; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HER_ONLY
+				#ifdef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+				#endif
+			#endif
+			xShared[i - ref_x] = loadedX;
+		}
+
+		// Populating yShared: Always fits well..
+		for( int i = (ref_y  + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HER_ONLY
+				#ifndef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+				#endif
+			#endif
+			yShared[(i - ref_y) ] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+
+		int startRow = ref_x + rowShift;
+		%TYPE%V  loadedX;
+
+		if ( startRow  < (N - (%V - 1)) )
+		{
+			loadedX=  *((__local %TYPE%V*)( xShared + rowShift));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH;
+
+			if ( ( startRow  < N ) && ( startCol  < (ref_y + %TARGET_ROWS ) ) )// threads that fall into target region
+			{
+				if(( startRow + %V) > N )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValue = yShared[ startCol - ref_y];
+
+					for(int row = startRow; row < N; row++)
+					{
+						%TYPE xValue = xShared[ row - ref_x];
+						%TYPE res1, res2;
+						res1 = alpha * xValue;
+						%MUL( res2, res1, yValue);
+						A( row , startCol ) += res2;
+					}
+				}
+				else
+				{
+					loadedA  	= %VLOAD( 0, (&A( startRow , startCol )));
+
+					%TYPE 	 loadedY= yShared[ startCol - ref_y];
+					%TYPE 	 res;
+					res =  loadedY * alpha;
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+					%VSTORE(  loadedA, 0, (&A( startRow , startCol)));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared
+		for( int i = (ref_x + threadID); (i - ref_x) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HER_ONLY
+				#ifdef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+				#endif
+			#endif
+			xShared[i - ref_x] = loadedX;
+		}
+
+		// Populating yShared
+		for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HER_ONLY
+				#ifndef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+				#endif
+			#endif
+			yShared[(i - ref_y) ] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V;
+
+		int startRow = ref_x + rowShift;
+		int startCol = ref_y + colShift;
+		%TYPE%V  loadedX;
+
+		if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX	 =  *((__local %TYPE%V*)( xShared + rowShift));
+		}
+
+		//#pragma unroll
+		for( int i= 1; i <= nLoops; i++)
+		{
+			startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH;
+
+			if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA  	= %VLOAD( 0, (&A( startRow , startCol)));
+				%TYPE 	 loadedY= yShared[ startCol - ref_y];
+				%TYPE 	 res;
+				res =  loadedY * alpha;
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+				%VSTORE(  loadedA, 0, (&A( startRow , startCol )));
+			}
+		}
+	}
+}
+";
+
+// Column-Major Upper SYR and HER kernels
+static const char *syr_her_CU_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#ifdef PACKED
+	#define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
+#else
+	#define A( row, col) A[ (row) + (col) * lda]
+#endif
+
+#define TARGET_ROWS_BY_VEC   	(%TARGET_ROWS / %V)
+#define TARGET_WIDTH     		(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+#define TARGET_HEIGHT        	(%BLOCKSIZE / TARGET_ROWS_BY_VEC )
+
+// Column-Major Upper
+// nBlocks 	= (N - 1)/ TR + 1
+// totalBlocks 	= (nBlocks * ( nBlocks + 1)) / 2
+__kernel void %PREFIXsyr_her_CU_kernel( __global %TYPE* _A, __global const %TYPE* _X, int N, int offx, int incx, int offa, int lda, %PTYPE alpha )
+{
+	__global const %TYPE* X;	// X is only read, so keep the const qualifier of _X
+	__global %TYPE *A;
+
+	__local %TYPE xShared[%TARGET_ROWS];
+	__local %TYPE yShared[%TARGET_ROWS];
+
+	// If alpha is zero the update term is zero, so A is left untouched.
+	if(alpha == 0.0)
+		return;
+
+	A = _A + offa;
+	if ( incx < 0 ) // Goto end of vector
+	{
+		X = _X + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		X = _X + offx;
+	}
+
+	int blockID  = get_group_id(0);
+	int threadID = get_local_id(0);
+	int nBlocks  = ((N - 1) / %TARGET_ROWS) + 1;
+
+	__local int iShared;
+	__local int jShared;
+
+	// Work-item 0 maps the linear group id to block coordinates (i,j) and shares them via local memory
+	if ( threadID == 0)
+	{
+		int _i = 0, _j = 0;
+		//for ( _j = 0; _j < nBlocks; _j++)
+		for ( _j = (blockID / nBlocks); _j < nBlocks; _j++)
+		{
+			_i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j;
+			if ( _i < nBlocks && ( _i >= 0) )
+			{
+				break;
+			}
+		}
+
+		iShared = _i;
+		jShared = _j;
+	}
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int i = iShared;
+	int j = jShared;
+
+	int ref_x = (N- 1) - i * %TARGET_ROWS;
+	int ref_y = (N- 1) - j * %TARGET_ROWS;
+
+	// Load data into xShared and yShared
+	// Not a common task among blocks in the present implementation..
+
+	// Diagonal Blocks : Should handle not reading diagonal element complex value
+	// Diagonal blocks : Should handle the last block as well
+	// Scalar code in Present implementation
+	if ( i == j)
+	{
+		int ncols = ((ref_y - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_y+1);
+		int nrows = ((ref_x - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_x+1);
+		int nElements = ((nrows) * ((ncols) + 1)) >> 1;
+		nrows -= 1;
+		ncols -= 1;
+		for(i = threadID; i < nElements; i += get_local_size(0))
+		{
+			int r, c = -1;
+			for(int k = 1; (k <= %TARGET_ROWS); k ++)
+			{
+				int temp = ((k - 1) * k) >> 1;
+				c = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : c;
+			}
+			r = i - (((c + 1) * c) >> 1);
+
+			r = ref_x - (nrows) + r;
+			c = ref_y - (ncols) + c;
+
+			%TYPE res1, res2, res;
+            res1 = alpha * X[r * incx];
+			res2 = X[c * incx];
+
+			#ifdef HER_ONLY
+				#ifdef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, res1);
+				#else
+					%CONJUGATE(1, res2);
+				#endif
+			#endif
+
+			res = A(r , c );
+		    %MAD( res, res1, res2);
+
+			#ifdef HER_ONLY
+				/*
+			      HER defn: On input, the imaginary parts of the diagonal elements of the
+			      complex Hermitian matrix A are assumed to be zero, so you do not have to set
+			      these values. On output, the imaginary parts of diagonal elements are set to zero.
+				*/
+
+				res.odd = (r == c) ? 0.0 : res.odd;
+			#endif
+
+			A( r , c ) = res;
+		}
+	}
+	else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region)
+	{
+
+		%TYPE%V loadedA;
+
+		// Populating xShared: May not fit into target region
+		for( int i = (ref_x - threadID); i >= 0; i -= get_local_size(0))
+		{
+			// The slot is derived from (ref_x - i), not from threadID, so every
+			// loop iteration fills its own entry even if the loop runs more than once.
+			// (An earlier threadID-indexed store was only correct for a single iteration.)
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HER_ONLY
+				#ifdef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+				#endif
+			#endif
+			xShared[(%TARGET_ROWS - 1) -(ref_x - i)] = loadedX;
+		}
+
+		// Populating yShared: Always fits well..
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HER_ONLY
+				#ifndef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+				#endif
+			#endif
+			yShared[(ref_y - i)] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+		int startRow = ref_x - rowShift;
+		%TYPE%V  loadedX;
+
+		if ( startRow  >= 0 )
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1) - rowShift]));
+		}
+
+		for( int i= 1; i <=  nLoops; i++)
+		{
+			int startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH;
+
+			// threads that fall into target region
+			if( ( startRow  > -(%V) ) && (startCol > (ref_y - %TARGET_ROWS)) )
+			{
+				if( startRow  < 0 )// Loop serially as can't do VLOAD
+				{
+					%TYPE yValue = yShared[ ref_y - startCol];
+
+					for(int row = startRow + (%V - 1); row >= 0; row--)
+					{
+						%TYPE xValue = xShared[ %TARGET_ROWS - 1 - (ref_x - row)];
+						%TYPE res1, res2;
+						res1 = alpha * xValue;
+						%MUL( res2, res1, yValue);
+						A( row , startCol ) += res2;
+					}
+				}
+				else
+				{
+					loadedA  = %VLOAD( 0, (&A( startRow , startCol )));
+
+					%TYPE 	 loadedY= yShared[ ref_y - startCol];
+					%TYPE 	 res;
+					res =  loadedY * alpha;
+					%TYPE%V  resVec;
+					resVec = %VMAKEVEC(res);
+					%VMAD( loadedA, loadedX, resVec);
+					%VSTORE(  loadedA, 0, (&A( startRow , startCol )));
+				}
+			}
+		}
+	}
+	else // blocks that fit exactly.
+	{
+		%TYPE%V loadedA;
+
+		// Populating xShared
+		for( int i = (ref_x - threadID); ((ref_x - i) < %TARGET_ROWS); i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HER_ONLY
+				#ifdef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+				#endif
+			#endif
+			xShared[(%TARGET_ROWS - 1) -(ref_x - i)] = loadedX;
+		}
+
+		// Populating yShared
+		for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0))
+		{
+			%TYPE loadedX = X[ i * incx];
+			#ifdef HER_ONLY
+				#ifndef HERMITIAN_ROWMAJOR
+					%CONJUGATE(1, loadedX); // Taking conjugate while loading only
+				#endif
+			#endif
+			yShared[(ref_y - i)] = loadedX;
+		}
+
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Loop %TARGET_ROWS / TARGET_WIDTH times
+		int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH)  + 1;
+
+		int colShift = threadID / TARGET_ROWS_BY_VEC;
+		int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1);
+
+
+		int startRow = ref_x - rowShift;
+		int startCol = ref_y - colShift;
+		%TYPE%V  loadedX;
+		// Not all threads should do this..
+		// Depends on whether blocksize width is > target_rows
+		if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+		{
+			loadedX=  *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1)- rowShift]));
+		}
+
+		for( int i = 1; i <= nLoops; i++)
+		{
+			startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH;
+
+			if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region
+			{
+				loadedA = %VLOAD( 0, (&A( startRow , startCol )));
+				%TYPE  loadedY = yShared[ ref_y - startCol];
+				%TYPE  res;
+				res = loadedY * alpha;
+				%TYPE%V  resVec;
+				resVec = %VMAKEVEC(res);
+				%VMAD( loadedA, loadedX, resVec);
+				%VSTORE(  loadedA, 0, (&A( startRow , startCol )));
+			}
+		}
+
+	}
+}
+";
diff --git a/src/library/blas/gens/clTemplates/trmv.cl b/src/library/blas/gens/clTemplates/trmv.cl
new file mode 100644
index 0000000..21af6ce
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/trmv.cl
@@ -0,0 +1,931 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+// Column-Major Upper Case
+static const char *trmv_CU_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#ifdef PACKED
+	#define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
+#else
+	#define A( row, col) A[ (row) + (col) * lda]
+#endif
+
+#define TARGET_ROWS_BY_VEC  ((%TARGET_ROWS)/(%V))
+#define TARGET_WIDTH ((%BLOCKSIZE)/(TARGET_ROWS_BY_VEC))
+
+__kernel void %PREFIXtrmv_CU_kernel( __global %TYPE const* restrict _A, __global %TYPE * _xnew, __global %TYPE const* restrict _x_vector, uint N,
+									int incx, int isUnity, uint lda, int doConj, uint offa, uint offx
+#ifdef HEMV_ONLY
+, int incy, uint offy, %TYPE alpha, %TYPE beta
+#endif
+ )
+{
+	__global %TYPE const* x_vector;
+	__global %TYPE* xnew;
+	__global %TYPE const* restrict A;
+
+	A = _A + offa;
+	if ( incx < 0 ) // Goto end of vector
+	{
+		#ifdef HEMV_ONLY
+			x_vector = _x_vector + offx + ( N - 1) * abs(incx);
+		#else
+			x_vector = _x_vector + ( N - 1) * abs(incx);
+			xnew	 = _xnew  + (N - 1) * abs(incx) + offx;
+		#endif
+	}
+	else
+	{
+		#ifdef HEMV_ONLY
+			x_vector = _x_vector + offx;
+		#else
+			x_vector = _x_vector;
+			xnew 	 = _xnew + offx;
+		#endif
+	}
+
+	#ifdef HEMV_ONLY
+	if(incy < 0)
+		xnew  = _xnew + offy + ( N - 1) * abs(incy);
+	else
+		xnew = _xnew + offy;
+	#endif
+
+	volatile __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH]; // Per-thread partial sums; __local arrays must be declared at kernel function scope
+	__local %TYPE  sXData[ TARGET_WIDTH ]; // Each column is multiplied with a common x_vector element
+
+	const int gIdx = get_global_id(0);
+	const int bIdx = get_group_id(0);
+	const int threadIdx = get_local_id(0);
+	const int TARGET_ROWS  = %TARGET_ROWS;
+
+	// The last block handles the trailing rows of the matrix,
+	// which may number fewer than TARGET_ROWS
+	int nBlocks = (N-1)/ %TARGET_ROWS + 1;
+
+	if( bIdx == (nBlocks-1))
+	{
+		// Variables that don't change while looping
+		int startRow = bIdx * %TARGET_ROWS;
+		int destRow  = (startRow + threadIdx) ;
+		if( destRow >= N)
+		{
+			return;
+		}
+
+		//float acc = 0.0f;
+		%TYPE acc 	= %MAKEVEC( 0.0);
+		%TYPE accTemp 	= %MAKEVEC( 0.0);
+
+		for ( int j= ( N - 1 ) ; j > destRow ; j--)
+		{
+			//acc += A( destRow, j) * x_vector[ j * incx];
+			accTemp = A( destRow, j);
+			%CONJUGATE(doConj, accTemp);
+			%MAD(acc, accTemp, x_vector[ j * incx]);
+		}
+
+		if ( isUnity )
+		{
+			#ifdef HEMV_ONLY
+				%TYPE acc1, temp;
+                %MUL(acc1, acc, alpha);
+                temp = xnew[ destRow * incy];
+                %ADD(xnew[ destRow * incy], temp, acc1);
+			#else
+				%ADD(xnew[ destRow * incx] , acc, x_vector[ destRow * incx]);
+			#endif
+		}
+		else
+		{
+			//xnew[ destRow * incx] = acc + A( destRow , destRow) * x_vector[ destRow * incx];
+			accTemp = A( destRow, destRow);
+
+			#ifdef HEMV_ONLY
+                #ifndef SPMV_ONLY
+				    // accTemp.odd = 0.0f;
+                    %CLEAR_IMAGINARY( accTemp );
+			    #endif
+            #endif
+
+			%CONJUGATE(doConj, accTemp);
+			%MAD(acc, accTemp, x_vector[ destRow * incx]);
+
+			#ifdef HEMV_ONLY
+				%TYPE temp, acc1;
+				%MUL(temp, xnew[ destRow * incy], beta);
+				%MUL(acc1, acc, alpha);
+				%ADD(xnew[ destRow * incy], temp, acc1);
+			#else
+				xnew[ destRow * incx] = acc;
+			#endif
+		}
+	}
+	else
+	{
+		%TYPE sumTemp= %MAKEVEC( 0.0);
+		%TYPE%V sum = %VMAKEVEC( sumTemp);
+
+		// Variables that don't change while looping
+		int startRow = bIdx * %TARGET_ROWS;
+		//int rowShift = ((threadIdx & ( TARGET_ROWS_BY_VEC -1 )) * %V);
+		int rowShift = ((threadIdx %  (TARGET_ROWS_BY_VEC)) * %V);
+		int colShift = threadIdx / TARGET_ROWS_BY_VEC;
+
+		int row	= startRow + rowShift;
+
+		// gIdx is not destination row.
+
+		// startRow may be less than 4
+		// So nLoops will be negative
+		// and the FOR loop doesn't execute
+		int nLoops = (( N - (startRow + %TARGET_ROWS))/ TARGET_WIDTH) - 1;
+
+		for( int j=0; j <= (nLoops); j++)
+		{
+			int startCol	= N - (j + 1)* TARGET_WIDTH;
+			int col 	= startCol + colShift;
+
+			//
+			// Only TARGET_WIDTH elements are to be read from the X-vector
+			// We don't use VLOAD here because incx could be > 1
+			// Minimal prototyping shows that having separate loading code
+			// for incx value of 1 does not change anything in performance
+			// In fact, the extra IF costs us.
+			//
+			barrier(CLK_LOCAL_MEM_FENCE);
+			if (threadIdx < TARGET_WIDTH)
+			{
+				sXData[threadIdx] = x_vector[(startCol + threadIdx) * incx];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			// TARGET_ROWS_BY_VEC way bank-conflict : May broadcast if TARGET_ROWS = BLOCKSIZE, which reduces occupancy
+			// And we lose performance as we don't have enough blocks to hide memory access and compute latencies per MP
+			%TYPE xData =  sXData[colShift];
+
+			//sum += vload4(0, &A( row, col)) * ((float4)( xData, xData, xData, xData));
+			// ((float4)( xData, xData, xData, xData));
+			%TYPE%V loadedA = %VLOAD(0, (&A( row, col)));
+			%CONJUGATE(doConj, loadedA);
+
+			%TYPE%V xDataTemp = %VMAKEVEC(xData);
+			%VMAD( sum, loadedA, xDataTemp);
+		}
+
+
+		// sDataTemp (declared at kernel scope) receives one partial-sum vector per thread; sData views it as scalars
+		volatile __local %TYPE* sData = (volatile __local %TYPE*) sDataTemp;
+		//sDataTemp[(threadIdx & ( TARGET_ROWS_BY_VEC -1 )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
+		sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Reduce each block with TARGET_ROWS threads to generate TARGET_ROWS acc values
+		if ( threadIdx < %TARGET_ROWS)
+		{
+			//float acc = 0.0f;
+			%TYPE acc 	= %MAKEVEC( 0.0);
+			%TYPE accTemp 	= %MAKEVEC( 0.0);
+			int desRow  = (bIdx * %TARGET_ROWS)+ threadIdx;
+
+			//#pragma unroll TARGET_WIDTH
+			for( int j=0; j < TARGET_WIDTH; j++)
+			{
+				//acc += sData[ threadIdx + j * FTARGET_ROWS];
+				%ADD(acc, acc, sData[ threadIdx + j * TARGET_ROWS]);
+			}
+
+			for ( int j= (N  - (nLoops+1)* TARGET_WIDTH - 1) ; j > desRow; j--)
+			{
+				//acc += A( desRow, j) * x_vector[ j * incx];
+				accTemp = A( desRow, j);
+				%CONJUGATE(doConj, accTemp);
+				%MAD(acc, accTemp, x_vector[ j * incx]);
+			}
+
+			if ( isUnity )
+			{
+				//%ADD(xnew[ desRow * incx], acc, x_vector[ desRow * incx]);
+				#ifdef HEMV_ONLY
+		            %TYPE acc1, temp;
+                    %MUL(acc1, acc, alpha);
+                    temp = xnew[ desRow * incy];
+                    %ADD(xnew[ desRow * incy], temp, acc1);
+            	#else
+                	%ADD(xnew[ desRow * incx] , acc, x_vector[ desRow * incx]);
+            	#endif
+			}
+			else
+			{
+				// xnew[ desRow * incx] =  acc + A( desRow, desRow) * x_vector[ desRow * incx];
+				accTemp = A( desRow, desRow );
+
+            	#ifdef HEMV_ONLY
+                    #ifndef SPMV_ONLY
+            	        //accTemp.odd = 0.0f;
+                        %CLEAR_IMAGINARY( accTemp );
+            	    #endif
+                #endif
+
+				%CONJUGATE(doConj, accTemp);
+				%MAD(acc, accTemp, x_vector[ desRow * incx]);
+
+	            #ifdef HEMV_ONLY
+    	            %TYPE temp, acc1;
+            	    %MUL(temp, xnew[ desRow * incy], beta);
+					%MUL(acc1, acc, alpha);
+               		%ADD(xnew[ desRow * incy], temp, acc1);
+            	#else
+                	xnew[ desRow * incx] = acc;
+            	#endif
+			}
+		}
+		barrier(CLK_GLOBAL_MEM_FENCE);
+	}
+}";
+
+// Column-Major Lower Case
+//
+// Kernel template: for a destination row r, a work-item accumulates
+// A(r, j) * x_vector[j * incx] over the columns j < r, then adds the diagonal
+// contribution and stores the result in xnew.
+//
+// Work distribution, as coded below:
+//  - The work-group with bIdx == nBlocks - 1 handles the top
+//    N - (nBlocks - 1) * TARGET_ROWS rows; each work-item runs a scalar loop
+//    and work-items beyond the last row return immediately.
+//  - Every other work-group handles TARGET_ROWS rows starting at
+//    N - (bIdx + 1) * TARGET_ROWS. Columns are consumed in panels of
+//    TARGET_WIDTH: x values are staged in local memory (sXData), A is read with
+//    vector loads, the per-work-item vector sums are exchanged through local
+//    memory (sDataTemp) and the first TARGET_ROWS work-items finish the
+//    remaining columns and the diagonal with scalar code.
+//
+// Arguments:
+//  _A, lda, offa - matrix; A(row, col) is A[row + col * lda] after adding offa,
+//                  or the packed-lower index formula when PACKED is defined.
+//  _x_vector     - input vector, read with stride incx.
+//  _xnew         - output vector.
+//  N             - number of rows/columns processed.
+//  incx          - stride; when negative the base pointers are advanced by
+//                  (N - 1) * abs(incx) before indexing with index * incx.
+//  isUnity       - non-zero: the diagonal of A is not read and
+//                  x_vector[r * incx] is added to the accumulated sum instead.
+//  doConj        - forwarded to the CONJUGATE template macro.
+//  offx          - offset added to _xnew (and to _x_vector only when HEMV_ONLY
+//                  is defined).
+//  incy, offy, alpha, beta (HEMV_ONLY only) - xnew is addressed with
+//                  offy/incy and the stored value becomes
+//                  beta * xnew + alpha * acc, or xnew + alpha * acc when
+//                  isUnity is set.
+
+static const char *trmv_CL_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#ifdef PACKED
+	#define A( row, col) (*( A + ((( col *((2*N) + 1 - col)) / 2) + (row - col))))
+#else
+	#define A( row, col) A[ row + col * lda]
+#endif
+
+#define TARGET_ROWS_BY_VEC  ((%TARGET_ROWS)/(%V))
+#define TARGET_WIDTH ((%BLOCKSIZE)/(TARGET_ROWS_BY_VEC))
+__kernel void %PREFIXtrmv_CL_kernel( __global %TYPE const* restrict _A, __global %TYPE* _xnew, __global %TYPE const* restrict _x_vector,
+									uint N, int incx, int isUnity, uint lda, int doConj, uint offa, uint offx
+#ifdef HEMV_ONLY
+, int incy, uint offy, %TYPE alpha, %TYPE beta
+#endif
+ )
+{
+	__global %TYPE* x_vector;
+	__global %TYPE* xnew;
+	__global %TYPE const * restrict A;
+
+	A = _A + offa;
+	if ( incx < 0 ) // Goto end of vector
+	{
+		#ifdef HEMV_ONLY
+            x_vector = _x_vector + offx + ( N - 1) * abs(incx);
+        #else
+            x_vector = _x_vector + ( N - 1) * abs(incx);
+            xnew     = _xnew + offx + ( N - 1) * abs(incx);
+        #endif
+	}
+	else
+	{
+		#ifdef HEMV_ONLY
+            x_vector = _x_vector + offx;
+        #else
+            x_vector = _x_vector;
+            xnew     = _xnew + offx;
+        #endif
+	}
+
+
+    #ifdef HEMV_ONLY
+    if(incy < 0)
+        xnew  = _xnew + offy + ( N - 1) * abs(incy);
+    else
+        xnew = _xnew + offy;
+    #endif
+
+	__local %TYPE sXData[ TARGET_WIDTH ]; // Each column is multiplied with a common x_vector element
+
+	size_t gIdx = get_global_id(0);
+	size_t bIdx = get_group_id(0);
+	size_t threadIdx = get_local_id(0);
+	int TARGET_ROWS  = %TARGET_ROWS;
+
+	// Last block always targets the top rows
+	// which may be less than or equal to 64
+	size_t nBlocks = (N-1)/ %TARGET_ROWS + 1;
+
+
+	if( bIdx == (nBlocks-1))
+	{
+		// Target row of xNew is given by threadIdx
+		size_t lastRow  = (N - (nBlocks -1) * %TARGET_ROWS) -1;
+
+		if( threadIdx > lastRow )
+		{
+			return;
+		}
+
+		//float acc = 0.0f;
+		%TYPE acc 	= %MAKEVEC( 0.0);
+		%TYPE accTemp 	= %MAKEVEC( 0.0);
+
+		for ( int j= 0 ; j < threadIdx; j++)
+		{
+			//acc += A(threadIdx, j) * x_vector[ j * incx];
+			accTemp = A(threadIdx, j);
+			%CONJUGATE(doConj, accTemp);
+			%MAD(acc, accTemp, x_vector[ j * incx]);
+		}
+
+		if ( isUnity )
+		{
+			#ifdef HEMV_ONLY
+            	%TYPE acc1, temp;
+                %MUL(acc1, acc, alpha);
+                temp = xnew[ threadIdx * incy];
+                %ADD(xnew[ threadIdx * incy], temp, acc1);
+            #else
+                %ADD(xnew[ threadIdx * incx] , acc, x_vector[ threadIdx * incx]);
+            #endif
+		}
+		else
+		{	//xnew[ threadIdx * incx] =  acc +  A(threadIdx, threadIdx) * x_vector[ threadIdx * incx];
+			accTemp = A(threadIdx, threadIdx);
+
+            #ifdef HEMV_ONLY
+                #ifndef SPMV_ONLY
+                    //accTemp.odd = 0.0f;
+                    %CLEAR_IMAGINARY( accTemp );
+                #endif
+            #endif
+
+			%CONJUGATE(doConj, accTemp);
+			%MAD(acc, accTemp, x_vector[ threadIdx * incx]);
+
+            #ifdef HEMV_ONLY
+                %TYPE temp, acc1;
+                %MUL(temp, xnew[ threadIdx * incy], beta);
+                %MUL(acc1, acc, alpha);
+	            %ADD(xnew[ threadIdx * incy], temp, acc1);
+            #else
+                xnew[ threadIdx * incx] = acc;
+            #endif
+		}
+	}
+	else
+	{
+		%TYPE sumTemp= %MAKEVEC( 0.0);
+		%TYPE%V sum = %VMAKEVEC( sumTemp);
+
+		// Variables that don't change while looping
+		size_t startRow = N - (bIdx + 1)* %TARGET_ROWS;
+		//size_t rowShift = ((threadIdx & ( TARGET_ROWS_BY_VEC -1 )) * %V);
+		size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC  )) * %V);
+		size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
+
+		size_t row	= startRow + rowShift;
+
+		// gIdx is not destination row.
+		size_t desRow  = startRow + threadIdx;
+
+		// startRow may be less than 4
+		// So nLoops will be negative
+		// and the FOR loop doesn't execute
+		int nLoops = ( startRow / TARGET_WIDTH) - 1;
+
+		for( int j=0; j <= (nLoops); j++)
+		{
+			size_t startCol	= j * TARGET_WIDTH;
+			size_t col 	= startCol + colShift;
+
+			//
+			// Only TARGET_WIDTH threads points are to be read from X-vector
+			// We dont't use VLOAD here because incx could be > 1
+			// Minimal prototyping shows that having separate loading code
+			// for incx value of 1 does not change anything in performance
+			// In fact, the extra IF costs us.
+			//
+			barrier(CLK_LOCAL_MEM_FENCE);
+			if (threadIdx < TARGET_WIDTH)
+			{
+				sXData[threadIdx] = x_vector[(startCol + threadIdx) * incx];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			// TARGET_ROWS_BY_VEC way bank-conflict : May broadcast if TARGET_ROWS = BLOCKSIZE, which reduces occupancy
+			// And we loose performance as we don't have enough blocks to hide memory access and compute latenties per MP
+			%TYPE xData =  sXData[colShift];
+
+			//sum += vload4(0, &A( row, col)) * ((float4)( xData, xData, xData, xData));
+			// ((float4)( xData, xData, xData, xData));
+			%TYPE%V loadedA = %VLOAD(0, (&A( row, col)));
+			%CONJUGATE(doConj, loadedA);
+
+			%TYPE%V xDataTemp = %VMAKEVEC(xData);
+			%VMAD(sum, loadedA, xDataTemp);
+		}
+
+
+		__local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
+		__local %TYPE* sData = sDataTemp;
+		//sDataTemp[(threadIdx & ( TARGET_ROWS_BY_VEC -1 )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
+		sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC  )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		// Reduce each block by DTARGET_ROWS threads to generate DTARGET_ROWS acc values
+		if ( threadIdx < %TARGET_ROWS)
+		{
+			//float acc = 0.0f;
+			%TYPE acc 	= %MAKEVEC( 0.0);
+			%TYPE accTemp 	= %MAKEVEC( 0.0);
+
+			//#pragma unroll TARGET_WIDTH
+			for( int j=0; j < TARGET_WIDTH; j++)
+			{
+				//acc += sData[ threadIdx + j * FTARGET_ROWS];
+				%ADD(acc, acc, sData[ threadIdx + j * TARGET_ROWS]);
+			}
+
+			for ( int j= ((nLoops+1)* TARGET_WIDTH) ; j < desRow; j++)
+			{
+				//acc += A(desRow, j) * x_vector[ j * incx];
+				accTemp = A(desRow, j);
+				%CONJUGATE(doConj, accTemp);
+				%MAD(acc, accTemp, x_vector[ j * incx]);
+			}
+
+			if ( isUnity )
+			{
+				 #ifdef HEMV_ONLY
+                    %TYPE acc1, temp;
+                    %MUL(acc1, acc, alpha);
+                    temp = xnew[ desRow * incy];
+                    %ADD(xnew[ desRow * incy], temp, acc1);
+                #else
+                    %ADD(xnew[ desRow * incx] , acc, x_vector[ desRow * incx]);
+                #endif
+			}
+			else
+			{
+				// xnew[ desRow * incx] =  acc + A(desRow, desRow) * x_vector[ desRow * incx];
+				accTemp = A(desRow, desRow);
+
+                #ifdef HEMV_ONLY
+                    #ifndef SPMV_ONLY
+                        //accTemp.odd = 0.0f;
+                        %CLEAR_IMAGINARY( accTemp );
+                    #endif
+                #endif
+
+				%CONJUGATE(doConj, accTemp);
+				%MAD(acc, accTemp, x_vector[ desRow * incx]);
+
+                #ifdef HEMV_ONLY
+                    %TYPE temp, acc1;
+                    %MUL(temp, xnew[ desRow * incy], beta);
+                    %MUL(acc1, acc, alpha);
+	               	%ADD(xnew[ desRow * incy], temp, acc1);
+                #else
+                    xnew[ desRow * incx] = acc;
+                #endif
+			}
+		}
+	}
+}";
+
+// Column-Major Lower Transpose
+// Threads : %PREFIXBLOCKSIZET, Blocks launched = (N -1) / %PREFIXTARGET_ROWST + 1
+//
+// Kernel template: each work-group owns TARGET_ROWS consecutive columns of A
+// starting at blockIdx * TARGET_ROWS. For a column c the result is the sum of
+// A(i, c) * x_vector[i * incx] over the rows i > c, plus the diagonal
+// contribution, stored in xnew at index c.
+//  - Vector phase: row panels of TARGET_WIDTH are walked from N - TARGET_WIDTH
+//    towards row 0 for as long as the panel's first row lies below the last
+//    column owned by the work-group. x values are staged in local memory
+//    (xShared) and A is read with vector loads.
+//  - Reduction: when at least one panel was processed (startRow != N) the
+//    per-work-item vector sums go through local memory (sDataTemp) and the
+//    first TARGET_ROWS work-items add them up.
+//  - Scalar phase: the same work-items handle the rows from startRow - 1 down
+//    to destRow + 1 and then the diagonal.
+//
+// isUnity non-zero: the diagonal of A is not read and x_vector[c * incx] is
+// added instead. With HEMV_ONLY defined xnew is addressed with offy/incy and
+// the stored value is beta * xnew + alpha * sum (xnew + alpha * sum when
+// isUnity is set). A negative incx/incy moves the base pointer forward by
+// (N - 1) * abs(inc) before indexing.
+/*
+#define %PREFIXVECTOR_SIZET %V
+#define %PREFIXTARGET_WIDTH_BY_VECT ( %PREFIXBLOCKSIZET / %PREFIXTARGET_ROWST )
+#define %PREFIXTARGET_WIDTHT ( %PREFIXTARGET_WIDTH_BY_VECT * %PREFIXVECTOR_SIZET )
+*/
+
+static const char *trmv_CLT_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#ifdef PACKED
+	#define A( row, col) (*( A + ((( col *((2*N) + 1 - col)) / 2) + (row - col))))
+#else
+	#define A( row, col) A[ row + col * lda]
+#endif
+
+#define TARGET_WIDTH_BY_VEC ((%BLOCKSIZE) / (%TARGET_ROWS) )
+#define TARGET_WIDTH ((TARGET_WIDTH_BY_VEC) * (%V))
+__kernel void %PREFIXtrmv_CLT_kernel( __global %TYPE const* restrict _A, __global %TYPE * _xnew, __global %TYPE const* restrict _x_vector,
+									  uint N, int incx, int isUnity, uint lda, int doConj, uint offa, uint offx
+#ifdef HEMV_ONLY
+, int incy, uint offy, %TYPE alpha, %TYPE beta
+#endif
+ )
+{
+	__global %TYPE* x_vector;
+	__global %TYPE* xnew;
+	__global %TYPE const * restrict A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		 #ifdef HEMV_ONLY
+            x_vector = _x_vector + offx + ( N - 1) * abs(incx);
+        #else
+            x_vector = _x_vector + ( N - 1) * abs(incx);
+            xnew     = _xnew + offx + ( N - 1) * abs(incx);
+        #endif
+	}
+	else
+	{
+		#ifdef HEMV_ONLY
+            x_vector = _x_vector + offx;
+        #else
+            x_vector = _x_vector;
+            xnew     = _xnew + offx;
+        #endif
+	}
+
+
+    #ifdef HEMV_ONLY
+    if(incy < 0)
+        xnew  = _xnew + offy + ( N - 1) * abs(incy);
+    else
+        xnew = _xnew + offy;
+    #endif
+
+	int gIdx 	= get_global_id(0);
+	int blockIdx	= get_group_id(0);
+	int blockDim  	= get_local_size(0);
+	int threadIdx 	= get_local_id(0);
+
+	__local %TYPE xShared[TARGET_WIDTH];
+
+	int startCol  	= blockIdx * %TARGET_ROWS;
+
+	%TYPE accTemp= %INIT( 0.0);
+	%TYPE%V acc  = %VMAKEVEC( accTemp);
+
+	//size_t rowShift = ((threadIdx & ( TARGET_WIDTH_BY_VEC -1 )) * %V);
+	size_t rowShift = ((threadIdx % ( TARGET_WIDTH_BY_VEC  )) * %V);
+	size_t colShift = threadIdx / TARGET_WIDTH_BY_VEC;
+	size_t col	= startCol + colShift;
+	int startRow;
+
+	for( startRow = (N - TARGET_WIDTH); ( startCol + %TARGET_ROWS - 1 ) < startRow; startRow = (startRow - TARGET_WIDTH))
+	{
+		// Load X data into Shared memory
+		barrier(CLK_LOCAL_MEM_FENCE);
+		if (threadIdx < TARGET_WIDTH)
+		{
+			xShared[threadIdx] = x_vector[ (startRow + threadIdx) * incx];
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		//float4 xData = (float4)(xShared[ rowShift ], xShared[ rowShift + 1], xShared[ rowShift + 2], xShared[ rowShift + 3]);
+		%TYPE%V xData;
+		__local %TYPE%V* xSharedTemp = (xShared + rowShift);
+		xData = *(xSharedTemp);
+
+		int row	= startRow + rowShift;
+		//acc   	+= vload4(0, &A(row, col)) * xData;
+		%TYPE%V loadedA = %VLOAD( 0, (&A(row, col)) );
+		%CONJUGATE(doConj, loadedA);
+		%VMAD(acc, loadedA, xData);
+	}
+	// Restore startRow
+	startRow += TARGET_WIDTH;
+
+	__local %TYPE%V sDataTemp[TARGET_WIDTH_BY_VEC * %TARGET_ROWS];
+	__local %TYPE* sData = sDataTemp;
+
+	// blocks that did vectorLoads
+	bool vectorBlocks = ( startRow != N);
+	if ( vectorBlocks )
+	{
+
+		//sDataTemp[ ( threadIdx & ( TARGET_WIDTH_BY_VEC -1 ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
+		sDataTemp[ ( threadIdx % ( TARGET_WIDTH_BY_VEC  ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+	}
+
+	%TYPE sum 	= %MAKEVEC( 0.0);
+	%TYPE loadedA 	= %MAKEVEC( 0.0);
+
+	if( threadIdx < %TARGET_ROWS && ( (startCol + threadIdx) < N))
+	{
+		if ( vectorBlocks )
+		{
+			//#pragma unroll	TARGET_WIDTH
+			for( int i=0 ; i < TARGET_WIDTH; i++)
+			{
+				%ADD(sum, sum, sData[i + (threadIdx * TARGET_WIDTH )]);
+			}
+
+		}
+
+		int destRow = blockIdx * %TARGET_ROWS + threadIdx;
+
+		// Loop from startRow - 1 till destRow
+		for( int i= ( startRow - 1); i > destRow; i--)
+		{
+			loadedA = A(i, destRow);
+			%CONJUGATE(doConj, loadedA);
+			%MAD(sum, loadedA, x_vector[ i * incx]);
+		}
+		if ( isUnity)
+		{
+			#ifdef HEMV_ONLY
+                %TYPE acc1, temp;
+                %MUL(acc1, sum, alpha);
+                temp = xnew[ destRow * incy];
+                %ADD(xnew[ destRow * incy], temp, acc1);
+            #else
+            	%ADD(xnew[ destRow * incx] , sum, x_vector[ destRow * incx]);
+            #endif
+		}
+		else
+		{
+			loadedA = A(destRow, destRow);
+
+            #ifdef HEMV_ONLY
+                #ifndef SPMV_ONLY
+            	    //loadedA.odd = 0.0f;
+                    %CLEAR_IMAGINARY( loadedA );
+                #endif
+            #endif
+
+			%CONJUGATE(doConj, loadedA);
+			%MAD(sum, loadedA, x_vector[ destRow * incx]);
+
+			#ifdef HEMV_ONLY
+				%TYPE temp, acc1;
+				%MUL(temp, xnew[ destRow * incy], beta);
+				%MUL(acc1, sum, alpha);
+				%ADD(xnew[ destRow * incy], temp, acc1);
+			#else
+				xnew[ destRow * incx] = sum;
+			#endif
+		}
+	}
+}";
+
+
+
+// Column-Major Upper Transpose
+// Threads : %PREFIXBLOCKSIZET, Blocks launched = (N -1) / %PREFIXTARGET_ROWST + 1
+//
+// Kernel template: each work-group owns TARGET_ROWS consecutive columns of A
+// starting at startCol = N - (blockIdx + 1) * TARGET_ROWS. For a column c the
+// result is the sum of A(i, c) * x_vector[i * incx] over the rows i < c, plus
+// the diagonal contribution, stored in xnew at index c.
+//  - Scalar path (TARGET_WIDTH - 1 >= startCol): the first TARGET_ROWS
+//    work-items whose destination index is >= 0 loop over all rows above the
+//    diagonal one element at a time.
+//  - Vector path (otherwise): row panels of TARGET_WIDTH are consumed from
+//    row 0 downwards until the next panel would reach startCol. x values are
+//    staged in local memory (xShared) and A is read with vector loads; the
+//    per-work-item vector sums are reduced through local memory (sDataTemp)
+//    and the first TARGET_ROWS work-items finish rows startRow .. c - 1 and
+//    the diagonal with scalar code.
+//
+// isUnity non-zero: the diagonal of A is not read and x_vector[c * incx] is
+// added instead. With HEMV_ONLY defined xnew is addressed with offy/incy and
+// the stored value is beta * xnew + alpha * sum (xnew + alpha * sum when
+// isUnity is set). A negative incx/incy moves the base pointer forward by
+// (N - 1) * abs(inc) before indexing.
+static const char *trmv_CUT_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#ifdef PACKED
+	#define A( row, col) (*( A + ((col*(col+1))/2 + row)))
+#else
+	#define A( row, col) A[ row + col * lda]
+#endif
+
+#define TARGET_WIDTH_BY_VEC ((%BLOCKSIZE) / (%TARGET_ROWS) )
+#define TARGET_WIDTH ((TARGET_WIDTH_BY_VEC) * (%V))
+
+__kernel void %PREFIXtrmv_CUT_kernel( __global %TYPE const* restrict _A, __global %TYPE * _xnew, __global  %TYPE const* restrict _x_vector,
+									  uint N, int incx, int isUnity, uint lda, int doConj, uint offa, uint offx
+#ifdef HEMV_ONLY
+, int incy, uint offy, %TYPE alpha, %TYPE beta
+#endif
+ )
+{
+	__global %TYPE* x_vector;
+	__global %TYPE* xnew;
+	__global %TYPE const* restrict A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		#ifdef HEMV_ONLY
+            x_vector = _x_vector + offx + ( N - 1) * abs(incx);
+        #else
+            x_vector = _x_vector + ( N - 1) * abs(incx);
+            xnew     = _xnew + offx + ( N - 1) * abs(incx);
+        #endif
+	}
+	else
+	{
+		#ifdef HEMV_ONLY
+            x_vector = _x_vector + offx;
+        #else
+            x_vector = _x_vector;
+            xnew     = _xnew + offx;
+        #endif
+	}
+
+
+    #ifdef HEMV_ONLY
+    if(incy < 0)
+        xnew  = _xnew + offy + ( N - 1) * abs(incy);
+    else
+        xnew = _xnew + offy;
+    #endif
+
+	int gIdx 	= get_global_id(0);
+	int blockIdx	= get_group_id(0);
+	int blockDim  	= get_local_size(0);
+	int threadIdx 	= get_local_id(0);
+
+	__local %TYPE xShared[TARGET_WIDTH];
+
+	int startRow  	= 0;
+	int startCol  	= N - (blockIdx + 1)* %TARGET_ROWS;
+
+	// Do scalar if this condition is true
+	if ( (startRow + TARGET_WIDTH - 1 ) >= startCol)
+	{
+		int destRow = (startCol + threadIdx) ;
+
+		if( (threadIdx < %TARGET_ROWS) && ( destRow >= 0))
+		{
+			%TYPE sum = %MAKEVEC(0.0);
+			%TYPE accTemp = %MAKEVEC(0.0);
+
+			// Loop from (startRow - 1) till destRow
+			for( int i= 0; i < destRow; i++)
+			{
+				accTemp = A(i, destRow);
+				%CONJUGATE(doConj, accTemp);
+				%MAD(sum, accTemp, x_vector[ i * incx]);
+			}
+			if ( isUnity)
+			{
+				#ifdef HEMV_ONLY
+              		%TYPE acc1, temp;
+                    %MUL(acc1, sum, alpha);
+                    temp = xnew[ destRow * incy];
+                    %ADD(xnew[ destRow * incy], temp, acc1);
+           		#else
+                	%ADD(xnew[ destRow * incx] , sum, x_vector[ destRow * incx]);
+            	#endif
+			}
+			else
+			{
+				accTemp = A(destRow, destRow);
+
+	            #ifdef HEMV_ONLY
+                    #ifndef SPMV_ONLY
+                	    //accTemp.odd = 0.0f;
+                        %CLEAR_IMAGINARY( accTemp );
+            	    #endif
+                #endif
+
+				%CONJUGATE(doConj, accTemp);
+				%MAD(sum, accTemp, x_vector[ destRow * incx]);
+
+    	        #ifdef HEMV_ONLY
+        	        %TYPE temp, acc1;
+	                %MUL(temp, xnew[ destRow * incy], beta);
+                	%MUL(acc1, sum, alpha);
+            	    %ADD(xnew[ destRow * incy], temp, acc1);
+        	    #else
+                	xnew[ destRow * incx] = sum;
+            	#endif
+			}
+		}
+	}
+	else
+	{
+		// float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+		%TYPE accTemp = %MAKEVEC( 0.0);
+		%TYPE%V acc   = %VMAKEVEC(accTemp);
+
+		//size_t rowShift = ((threadIdx & ( TARGET_WIDTH_BY_VEC -1 )) * %V);
+		size_t rowShift = ((threadIdx % ( TARGET_WIDTH_BY_VEC  )) * %V);
+		size_t colShift = threadIdx / TARGET_WIDTH_BY_VEC;
+
+		int col	     = startCol + colShift;
+
+		for( int i=0; ; i++)
+		{
+			// Load X data into Shared memory
+			barrier(CLK_LOCAL_MEM_FENCE);
+			if (threadIdx < TARGET_WIDTH)
+			{
+				xShared[threadIdx] = x_vector[ (startRow + threadIdx) * incx];
+			}
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			//float4 xData = (float4)(xShared[ rowShift ], xShared[ rowShift + 1], xShared[ rowShift + 2], xShared[ rowShift + 3]);
+			%TYPE%V xData;
+			__local %TYPE%V* xSharedTemp = (xShared + rowShift);
+			xData = *(xSharedTemp);
+
+			int row	= startRow + rowShift;
+			// acc   	+= vload4(0, &A(row,col)) * xData;
+			%TYPE%V loadedA = %VLOAD( 0, (&A(row, col)));
+			%CONJUGATE(doConj, loadedA);
+			%VMAD(acc, loadedA, xData);
+
+			startRow = startRow + TARGET_WIDTH;
+			if ( (startRow + TARGET_WIDTH - 1) >= startCol)
+			{
+				break;
+			}
+		}
+
+		//__local float4 sData[16][4];
+		//sData[(threadIdx & 15)][colShift] = acc;
+		//barrier(CLK_LOCAL_MEM_FENCE);
+		__local %TYPE%V sDataTemp[TARGET_WIDTH_BY_VEC * %TARGET_ROWS];
+		__local %TYPE* sData = sDataTemp;
+
+		//sDataTemp[ ( threadIdx & ( TARGET_WIDTH_BY_VEC -1 ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
+		sDataTemp[ ( threadIdx % ( TARGET_WIDTH_BY_VEC  ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		//acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
+		%TYPE sum 	= %MAKEVEC( 0.0);
+		%TYPE loadedA 	= %MAKEVEC( 0.0);
+
+
+		if( threadIdx < %TARGET_ROWS )
+		{
+			//#pragma unroll	TARGET_WIDTH
+			for( int i=0 ; i < TARGET_WIDTH; i++)
+			{
+				%ADD(sum, sum, sData[i + (threadIdx * TARGET_WIDTH )]);
+			}
+
+			int destRow = (startCol + threadIdx) ;
+
+			// Loop from startRow - 1 till destRow
+			for( int i= startRow; i < destRow; i++)
+			{
+				loadedA = A(i, destRow);
+				%CONJUGATE(doConj, loadedA);
+				%MAD(sum, loadedA, x_vector[ i * incx]);
+			}
+			if ( isUnity)
+			{
+				#ifdef HEMV_ONLY
+                    %TYPE acc1, temp;
+                    %MUL(acc1, sum, alpha);
+					temp = xnew[ destRow * incy];
+                    %ADD(xnew[ destRow * incy], temp, acc1);
+                #else
+                    %ADD(xnew[ destRow * incx] , sum, x_vector[ destRow * incx]);
+                #endif
+			}
+			else
+			{
+				loadedA = A(destRow, destRow);
+
+                #ifdef HEMV_ONLY
+                    #ifndef SPMV_ONLY
+                        //loadedA.odd = 0.0f;
+                        %CLEAR_IMAGINARY( loadedA );
+                    #endif
+                #endif
+
+				%CONJUGATE(doConj, loadedA);
+				%MAD(sum, loadedA, x_vector[ destRow * incx]);
+
+                #ifdef HEMV_ONLY
+                    %TYPE temp, acc1;
+                    %MUL(temp, xnew[ destRow * incy], beta);
+                    %MUL(acc1, sum, alpha);
+                    %ADD(xnew[ destRow * incy], temp, acc1);
+                #else
+                    xnew[ destRow * incx] = sum;
+                #endif
+			}
+		}
+	}
+}";
+
+
diff --git a/src/library/blas/gens/clTemplates/trsv.cl b/src/library/blas/gens/clTemplates/trsv.cl
new file mode 100644
index 0000000..56da9c8
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/trsv.cl
@@ -0,0 +1,437 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+// TRSV Column-Major Upper Kernel
+//
+// Kernel template that solves one diagonal triangle of the system in place in
+// xnew. Work-item t owns row startRow + t. The loop walks targetCol from
+// startCol downwards for (startCol - startRow) + 1 iterations:
+//  1. the work-item whose row equals targetCol combines its xnew entry with its
+//     running sum (SUB macro), divides by the diagonal element unless isUnity
+//     is set, and publishes the value through the local variable xShared and
+//     into xnew;
+//  2. after a barrier, every work-item whose row is above targetCol adds
+//     A(row, targetCol) * xShared to its running sum (MAD macro);
+//  3. a second barrier keeps the next iteration from overwriting xShared
+//     while it is still being read.
+//
+// A(row, col) is A[row + col * lda] after adding offa, the packed-upper index
+// formula when PACKED is defined, or A[row * lda + col] indexed with a
+// per-work-item band column when BANDED is defined. A negative incx moves the
+// base of xnew forward by (N - 1) * abs(incx) before indexing. doConj is
+// forwarded to the CONJUGATE template macro.
+//#include <TRSV.h>
+
+
+const char * trsv_CU_SolveTriangle_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+
+#ifdef PACKED
+    #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
+#elif defined(BANDED)
+    #define A( row, col) A[ (row) * lda + (col)]
+#else
+    #define A( row, col) A[ (row) + (col) * lda]
+#endif
+// Only one workgroup of threads launched
+__kernel void %PREFIXtrsv_CU_SolveTriangle_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity,
+                                                   uint lda, int doConj, int startRow, int startCol, uint offa, uint offx
+#ifdef BANDED
+                                                   , uint KU
+#endif
+                                                 )
+{
+    __global %TYPE* xnew;
+    __global %TYPE const * restrict A = _A + offa;
+
+    if ( incx < 0 ) // Goto end of vector
+    {
+        xnew     = _xnew + offx + ( N - 1) * abs(incx);
+    }
+    else
+    {
+        xnew     = _xnew + offx;
+    }
+
+    __local %TYPE  xShared; // To share solved x value with other threads..
+
+    size_t gIdx     = get_global_id(0);
+    size_t bIdx     = get_group_id(0);
+    size_t threadIdx= get_local_id(0);
+
+    %TYPE sum     = %MAKEVEC(0.0);
+    %TYPE xVal    = %MAKEVEC(0.0);
+    %TYPE loadedA     = %MAKEVEC(0.0);
+
+    int targetCol     = startCol;
+    int targetRow     = startRow + threadIdx;
+    int loops     = (startCol - startRow) + 1;
+
+#ifdef BANDED
+    int bandCol = (loops - 1) - threadIdx;
+#endif
+
+    for( int i=0; i < loops; i++)
+    {
+        if ( targetRow == targetCol)
+        {
+            xVal = xnew[ targetRow * incx];
+            %SUB(sum, xVal, sum);
+
+            if( isUnity)
+            {
+                xShared = sum;
+            }
+            else // Handle diagonal element
+            {
+#ifdef BANDED
+                loadedA = A((targetRow), (bandCol));
+#else
+                loadedA = A((targetRow), (targetCol));
+#endif
+                %CONJUGATE(doConj, loadedA);
+                %DIV(xShared, sum, loadedA);
+            }
+            xnew[ targetRow * incx ] = xShared;
+        }
+        // Sync so that xShared it available to all threads
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if ( targetRow < targetCol)
+        {
+#ifdef BANDED
+            loadedA = A((targetRow), (bandCol));
+#else
+            loadedA = A((targetRow), (targetCol));
+#endif
+            %CONJUGATE(doConj, loadedA);
+            %MAD(sum, loadedA, xShared);
+        }
+
+        // Avoid Race...
+        barrier(CLK_LOCAL_MEM_FENCE);
+        targetCol--;
+#ifdef BANDED
+        bandCol--;
+#endif
+    }
+}";
+
+
+// TRSV Column-Major Lower Kernel
+//
+// Kernel template that solves one diagonal triangle of the system in place in
+// xnew. Work-item t owns row endRow - t. The loop walks targetCol from
+// startCol upwards for (endRow - startCol) + 1 iterations:
+//  1. the work-item whose row equals targetCol combines its xnew entry with its
+//     running sum (SUB macro), divides by the diagonal element unless isUnity
+//     is set, and publishes the value through the local variable xShared and
+//     into xnew;
+//  2. after a barrier, every work-item whose row is below targetCol adds
+//     A(row, targetCol) * xShared to its running sum (MAD macro);
+//  3. a second barrier keeps the next iteration from overwriting xShared
+//     while it is still being read.
+//
+// A(row, col) is A[row + col * lda] after adding offa, the packed-lower index
+// formula when PACKED is defined, or A[row * lda + col] indexed with a
+// per-work-item band column when BANDED is defined. A negative incx moves the
+// base of xnew forward by (N - 1) * abs(incx) before indexing. doConj is
+// forwarded to the CONJUGATE template macro.
+//
+// The matrix is only ever read, so the local pointer A keeps the const
+// qualifier of the _A argument.
+// NOTE(review): cl_amd_printf is enabled unconditionally although every printf
+// call below is commented out - confirm whether the pragma is still needed.
+const char * trsv_CL_SolveTriangle_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#ifdef PACKED
+    #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
+#elif defined(BANDED)
+    #define A(row, col) A[ (row) * lda + (col)]
+#else
+    #define A(row, col) A[ (row) + (col) * lda]
+#endif
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+// Only one block of threads launched
+__kernel void %PREFIXtrsv_CL_SolveTriangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity,
+                                                   uint lda, int doConj, int startCol, int endRow, uint offa, uint offx
+#ifdef BANDED
+                                                   , uint KL
+#endif
+                                                            )
+{
+    __global %TYPE* xnew;
+    __global const %TYPE* A = _A + offa;
+
+    if ( incx < 0 ) // Goto end of vector
+    {
+        xnew     = _xnew + offx + ( N - 1) * abs(incx);
+    }
+    else
+    {
+        xnew     = _xnew + offx;
+    }
+
+    __local %TYPE  xShared; // To share solved x value with other threads..
+
+    size_t threadIdx= get_local_id(0);
+
+    %TYPE sum     = %MAKEVEC(0.0);
+    %TYPE xVal    = %MAKEVEC(0.0);
+    %TYPE loadedA     = %MAKEVEC(0.0);
+
+    int targetCol     = startCol;
+    int targetRow     = endRow - threadIdx;
+    int loops     = (endRow - startCol) + 1;
+#ifdef BANDED
+    int bandCol = (KL + 1) - loops + threadIdx;
+#endif
+
+//    printf(\"%u : bandCol %d targetCol %d targetRow %d loops %d KL %d\\n\", threadIdx, bandCol, targetCol, targetRow, loops, KL);
+
+    for( int i=0; i < loops; i++)
+    {
+        if ( targetRow == targetCol)
+        {
+            xVal = xnew[ targetRow * incx];
+            //printf(\"Before1 %u : xShared %f, sum %f\\n\", threadIdx, xShared, sum);
+            %SUB(sum, xVal, sum);
+            //printf(\"Before2 %u : xShared %f, sum %f XvAL %f, targetRow %d\\n\", threadIdx, xShared, sum, xVal, targetRow);
+
+            if( isUnity)
+            {
+                xShared = sum;
+            }
+            else // Handle diagonal element
+            {
+#ifndef BANDED
+                loadedA = A((targetRow), (targetCol));
+#else
+                loadedA = A((targetRow), (bandCol));
+#endif
+                %CONJUGATE(doConj, loadedA);
+                %DIV(xShared, sum, loadedA);
+            }
+            //printf(\"After %u : xShared %f, sum %f\\n\", threadIdx, xShared, sum);
+            xnew[ targetRow * incx ] = xShared;
+        }
+        // Sync so that xShared it available to all threads
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        if ( targetRow > targetCol)
+        {
+#ifndef BANDED
+                loadedA = A((targetRow), (targetCol));
+#else
+                loadedA = A((targetRow), (bandCol));
+#endif
+            %CONJUGATE(doConj, loadedA);
+            //printf(\"%u : xShared %f, sum %f loadedA %f\\n\", threadIdx, xShared, sum, loadedA);
+            %MAD(sum, loadedA, xShared);
+        }
+
+        // Avoid Race...
+        barrier(CLK_LOCAL_MEM_FENCE);
+        targetCol++;
+#ifdef BANDED
+        bandCol++;
+#endif
+    }
+}
+";
+
+// TRSV Column-Major Upper Transpose Kernel
+//
+// Kernel template that solves one diagonal triangle in place in xnew, using
+// the local array saccShared (TRIANGLE_HEIGHT entries) as working storage.
+//  - The xnew entries for rows startRow .. endRow - 1 (at most TRIANGLE_HEIGHT
+//    of them) are copied into saccShared.
+//  - Work-item t owns row startRow + t and pre-loads its diagonal element.
+//  - In step i (0 .. TRIANGLE_HEIGHT - 1) every active work-item with
+//    t >= i > 0 subtracts A(startRow + i - 1, row) * saccShared[i - 1] from its
+//    own entry; then work-item i divides its entry by the diagonal unless
+//    isUnity is set. A barrier closes each step.
+//  - Finally each active work-item writes its entry back to xnew.
+//
+// NOTE(review): saccShared is indexed with the local id, so the local size is
+// presumably expected not to exceed TRIANGLE_HEIGHT - confirm against the host
+// launch code.
+// A negative incx moves the base of xnew forward by (N - 1) * abs(incx) before
+// indexing. doConj is forwarded to the CONJUGATE template macro.
+const char * trsv_CUT_SolveTriangle_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#ifdef PACKED
+    #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
+#elif defined(BANDED)
+    #define A(row, col) A[ (row) * lda + (col)]
+#else
+    #define A( row, col) A[ (row) + (col) * lda]
+#endif
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+__kernel void %PREFIXtrsv_CUT_SolveTriangle_kernel(     __global const %TYPE* _A,
+                                                __global %TYPE* _xnew,
+                                                uint N,
+                                                int incx,
+                                                int isUnity,
+                                                uint lda,
+                                                int doConj,
+                                                int startRow, int endRow, uint offa, uint offx
+#ifdef BANDED
+                                                , uint KU
+#endif
+                                                         )
+{
+        __global %TYPE* xnew;
+        __global const %TYPE* A = _A + offa;
+        if ( incx < 0 ) // Goto end of vector
+        {
+            xnew     = _xnew + offx  + ( N - 1) * abs(incx);
+        }
+        else
+        {
+            xnew     = _xnew + offx;
+        }
+
+        int blockSize = get_local_size(0);
+        int threadID = get_local_id(0);
+        int targetRow;
+#ifdef BANDED
+        int bandRow = startRow;
+        int bandCol = threadID;
+//        printf(\"threadID %d, bandRow %d bandCol %d\\n\",threadID, bandRow, bandCol);
+#endif
+        __local volatile %TYPE saccShared[%TRIANGLE_HEIGHT];
+
+        targetRow = startRow + threadID;
+        //#pragma unroll
+        for( int idx = threadID; (idx < %TRIANGLE_HEIGHT) && ((startRow + idx) < endRow); idx += blockSize)
+        {
+            saccShared[idx] = xnew[ (startRow + idx) * incx];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+        %TYPE diagA = %INIT(0.0);
+        if(targetRow < endRow)
+        {
+#ifndef BANDED
+            diagA = A((targetRow), (targetRow));
+#else
+            diagA = A((startRow + threadID), (0));
+#endif
+            %CONJUGATE(doConj, diagA);
+        }
+        %TYPE tempA, tempS;
+        for(int i = 0; i < %TRIANGLE_HEIGHT; i++)
+        {
+            if((i <= threadID) && (i > 0) && (targetRow < endRow))
+            {
+#ifndef BANDED
+                tempA = A((startRow + i - 1), (targetRow));
+#else
+                tempA = A((bandRow - 1), (bandCol + 1));
+  //              printf(\"threadID %d, bandRow %d bandCol %d A %f\\n\",threadID, bandRow, bandCol, tempA);
+#endif
+                %CONJUGATE(doConj, tempA);
+                %MUL(tempS, tempA, saccShared[i-1]);
+                %SUB(saccShared[threadID], saccShared[threadID], tempS);
+            }
+            if((i == threadID) && (targetRow < endRow) && (!isUnity))
+            {
+                tempS = saccShared[threadID];
+    //            printf(\"threadID %d, saccShared %f, diagA %f\\n\", threadID, tempS, diagA);
+                %DIV(saccShared[threadID], tempS, diagA);
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef BANDED
+            bandRow++; bandCol--;
+#endif
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(targetRow < endRow)
+        {
+            xnew[(targetRow * incx)] = saccShared[threadID];
+        }
+}
+";
+
+// TRSV Column-Major Lower Transpose Kernel
+//
+// Kernel template that solves one diagonal triangle in place in xnew, using
+// the local array saccShared (TRIANGLE_HEIGHT entries) as working storage.
+//  - The xnew entries for rows endRow - 1 down to startRow (at most
+//    TRIANGLE_HEIGHT of them) are copied into saccShared, so entry k holds
+//    row (endRow - 1) - k.
+//  - Work-item t owns row (endRow - 1) - t and pre-loads its diagonal element.
+//  - The loop walks i from endRow - 1 down to startRow: the work-item owning
+//    row i divides its entry by the diagonal unless isUnity is set; after a
+//    barrier every active work-item with a smaller row subtracts
+//    A(i, row) * saccShared[(endRow - 1) - i] from its own entry; a second
+//    barrier closes the step.
+//  - Finally each active work-item writes its entry back to xnew.
+//
+// NOTE(review): saccShared is indexed with the local id, so the local size is
+// presumably expected not to exceed TRIANGLE_HEIGHT - confirm against the host
+// launch code.
+// A negative incx moves the base of xnew forward by (N - 1) * abs(incx) before
+// indexing. doConj is forwarded to the CONJUGATE template macro.
+const char * trsv_CLT_SolveTriangle_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#ifdef PACKED
+    #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
+#elif defined(BANDED)
+    #define A(row, col) A[ (row) * lda + (col)]
+#else
+    #define A(row, col) A[ (row) + (col) * lda]
+#endif
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+// Column-Major Lower Non-Unity case
+// StartRow points to actual Row to start from( absolute Column number)
+// endRow points to actual Row to stop + 1( absolute Column number)
+__kernel void %PREFIXtrsv_CLT_SolveTriangle_kernel(     __global const %TYPE* _A,
+                                                __global %TYPE* _xnew,
+                                                uint N,
+                                                int incx,
+                                                int isUnity,
+                                                uint lda,
+                                                int doConj,
+                                                int startRow, int endRow, uint offa, uint offx
+#ifdef BANDED
+                                                ,uint KL
+#endif
+                    )
+{
+        __global %TYPE* xnew;
+        __global const %TYPE *A = _A + offa;
+        if ( incx < 0 ) // Goto end of vector
+        {
+            xnew     = _xnew  + offx + ( N - 1) * abs(incx);
+        }
+        else
+        {
+            xnew     = _xnew + offx;
+        }
+
+        int blockSize = get_local_size(0);
+        int threadID = get_local_id(0);
+        __local volatile %TYPE saccShared[%TRIANGLE_HEIGHT];
+        int targetRow;
+        targetRow = (endRow - 1) - threadID;
+
+#ifdef BANDED
+        int bandRow = (endRow - 1);
+        int bandCol = (KL) - threadID;
+#endif
+
+        //#pragma unroll
+        for( int idx = threadID; (idx < %TRIANGLE_HEIGHT) && (((endRow - 1) - idx) >= startRow); idx += blockSize)
+        {
+            saccShared[idx] = xnew[((endRow - 1) - idx) * incx];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        %TYPE diagA = %INIT(0.0);
+        if(targetRow >= startRow)
+        {
+#ifndef BANDED
+            diagA = A((targetRow), (targetRow));
+#else
+            diagA = A((bandRow - threadID), (KL));
+//            printf(\"ThreadID %d, bandRow %d bandCol %d\\n\", threadID, bandRow, bandCol);
+#endif
+            %CONJUGATE(doConj, diagA);
+        }
+        %TYPE tempA, tempS;
+
+        for( int i = (endRow - 1); i >= startRow; i--)
+        {
+            if((targetRow == i) && (!isUnity))
+            {
+                tempS = saccShared[threadID];
+                %DIV(saccShared[threadID], tempS, diagA);
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if((targetRow < i) && (targetRow >= startRow))
+            {
+#ifndef BANDED
+                tempA = A((i), (targetRow));
+#else
+                tempA = A((bandRow), (bandCol));
+#endif
+                %CONJUGATE(doConj, tempA);
+                %MUL(tempS, tempA, saccShared[(endRow - 1) - i]);
+                %SUB(saccShared[threadID], saccShared[threadID], tempS);
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef BANDED
+            bandRow--; bandCol++;
+#endif
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(targetRow >= startRow)
+        {
+            xnew[(targetRow * incx)] = saccShared[threadID];
+        }
+}
+";
+
diff --git a/src/library/blas/gens/clTemplates/trsv_gemv.cl b/src/library/blas/gens/clTemplates/trsv_gemv.cl
new file mode 100644
index 0000000..f1f6cd4
--- /dev/null
+++ b/src/library/blas/gens/clTemplates/trsv_gemv.cl
@@ -0,0 +1,1487 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+// Compute Rectangle + Triangle
+
+// trsv_CU_ComputeRectangle_kernel
+//
+// OpenCL kernel source kept as a string. The percent-prefixed tokens (TYPE, V,
+// TARGET_ROWS, TARGET_WIDTH, NLOOPS, PREFIX and the arithmetic helpers such as
+// MAD, SUB, ADD, VLOAD) are not OpenCL; they appear to be template placeholders
+// that are substituted before the kernel is compiled.
+//
+// What the kernel body does, per work-group (get_group_id(0)):
+//   * nBlocks = (rowsLeft - 1) / TARGET_ROWS + 1 work-groups are expected.
+//   * Group 0 works one row per work-item. It accumulates A(row, col) * x[col]
+//     over TARGET_ROWS columns walking down from startCol. If nBlocks > 1 the
+//     accumulated sum is subtracted from xnew[row]. If nBlocks == 1 it goes on
+//     to solve the remaining triangle column by column, sharing each solved
+//     value through local memory.
+//   * Every other group accumulates V rows per work-item over NLOOPS chunks of
+//     TARGET_WIDTH columns, stores the partial sums in local memory and reduces
+//     them. The last group (bIdx == nBlocks - 1) then solves its triangle in
+//     V x V tiles; the remaining groups subtract the reduced sum from xnew.
+//
+// This variant never divides by a diagonal element of A and never reads the
+// isUnity argument. When incx is negative, xnew is advanced to element
+// (N - 1) * abs(incx) so that xnew[i * incx] indexes backwards from there.
+// With PACKED defined, A(row, col) addresses column-packed upper storage;
+// otherwise it addresses column-major storage with leading dimension lda.
+const char * trsv_CU_ComputeRectangle_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
+#ifdef PACKED
+    #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
+#else
+    #define A( row, col) A[ (row) + (col) * lda]
+#endif
+
+__kernel void %PREFIXtrsv_CU_ComputeRectangle_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx)
+{
+	__global %TYPE* xnew;
+	__global %TYPE* A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		xnew	 = _xnew + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		xnew 	= _xnew + offx;
+	}
+
+	size_t bIdx 	= get_group_id(0);
+	size_t threadIdx= get_local_id(0);
+
+	// Get total blocks launched
+	size_t nBlocks  = ((rowsLeft - 1) / %TARGET_ROWS) + 1;
+
+	%TYPE sum 	= %MAKEVEC( 0.0);
+	%TYPE loadedA 	= %MAKEVEC( 0.0);
+
+	// First Block does scalar stuff...
+	// Only this gets executed if nBlocks == 1
+	if ( bIdx == 0)
+	{
+		int targetCol 	= startCol;
+		int targetRow 	= threadIdx;
+		// Index of the last row owned by this block
+		int lastRow	= rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1;
+
+		if ( nBlocks > 1)
+		{
+			if ( targetRow <= lastRow)
+			{
+				for( int i=0; i < %TARGET_ROWS; i++)
+				{
+					// All threads look at same xnew
+					// Should use Shared Memory ..
+					%TYPE xVal =  xnew[ targetCol * incx];
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xVal);
+					targetCol--;
+				}
+
+				%SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum);
+			}
+		}
+		else // Solve the triangle -- no more kernel launches required
+		{
+			if ( targetRow <= lastRow)
+			{
+				for( int i=0; i < %TARGET_ROWS; i++)
+				{
+					// All threads look at same xnew
+					// Should use Shared Memory ..
+					%TYPE xVal =  xnew[ targetCol * incx];
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xVal);
+					targetCol--;
+				}
+			}
+
+			// Change targetCol to point to Triangle last column for all threads
+			// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
+			targetCol 	= startCol - %TARGET_ROWS;
+
+			__local %TYPE  xShared; // To share solved x value with other threads..
+
+			// One column of the triangle is retired per iteration, walking towards column 0
+			for( int i=0; i < (lastRow + 1); i++)
+			{
+				// The thread on the diagonal finalises its x value (no division in this variant)
+				if ( targetRow == targetCol)
+				{
+					%TYPE xVal = xnew[ targetRow * incx];
+					%SUB(sum, xVal, sum);
+					xShared = sum;
+					xnew[ targetRow * incx ] = xShared;
+				}
+
+				barrier(CLK_LOCAL_MEM_FENCE);
+
+				// Rows above the diagonal fold the freshly solved value into their sum
+				if (  targetRow < targetCol)
+				{
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xShared);
+				}
+
+				// Avoid Race
+				barrier(CLK_LOCAL_MEM_FENCE);
+				targetCol--;
+			}
+		}
+	}
+	else
+	{
+		// Work-item layout: rowShift selects a group of V rows, colShift selects a column inside a chunk
+		size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC  )) * %V);
+		size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
+
+		int rowStart 	= rowsLeft  - ( %TARGET_ROWS * (nBlocks - bIdx) );
+		int row		= rowStart + rowShift;
+
+		%TYPE   sumTemp = %MAKEVEC(0.0);
+		%TYPE%V sum	= %VMAKEVEC(sumTemp);
+
+		__local %TYPE xData[ %TARGET_WIDTH];
+
+		//#pragma unroll
+		for( int i=1; i <= %NLOOPS; i++)
+		{
+			// Put startCol to start of BLOCKSIZE Block
+			int startColp	= startCol - (%TARGET_WIDTH * i) + 1;
+
+			// Stage this chunk of x in local memory
+			if ( threadIdx < %TARGET_WIDTH)
+			{
+				xData[threadIdx] = xnew[ (startColp + threadIdx) * incx];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			int col 	= startColp + colShift;
+
+			%TYPE xDataVal	= xData[ colShift ];
+			%TYPE%V xDataVec= %VMAKEVEC( xDataVal);
+
+			%TYPE%V loadedA  = %VLOAD( 0, &A((row), (col)));
+			%CONJUGATE(doConj, loadedA);
+			%VMAD(sum, loadedA, xDataVec);
+			// xData is overwritten by the next iteration, so wait for all readers
+			barrier(CLK_LOCAL_MEM_FENCE);
+		}
+
+		// Partial sums, one slot per (row group, column) pair, reduced over columns below
+		__local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+		//__local %TYPE* sData = sDataTemp;
+		sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		//int TARGET_ROWS		= %TARGET_ROWS;
+
+		// Last Block
+		// Do the vector reduction for the last block
+		// Followed by solving the triangle
+		if ( bIdx == ( nBlocks - 1))
+		{
+			%TYPE sumTemp 	    = %MAKEVEC(0.0);
+			%TYPE%V sumVec      = %VMAKEVEC(sumTemp);
+		        %TYPE%V loadedAVec  = %VMAKEVEC(sumTemp);
+
+			//int targetRow = rowStart + threadIdx;
+			int targetCol = startCol- %TARGET_ROWS; // Col where triangle last col overlaps
+
+			// Do vector reduction
+			if ( threadIdx <  TARGET_ROWS_BY_VEC )
+			{
+				//#pragma unroll
+				for( int j=0; j < %TARGET_WIDTH; j++)
+				{
+					%ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
+				}
+			}
+
+			__local %TYPE xShared[%V];
+
+			int targetRowTemp = rowStart + threadIdx * %V;
+			int VECTOR_SIZE   = %V;
+
+			// Each iteration retires one V x V diagonal tile, starting from the bottom-most row group
+			//#pragma unroll
+			for( int i=0; i < (TARGET_ROWS_BY_VEC); i++)
+			{
+				if ( threadIdx == (TARGET_ROWS_BY_VEC - 1 - i))
+			    {
+					// Read X-vector
+					%TYPE xVal[%V];
+					//#pragma unroll
+					for( int j = 0; j < %V; j++)
+					{
+						xVal[j] = xnew[ (targetRowTemp + j)* incx];
+					}
+
+					// Read A %Vx%V region into reg
+					%TYPE reg[%V][%V];
+					//#pragma unroll
+					for( int idx = 0; idx < ( %V * %V); idx++)
+					{
+						int m = idx / ( %V ); // Row : Row-Major idx...
+						int n = idx % ( %V );    // Col
+						// Only the strictly upper part of the tile is loaded
+						if ( n > m )
+						{
+							reg[m][n] = A( (targetRowTemp + m), (targetCol -( %V - 1 - n)));
+							%CONJUGATE(doConj, reg[m][n]);
+						}
+					}
+
+					%TYPE sumVecReg[%V];
+					%VSTOREWITHINCX(sumVecReg, sumVec, 1);
+
+					// Solve for first x - Do the rest in loop
+					%TYPE x[%V];
+					%SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]);
+					xShared[%V - 1] = x[%V - 1];
+					xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1];
+
+					//#pragma unroll
+					for(int m= ( %V - 2); m >=0; m--)
+					{
+						%SUB(x[m], xVal[m], sumVecReg[m]);
+					}
+
+					// Back-substitution inside the tile; idx 0 is the (0,0) diagonal entry and is skipped
+					//#pragma unroll
+					for( int idx = (( ( %V * %V) - 1) - %V); idx > 0; idx--)
+					{
+						int m = idx / %V;       // Row : Row-Major idx, x[3] is solved before x[2]
+						int n = idx % ( %V );// Col
+						if ( n > m)
+						{
+							//x[m] = x[m] - reg[m][n] * x[n];
+							%MAD(x[m], reg[m][n], (-x[n]));
+						}
+					}
+
+					// Store results
+					//#pragma unroll
+					for(int m = 0; m < %V; m++)
+					{
+						xShared[m] = x[m];
+						xnew[ (targetRowTemp + m)* incx ] = x[m];
+					}
+			    }
+
+
+			    // Sync so that xShared is available to all threads
+			    barrier(CLK_LOCAL_MEM_FENCE);
+
+			    // Row groups above the solved tile fold the V new x values into their sums
+			    if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i))
+				{
+						//#pragma unroll
+						for( int j=0; j < %V; j++)
+						{
+							//sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j];
+							%TYPE%V loadedAVec  = %VLOAD( 0, &A((targetRowTemp), (targetCol -j)));
+							%CONJUGATE(doConj, loadedAVec);
+							%VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]);
+						}
+				}
+
+				targetCol = targetCol - %V;
+			        // Avoid Race...
+			    barrier(CLK_LOCAL_MEM_FENCE);
+			}
+		}
+		else
+		{
+			// Do Vector Reduction on each block except the last Block
+			if ( threadIdx < TARGET_ROWS_BY_VEC)
+			{
+				%TYPE   accTemp = %MAKEVEC(0.0);
+				%TYPE%V acc 	= %VMAKEVEC(accTemp);
+
+				//#pragma unroll
+				for( int j=0; j < %TARGET_WIDTH; j++)
+				{
+					%ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
+				}
+
+				// Store the result
+				int targetRow = rowStart + threadIdx * %V;
+
+				__global %TYPE* xNewPtr =  xnew + targetRow * incx;
+				//float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]);
+				%TYPE%V value;
+				%VLOADWITHINCX(value, xNewPtr, incx);
+
+				// Compute result
+				%SUB(value, value, acc);
+
+				// Store results
+				//VSTOREWITHINCX( xNewPtr, value, incx);
+				%VSTOREWITHINCX(xNewPtr, value, incx);
+			}
+		}
+	}
+}
+";
+
+// trsv_CU_ComputeRectangle_NonUnity_kernel
+//
+// OpenCL kernel source kept as a string. The percent-prefixed tokens (TYPE, V,
+// TARGET_ROWS, TARGET_WIDTH, NLOOPS, PREFIX and the arithmetic helpers such as
+// MAD, SUB, ADD, DIV, VLOAD) are not OpenCL; they appear to be template
+// placeholders that are substituted before the kernel is compiled.
+//
+// What the kernel body does, per work-group (get_group_id(0)):
+//   * nBlocks = (rowsLeft - 1) / TARGET_ROWS + 1 work-groups are expected.
+//   * Group 0 works one row per work-item. It accumulates A(row, col) * x[col]
+//     over TARGET_ROWS columns walking down from startCol. If nBlocks > 1 the
+//     accumulated sum is subtracted from xnew[row]. If nBlocks == 1 it goes on
+//     to solve the remaining triangle column by column, dividing by the
+//     (optionally conjugated) diagonal element and sharing each solved value
+//     through local memory.
+//   * Every other group accumulates V rows per work-item over NLOOPS chunks of
+//     TARGET_WIDTH columns, stores the partial sums in local memory and reduces
+//     them. The last group (bIdx == nBlocks - 1) then solves its triangle in
+//     V x V tiles, dividing by each diagonal entry; the remaining groups
+//     subtract the reduced sum from xnew.
+//
+// The isUnity argument is never read; the diagonal division is unconditional.
+// When incx is negative, xnew is advanced to element (N - 1) * abs(incx) so
+// that xnew[i * incx] indexes backwards from there. With PACKED defined,
+// A(row, col) addresses column-packed upper storage; otherwise it addresses
+// column-major storage with leading dimension lda.
+const char *trsv_CU_ComputeRectangle_NonUnity_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
+#ifdef PACKED
+    #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
+#else
+    #define A( row, col) A[ (row) + (col) * lda]
+#endif
+// Compute Rectangle + Triangle
+__kernel void %PREFIXtrsv_CU_ComputeRectangle_NonUnity_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx)
+{
+	__global %TYPE* xnew;
+	__global %TYPE* A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		xnew	 = _xnew + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		xnew 	= _xnew + offx;
+	}
+
+	size_t bIdx 	= get_group_id(0);
+	size_t threadIdx= get_local_id(0);
+
+	// Get total blocks launched
+	size_t nBlocks  = (rowsLeft - 1) / %TARGET_ROWS + 1;
+
+	%TYPE sum 	= %MAKEVEC( 0.0);
+	%TYPE loadedA 	= %MAKEVEC( 0.0);
+
+	// First Block does scalar stuff...
+	// Only this gets executed if nBlocks == 1
+	if ( bIdx == 0)
+	{
+		int targetCol 	= startCol;
+		int targetRow 	= threadIdx;
+		// Index of the last row owned by this block
+		int lastRow	= rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1;
+
+		if ( nBlocks > 1)
+		{
+			if ( targetRow <= lastRow)
+			{
+				for( int i=0; i < %TARGET_ROWS; i++)
+				{
+					// All threads look at same xnew
+					// Should use Shared Memory ..
+					%TYPE xVal =  xnew[ targetCol * incx];
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xVal);
+					targetCol--;
+				}
+
+				%SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum);
+			}
+		}
+		else // Solve the triangle -- no more kernel launches required
+		{
+			if ( targetRow <= lastRow)
+			{
+				for( int i=0; i < %TARGET_ROWS; i++)
+				{
+					// All threads look at same xnew
+					// Should use Shared Memory ..
+					%TYPE xVal =  xnew[ targetCol * incx];
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xVal);
+					targetCol--;
+				}
+			}
+
+			// Change targetCol to point to Triangle last column for all threads
+			// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
+			targetCol 	= startCol - %TARGET_ROWS;
+
+			__local %TYPE  xShared; // To share solved x value with other threads..
+
+			// One column of the triangle is retired per iteration, walking towards column 0
+			for( int i=0; i < (lastRow + 1); i++)
+			{
+				if ( targetRow == targetCol)
+				{
+					%TYPE xVal = xnew[ targetRow * incx];
+					sum  =  xVal -  sum;
+
+					// Handle diagonal element
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%DIV(xShared, sum, loadedA);
+
+					xnew[ targetRow * incx ] = xShared;
+				}
+
+				barrier(CLK_LOCAL_MEM_FENCE);
+
+				// Rows above the diagonal fold the freshly solved value into their sum
+				if (  targetRow < targetCol)
+				{
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xShared);
+				}
+
+				// Avoid Race
+				barrier(CLK_LOCAL_MEM_FENCE);
+				targetCol--;
+			}
+		}
+	}
+	else
+	{
+		// Work-item layout: rowShift selects a group of V rows, colShift selects a column inside a chunk
+		size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V);
+		size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
+
+		int rowStart 	= rowsLeft  - ( %TARGET_ROWS * (nBlocks - bIdx) );
+		int row		= rowStart + rowShift;
+
+		%TYPE   sumTemp = %MAKEVEC(0.0);
+		%TYPE%V sum	= %VMAKEVEC(sumTemp);
+
+		__local %TYPE xData[ %TARGET_WIDTH];
+
+		//#pragma unroll
+		for( int i=1; i <= %NLOOPS; i++)
+		{
+			// Put startCol to start of BLOCKSIZE Block
+			int startColp	= startCol - (%TARGET_WIDTH * i) + 1;
+
+			// Stage this chunk of x in local memory
+			if ( threadIdx < %TARGET_WIDTH)
+			{
+				xData[threadIdx] = xnew[ (startColp + threadIdx) * incx];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			int col 	= startColp + colShift;
+
+			%TYPE xDataVal	= xData[ colShift ];
+			%TYPE%V xDataVec= %VMAKEVEC( xDataVal);
+
+			%TYPE%V loadedA  = %VLOAD( 0, &A((row), (col)));
+			%CONJUGATE(doConj, loadedA);
+			%VMAD(sum, loadedA, xDataVec);
+			// xData is overwritten by the next iteration, so wait for all readers
+			barrier(CLK_LOCAL_MEM_FENCE);
+		}
+
+		// Partial sums, one slot per (row group, column) pair, reduced over columns below
+		__local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+		//__local %TYPE* sData = sDataTemp;
+		sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		//int TARGET_ROWS		= %TARGET_ROWS;
+
+		// Last Block
+		// Do the vector reduction for the last block
+		// Followed by solving the triangle
+		if ( bIdx == ( nBlocks - 1))
+		{
+			%TYPE sumTemp 	    = %MAKEVEC(0.0);
+			%TYPE%V sumVec      = %VMAKEVEC(sumTemp);
+		        %TYPE%V loadedAVec  = %VMAKEVEC(sumTemp);
+
+			//int targetRow = rowStart + threadIdx;
+			int targetCol = startCol- %TARGET_ROWS; // Col where triangle last col overlaps
+
+			// Do vector reduction
+			if ( threadIdx <  TARGET_ROWS_BY_VEC )
+			{
+				//#pragma unroll
+				for( int j=0; j < %TARGET_WIDTH; j++)
+				{
+					%ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
+				}
+			}
+
+			__local %TYPE xShared[%V];
+
+			int targetRowTemp = rowStart + threadIdx * %V;
+			int VECTOR_SIZE   = %V;
+
+			// Each iteration retires one V x V diagonal tile, starting from the bottom-most row group
+			//#pragma unroll
+			for( int i=0; i < (TARGET_ROWS_BY_VEC); i++)
+			{
+				if ( threadIdx == (TARGET_ROWS_BY_VEC - 1 - i))
+			        {
+					// Read X-vector
+					%TYPE xVal[%V];
+					//#pragma unroll
+					for( int j = 0; j < %V; j++)
+					{
+						xVal[j] = xnew[ (targetRowTemp + j)* incx];
+					}
+
+					// Read A %Vx%V region into reg
+					%TYPE reg[%V][%V];
+					//#pragma unroll
+					for( int idx = 0; idx < ( %V * %V); idx++)
+					{
+						int m = idx % ( %V ); // Row : Col-Major idx...
+						int n = idx / ( %V );    // Col
+						// Upper part of the tile including the diagonal is loaded
+						if ( n >= m )
+						{
+							reg[m][n] = A((targetRowTemp + m), (targetCol -( %V - 1 - n)));
+							%CONJUGATE(doConj, reg[m][n]);
+						}
+					}
+
+					%TYPE sumVecReg[%V];
+					%VSTOREWITHINCX(sumVecReg, sumVec, 1);
+
+					// Solve for first x - Do the rest in loop
+					%TYPE x[%V];
+					%SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]);
+					// sumVecReg doubles as scratch for the quotient
+					%DIV(sumVecReg[VECTOR_SIZE - 1], x[VECTOR_SIZE -1], reg[VECTOR_SIZE - 1][VECTOR_SIZE - 1]);
+					x[VECTOR_SIZE -1] = sumVecReg[VECTOR_SIZE - 1];
+					xShared[%V - 1] = x[%V - 1];
+					xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1];
+
+					//#pragma unroll
+					for(int m = ( %V - 2); m >=0; m--)
+					{
+						%SUB(x[m], xVal[m], sumVecReg[m]);
+					}
+
+					// Back-substitution inside the tile; runs down to idx 0 so the (0,0) diagonal is divided too
+					//#pragma unroll
+					for( int idx = (( ( %V * %V) - 1) - %V); idx >= 0; idx--)
+					{
+						int m = idx / %V;       // Row : Row-Major idx, x[3] is solved before x[2]
+						int n = idx % ( %V );// Col
+						if ( n > m)
+						{
+							//x[m] = x[m] - reg[m][n] * x[n];
+							%MAD(x[m], reg[m][n], (-x[n]));
+						}
+						else if ( m == n)
+						{
+							%DIV(sumVecReg[m], x[m], reg[m][m]);
+							x[m] = sumVecReg[m];
+						}
+					}
+
+					// Store results
+					//#pragma unroll
+					for(int m = 0; m < %V; m++)
+					{
+						xShared[m] = x[m];
+						xnew[ (targetRowTemp + m)* incx ] = x[m];
+					}
+			        }
+
+
+			        // Sync so that xShared is available to all threads
+			        barrier(CLK_LOCAL_MEM_FENCE);
+
+			        // Row groups above the solved tile fold the V new x values into their sums
+			      	if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i))
+				{
+						//#pragma unroll
+						for( int j=0; j < %V; j++)
+						{
+							//sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j];
+							%TYPE%V loadedAVec  = %VLOAD( 0, &A((targetRowTemp), (targetCol -j)));
+							%CONJUGATE(doConj, loadedAVec);
+							%VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]);
+						}
+				}
+
+				targetCol = targetCol - %V;
+			        // Avoid Race...
+			        barrier(CLK_LOCAL_MEM_FENCE);
+			}
+		}
+		else
+		{
+			// Do Vector Reduction on each block except the last Block
+			if ( threadIdx < TARGET_ROWS_BY_VEC)
+			{
+				%TYPE   accTemp = %MAKEVEC(0.0);
+				%TYPE%V acc 	= %VMAKEVEC(accTemp);
+
+				//#pragma unroll
+				for( int j=0; j < %TARGET_WIDTH; j++)
+				{
+					%ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
+				}
+
+				// Store the result
+				int targetRow = rowStart + threadIdx * %V;
+
+				__global %TYPE* xNewPtr =  xnew + targetRow * incx;
+				//float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]);
+				%TYPE%V value;
+				%VLOADWITHINCX(value, xNewPtr, incx);
+
+				// Compute result
+				%SUB(value, value, acc);
+
+				// Store results
+				//VSTOREWITHINCX( xNewPtr, value, incx);
+				%VSTOREWITHINCX(xNewPtr, value, incx);
+			}
+		}
+	}
+
+}
+";
+
+
+// trsv_CL_ComputeRectangle_kernel
+//
+// OpenCL kernel source kept as a string. The percent-prefixed tokens (TYPE, V,
+// TARGET_ROWS, TARGET_WIDTH, NLOOPS, PREFIX and the arithmetic helpers such as
+// MAD, SUB, ADD, DIV, VLOAD) are not OpenCL; they appear to be template
+// placeholders that are substituted before the kernel is compiled.
+//
+// What the kernel body does, per work-group (get_group_id(0)):
+//   * nBlocks = (rowsLeft - 1) / TARGET_ROWS + 1 work-groups are expected;
+//     group b starts at row (N - rowsLeft) + b * TARGET_ROWS.
+//   * The last group (bIdx == nBlocks - 1) works one row per work-item. It
+//     accumulates A(row, col) * x[col] over TARGET_ROWS columns walking up from
+//     startCol. If nBlocks > 1 the sum is subtracted from xnew[row]. If
+//     nBlocks == 1 it goes on to solve the remaining triangle column by column;
+//     this scalar path divides by the diagonal unless isUnity is set.
+//   * Every other group accumulates V rows per work-item over NLOOPS chunks of
+//     TARGET_WIDTH columns, stores the partial sums in local memory and reduces
+//     them. Group 0 then solves its triangle in V x V tiles without any
+//     diagonal division; the remaining groups subtract the reduced sum from
+//     xnew.
+//
+// When incx is negative, xnew is advanced to element (N - 1) * abs(incx) so
+// that xnew[i * incx] indexes backwards from there. With PACKED defined,
+// A(row, col) addresses column-packed lower storage; otherwise it addresses
+// column-major storage with leading dimension lda.
+const char *trsv_CL_ComputeRectangle_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
+#ifdef PACKED
+    #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
+#else
+    #define A(row, col) A[ (row) + (col) * lda]
+#endif
+// Compute Rectangle + Triangle
+__kernel void %PREFIXtrsv_CL_ComputeRectangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx)
+{
+	__global %TYPE* xnew;
+	__global %TYPE* A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		xnew	 = _xnew + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		xnew 	= _xnew + offx;
+	}
+
+	size_t bIdx 	= get_group_id(0);
+	size_t threadIdx= get_local_id(0);
+
+	// Get total blocks launched
+	size_t nBlocks  = (rowsLeft - 1) / %TARGET_ROWS + 1;
+
+	%TYPE sum 	= %MAKEVEC( 0.0);
+	%TYPE loadedA 	= %MAKEVEC( 0.0);
+
+	// Last Block does scalar stuff...
+	// Only this gets executed if nBlocks == 1
+	if ( bIdx == (nBlocks - 1))
+	{
+		int targetCol 	= startCol;
+		int startRow 	= (N - rowsLeft) + ( bIdx) * %TARGET_ROWS;
+		int targetRow 	= startRow  +  threadIdx;
+		// Index of the last row owned by this block
+		int lastRow	= startRow + rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1;
+
+		if ( nBlocks > 1)
+		{
+			if ( targetRow <= lastRow)
+			{
+				for( int i=0; i < %TARGET_ROWS; i++)
+				{
+					// All threads look at same xnew
+					// Should use Shared Memory ..
+					%TYPE xVal =  xnew[ targetCol * incx];
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xVal);
+					targetCol++;
+				}
+
+				%SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum);
+			}
+		}
+		else // Solve the triangle -- no more kernel launches required
+		{
+			if ( targetRow <= lastRow)
+			{
+				for( int i=0; i < %TARGET_ROWS; i++)
+				{
+					// All threads look at same xnew
+					// Should use Shared Memory ..
+					%TYPE xVal =  xnew[ targetCol * incx];
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xVal);
+					targetCol++;
+				}
+			}
+
+			// Change targetCol to point to Triangle first column for all threads
+			// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
+			targetCol 	= startCol + %TARGET_ROWS;
+
+			__local %TYPE  xShared; // To share solved x value with other threads..
+
+			// One column of the triangle is retired per iteration, walking towards the last column
+			for( int i=0; i < ((lastRow -startRow) + 1); i++)
+			{
+				if ( targetRow == targetCol)
+				{
+					%TYPE xVal = xnew[ targetRow * incx];
+					sum  =  xVal -  sum;
+
+					if( isUnity)
+					{
+						xShared = sum;
+					}
+					else // Handle diagonal element
+					{
+						loadedA = A((targetRow), (targetCol));
+						%CONJUGATE(doConj, loadedA);
+						%DIV(xShared, sum, loadedA);
+					}
+
+					xnew[ targetRow * incx ] = xShared;
+				}
+
+				barrier(CLK_LOCAL_MEM_FENCE);
+
+				// Only rows strictly below the diagonal still need this column.
+				// Rows at or above it are already solved and never read sum again,
+				// so skipping them avoids loading the diagonal and the upper triangle of A.
+				if ( (targetRow > targetCol) && (targetRow <= lastRow) )
+				{
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xShared);
+				}
+
+				// Avoid Race
+				barrier(CLK_LOCAL_MEM_FENCE);
+				targetCol++;
+			}
+		}
+	}
+	else
+	{
+		// Work-item layout: rowShift selects a group of V rows, colShift selects a column inside a chunk
+		size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V);
+		size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
+
+		int rowStart 	= (N - rowsLeft) + ( bIdx) * %TARGET_ROWS;
+		int row		= rowStart + rowShift;
+
+		%TYPE   sumTemp = %MAKEVEC(0.0);
+		%TYPE%V sum	= %VMAKEVEC(sumTemp);
+
+		__local %TYPE xData[ %TARGET_WIDTH];
+
+		//#pragma unroll
+		for( int i=1; i <= %NLOOPS; i++)
+		{
+			// Put startCol to start of BLOCKSIZE Block
+			int startColp	= startCol + (%TARGET_WIDTH * (i - 1));
+
+			// Stage this chunk of x in local memory
+			if ( threadIdx < %TARGET_WIDTH)
+			{
+				xData[threadIdx] = xnew[ (startColp + threadIdx) * incx];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			int col 	= startColp + colShift;
+
+			%TYPE xDataVal	= xData[ colShift ];
+			%TYPE%V xDataVec= %VMAKEVEC( xDataVal);
+
+			%TYPE%V loadedA  = %VLOAD( 0, &A((row), (col)));
+			%CONJUGATE(doConj, loadedA);
+			%VMAD(sum, loadedA, xDataVec);
+			// xData is overwritten by the next iteration, so wait for all readers
+			barrier(CLK_LOCAL_MEM_FENCE);
+		}
+
+		// Partial sums, one slot per (row group, column) pair, reduced over columns below
+		__local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+		//__local %TYPE* sData = sDataTemp;
+		sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		//int TARGET_ROWS		= %TARGET_ROWS;
+
+		// First Block (bIdx == 0)
+		// Do the vector reduction for this block
+		// Followed by solving the triangle
+		if ( bIdx == 0 )
+		{
+			%TYPE sumTemp 	    = %MAKEVEC(0.0);
+			%TYPE%V sumVec      = %VMAKEVEC(sumTemp);
+		        %TYPE%V loadedAVec  = %VMAKEVEC(sumTemp);
+
+			//int targetRow = rowStart + threadIdx;
+			int targetCol = startCol + %TARGET_ROWS; // First column of the triangle
+
+			// Do vector reduction
+			if ( threadIdx <  TARGET_ROWS_BY_VEC )
+			{
+				//#pragma unroll
+				for( int j=0; j < %TARGET_WIDTH; j++)
+				{
+					%ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
+				}
+			}
+
+			__local %TYPE xShared[%V];
+
+			int targetRowTemp = rowStart + threadIdx * %V;
+			int VECTOR_SIZE   = %V;
+
+			// Each iteration retires one V x V diagonal tile, starting from the top-most row group
+			//#pragma unroll
+			for( int i=0; i < (TARGET_ROWS_BY_VEC); i++)
+			{
+				if ( threadIdx == i )
+			        {
+					// Read X-vector
+					%TYPE xVal[%V];
+					//#pragma unroll
+					for( int j = 0; j < %V; j++)
+					{
+						xVal[j] = xnew[ (targetRowTemp + j)* incx];
+					}
+
+					// Read A %Vx%V region into reg
+					%TYPE reg[%V][%V];
+					//#pragma unroll
+					for( int idx = 0; idx < ( %V * %V); idx++)
+					{
+						int m = idx % ( %V ); // Row : Col-Major idx...
+						int n = idx / ( %V );    // Col
+						// Only the strictly lower part of the tile is loaded
+						if ( m > n )
+						{
+							reg[m][n] = A((targetRowTemp + m), (targetCol + n));
+							%CONJUGATE(doConj, reg[m][n]);
+						}
+					}
+
+					%TYPE sumVecReg[%V];
+					%VSTOREWITHINCX(sumVecReg, sumVec, 1);
+
+					// Solve for first x - Do the rest in loop
+					%TYPE x[%V];
+					%SUB(x[0], xVal[0], sumVecReg[0]);
+					xShared[0] = x[0];
+					xnew[ (targetRowTemp)* incx ] = x[0];
+
+					//#pragma unroll
+					for(int m = 1; m < %V; m++)
+					{
+						%SUB(x[m], xVal[m], sumVecReg[m]);
+					}
+
+					// Forward substitution inside the tile; the final idx is the last diagonal entry and is skipped
+					//#pragma unroll
+					for( int idx =  %V; idx < (( %V * %V) - 1); idx++)
+					{
+						int m = idx / %V;       // Row : Row-Major idx, x[1] is solved before x[2]
+						int n = idx % ( %V );// Col
+						if ( m > n)
+						{
+							//x[m] = x[m] - reg[m][n] * x[n];
+							%MAD(x[m], reg[m][n], (-x[n]));
+						}
+					}
+
+					// Store results
+					//#pragma unroll
+					for(int m = 0; m < %V; m++)
+					{
+						xShared[m] = x[m];
+						xnew[ (targetRowTemp + m)* incx ] = x[m];
+					}
+			        }
+
+
+			        // Sync so that xShared is available to all threads
+			        barrier(CLK_LOCAL_MEM_FENCE);
+			        // Row groups below the solved tile fold the V new x values into their sums
+			      	if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) )
+				{
+						//#pragma unroll
+						for( int j=0; j < %V; j++)
+						{
+							%TYPE%V loadedAVec  = %VLOAD( 0, &A((targetRowTemp), (targetCol +j)));
+							%CONJUGATE(doConj, loadedAVec);
+							%VMAD(sumVec, loadedAVec, xShared[j]);
+						}
+				}
+
+				targetCol = targetCol + %V;
+			        // Avoid Race...
+			        barrier(CLK_LOCAL_MEM_FENCE);
+			}
+		}
+		else
+		{
+			// Do Vector Reduction on each remaining block (neither the first nor the last)
+			if ( threadIdx < TARGET_ROWS_BY_VEC)
+			{
+				%TYPE   accTemp = %MAKEVEC(0.0);
+				%TYPE%V acc 	= %VMAKEVEC(accTemp);
+
+				//#pragma unroll
+				for( int j=0; j < %TARGET_WIDTH; j++)
+				{
+					%ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
+				}
+
+				// Store the result
+				int targetRow = rowStart + threadIdx * %V;
+
+				__global %TYPE* xNewPtr =  xnew + targetRow * incx;
+				//float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]);
+				%TYPE%V value;
+				%VLOADWITHINCX(value, xNewPtr, incx);
+
+				// Compute result
+				%SUB(value, value, acc);
+
+				// Store results
+				%VSTOREWITHINCX(xNewPtr, value, incx);
+			}
+		}
+	}
+}
+";
+
+const char *trsv_CL_ComputeRectangle_NonUnity_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
+#ifdef PACKED
+    #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
+#else
+    #define A(row, col) A[ (row) + (col) * lda]
+#endif
+// Compute Rectangle + Traingle
+__kernel void %PREFIXtrsv_CL_ComputeRectangle_NonUnity_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx)
+{
+	__global %TYPE* xnew;
+	__global %TYPE* A = _A + offa;
+
+	if ( incx < 0 ) // Goto end of vector
+	{
+		xnew	 = _xnew + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		xnew 	= _xnew + offx;
+	}
+
+	size_t bIdx 	= get_group_id(0);
+	size_t threadIdx= get_local_id(0);
+
+	// Get total blocks launched
+	size_t nBlocks  = (rowsLeft - 1) / %TARGET_ROWS + 1;
+
+	%TYPE sum 	= %MAKEVEC( 0.0);
+	%TYPE loadedA 	= %MAKEVEC( 0.0);
+
+	// Last Block does scalar stuff...
+	// Only this gets executed if nBlocks == 1
+	if ( bIdx == (nBlocks - 1))
+	{
+		int targetCol 	= startCol;
+		int startRow 	= (N - rowsLeft) + ( bIdx) * %TARGET_ROWS;
+		int targetRow 	= startRow  +  threadIdx;
+		int lastRow	= startRow + rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1;
+
+		if ( nBlocks > 1)
+		{
+			if ( targetRow <= lastRow)
+			{
+				for( int i=0; i < %TARGET_ROWS; i++)
+				{
+					// All threads look at same xnew
+					// Should use Shared Memory ..
+					%TYPE xVal =  xnew[ targetCol * incx];
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xVal);
+					targetCol++;
+				}
+
+				%SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum);
+			}
+		}
+		else // Solve the traingle -- no more kernel launches required
+		{
+			if ( targetRow <= lastRow)
+			{
+				for( int i=0; i < %TARGET_ROWS; i++)
+				{
+					// All threads look at same xnew
+					// Should use Shared Memory ..
+					%TYPE xVal =  xnew[ targetCol * incx];
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xVal);
+					targetCol++;
+				}
+			}
+
+			// Change targetCol to point to Triangle last column for all threads
+			// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
+			targetCol 	= startCol + %TARGET_ROWS;
+
+			__local %TYPE  xShared; // To share solved x value with other threads..
+
+			for( int i=0; i < ((lastRow -startRow) + 1); i++)
+			{
+				if ( targetRow == targetCol)
+				{
+					%TYPE xVal = xnew[ targetRow * incx];
+					sum  =  xVal -  sum;
+
+					// Handle diagonal element
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%DIV(xShared, sum, loadedA);
+					xnew[ targetRow * incx ] = xShared;
+				}
+
+				barrier(CLK_LOCAL_MEM_FENCE);
+
+				if (  targetRow <= lastRow)
+				{
+					loadedA = A((targetRow), (targetCol));
+					%CONJUGATE(doConj, loadedA);
+					%MAD(sum, loadedA, xShared);
+				}
+
+				// Avoid Race
+				barrier(CLK_LOCAL_MEM_FENCE);
+				targetCol++;
+			}
+		}
+	}
+	else
+	{
+		size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V);
+		size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
+
+		int rowStart 	= (N - rowsLeft) + ( bIdx) * %TARGET_ROWS;
+		int row		= rowStart + rowShift;
+
+		%TYPE   sumTemp = %MAKEVEC(0.0);
+		%TYPE%V sum	= %VMAKEVEC(sumTemp);
+
+		__local %TYPE xData[ %TARGET_WIDTH];
+
+		//#pragma unroll
+		for( int i=1; i <= %NLOOPS; i++)
+		{
+			// Put startCol to start of BLOCKSIZE Block
+			int startColp	= startCol + (%TARGET_WIDTH * (i - 1));
+
+			if ( threadIdx < %TARGET_WIDTH)
+			{
+				xData[threadIdx] = xnew[ (startColp + threadIdx) * incx];
+			}
+
+			barrier(CLK_LOCAL_MEM_FENCE);
+
+			int col 	= startColp + colShift;
+
+			%TYPE xDataVal	= xData[ colShift ];
+			%TYPE%V xDataVec= %VMAKEVEC( xDataVal);
+
+			%TYPE%V loadedA  = %VLOAD( 0, &A((row), (col)));
+			%CONJUGATE(doConj, loadedA);
+			%VMAD(sum, loadedA, xDataVec);
+			barrier(CLK_LOCAL_MEM_FENCE);
+		}
+
+		__local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+		//__local %TYPE* sData = sDataTemp;
+		sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		//int TARGET_ROWS		= %TARGET_ROWS;
+
+		// Last Block
+		// Do Scalar reduction for last block
+		// Followed by solving the triangle
+		if ( bIdx == 0 )
+		{
+			%TYPE sumTemp 	    = %MAKEVEC(0.0);
+			%TYPE%V sumVec      = %VMAKEVEC(sumTemp);
+		        %TYPE%V loadedAVec  = %VMAKEVEC(sumTemp);
+
+			//int targetRow = rowStart + threadIdx;
+			int targetCol = startCol + %TARGET_ROWS; // Col where triangle last col overlaps
+
+			// Do vector reduction
+			if ( threadIdx <  TARGET_ROWS_BY_VEC )
+			{
+				//#pragma unroll
+				for( int j=0; j < %TARGET_WIDTH; j++)
+				{
+					%ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
+				}
+			}
+
+			__local %TYPE xShared[%V];
+
+			int targetRowTemp = rowStart + threadIdx * %V;
+			int VECTOR_SIZE   = %V;
+
+			//#pragma unroll
+			for( int i=0; i < (TARGET_ROWS_BY_VEC); i++)
+			{
+				if ( threadIdx == i )
+			        {
+					// Read X-vector
+					%TYPE xVal[%V];
+					//#pragma unroll
+					for( int j = 0; j < %V; j++)
+					{
+						xVal[j] = xnew[ (targetRowTemp + j)* incx];
+					}
+
+					// Read A %Vx%V region into reg
+					%TYPE reg[%V][%V];
+					//#pragma unroll
+					for( int idx = 0; idx < ( %V * %V); idx++)
+					{
+						int m = idx % ( %V ); // Row : Col-Major idx...
+						int n = idx / ( %V );    // Col
+						if ( m >= n )
+						{
+							reg[m][n] = A((targetRowTemp + m), (targetCol + n));
+							%CONJUGATE(doConj, reg[m][n]);
+						}
+					}
+
+					%TYPE sumVecReg[%V];
+					%VSTOREWITHINCX(sumVecReg, sumVec, 1);
+
+					// Solve for first x - Do the rest in loop
+					%TYPE x[%V];
+					%SUB(x[0], xVal[0], sumVecReg[0]);
+					%DIV(sumVecReg[0], x[0], reg[0][0]);
+					x[0] = sumVecReg[0];
+					xShared[0] = sumVecReg[0];
+					xnew[ (targetRowTemp)* incx ] = sumVecReg[0];
+
+					//#pragma unroll
+					for(int m = 1; m < %V; m++)
+					{
+						%SUB(x[m], xVal[m], sumVecReg[m]);
+					}
+
+					//#pragma unroll
+					for( int idx =  %V; idx < (%V * %V); idx++)
+					{
+						int m = idx / %V;       // Row : Row-Major idx, x[1] is solved before x[2]
+						int n = idx % ( %V );// Col
+						if ( m > n)
+						{
+							//x[m] = x[m] - reg[m][n] * x[n];
+							%MAD(x[m], reg[m][n], (-x[n]));
+						}
+						else if ( m == n)
+						{
+							%DIV(sumVecReg[m], x[m], reg[m][m]);
+							x[m] = sumVecReg[m];
+						}
+					}
+
+					// Store results
+					//#pragma unroll
+					for(int m = 1; m < %V; m++)
+					{
+						xShared[m] = x[m];
+						xnew[ (targetRowTemp + m)* incx ] = x[m];
+					}
+			        }
+
+
+			        // Sync so that xShared it available to all threads
+			        barrier(CLK_LOCAL_MEM_FENCE);
+			      	if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) )
+				{
+						//#pragma unroll
+						for( int j=0; j < %V; j++)
+						{
+							%TYPE%V loadedAVec  = %VLOAD( 0, &A((targetRowTemp), (targetCol +j)));
+							%CONJUGATE(doConj, loadedAVec);
+							%VMAD(sumVec, loadedAVec, xShared[j]);
+						}
+				}
+
+				targetCol = targetCol + %V;
+			        // Avoid Race...
+			        barrier(CLK_LOCAL_MEM_FENCE);
+			}
+		}
+		else
+		{
+			// Do Vector Reduction on each block except the last Block
+			if ( threadIdx < TARGET_ROWS_BY_VEC)
+			{
+				%TYPE   accTemp = %MAKEVEC(0.0);
+				%TYPE%V acc 	= %VMAKEVEC(accTemp);
+
+				//#pragma unroll
+				for( int j=0; j < %TARGET_WIDTH; j++)
+				{
+					%ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
+				}
+
+				// Store the result
+				int targetRow = rowStart + threadIdx * %V;
+
+				__global %TYPE* xNewPtr =  xnew + targetRow * incx;
+				//float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]);
+				%TYPE%V value;
+				%VLOADWITHINCX(value, xNewPtr, incx);
+
+				// Compute result
+				%SUB(value, value, acc);
+
+				// Store results
+				%VSTOREWITHINCX(xNewPtr, value, incx);
+			}
+		}
+	}
+}
+";
+
+
+/*
+ * OpenCL kernel template held as a string; the %-prefixed tokens (%TYPE, %V,
+ * %BLOCKSIZE, ...) are placeholders, presumably expanded by the kernel
+ * generator before the source is compiled.
+ *
+ * Each work-group first caches %TRIANGLE_HEIGHT entries of xnew, starting at
+ * element startRow, in local memory. Every work-item then accumulates the
+ * product of those entries with A(startRow + i*V, blockStartRow), the partial
+ * sums are combined through the local array `reduce`, and the total is
+ * subtracted from xnew[blockStartRow]. Work-group `blkid` covers blockSize_x
+ * consecutive values of blockStartRow beginning at
+ * endRow + blkid * blockSize_x; indices at or beyond N are left untouched.
+ */
+const char *trsv_CUT_ComputeRectangle_kernel = "
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#ifdef PACKED
+    #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
+#else
+    #define A( row, col) A[ (row) + (col) * lda]
+#endif
+__kernel void %PREFIXtrsv_CUT_ComputeRectangle_kernel(__global const %TYPE* _A,			__global %TYPE* _xnew,
+												uint N,
+												int incx,
+												int isUnity,
+												uint lda,
+												int doConj,
+												int startRow, int endRow, uint offa, uint offx)
+{
+	__global %TYPE* xnew;
+	__global %TYPE* A = _A + offa;
+	if ( incx < 0 ) // Goto end of vector
+	{
+		xnew	 = _xnew  + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		xnew 	= _xnew + offx;
+	}
+
+	int threadID = get_local_id(0);
+	int threadID_Y, threadID_X;
+	int blockSize = %BLOCKSIZE, blockSize_x, blockSize_y;
+	int blkid = get_group_id(0);
+	int V= %V;
+
+	__local %TYPE solved[%TRIANGLE_HEIGHT];
+	__local %TYPE reduce[%TARGET_HEIGHT][ %BLOCKSIZE / %TARGET_HEIGHT];
+	__local %TYPE%V *solved_vec;
+	int blockStartRow;
+	int triangleHeight;
+	%TYPE%V acc;
+	%TYPE%V loadedAVec;
+	%TYPE sacc;
+	%TYPE accTemp;
+
+	triangleHeight = endRow - startRow;
+/*
+	if ((triangleHeight != %TRIANGLE_HEIGHT) || ((triangleHeight % V) != 0))
+	{
+		// throw -1;
+
+		//
+		// It is the caller's responsibility to solve triangle whose width
+		// is a multiple of VECTOR SIZE before calling this routine.
+		// This makes the width of the rectangle to be multiple of VECTOR SIZE.
+		// Thus threads can iterate without looking out for vector-unfriendly
+		// dimensions.
+		// This condition can be maintained for any dimension of the input matrix
+		// So, generality is not broken here.
+		//
+		*(__global int*)0 = 0;
+	}
+
+	if (( %BLOCKSIZE % %TARGET_HEIGHT) != 0)
+	{
+		// throw -1;
+
+		//
+		// Awkward Block Size. Impossible to write neat code.
+		// The set of threads belonging to the last threadID_X will not have
+		// blockSize_Y number of threads.
+		//
+		*(__global int*)0 = 0;
+	}
+*/
+	blockSize_y = %TARGET_HEIGHT;
+	blockSize_x = %BLOCKSIZE / %TARGET_HEIGHT;
+
+	threadID_Y = threadID % %TARGET_HEIGHT;
+	threadID_X = threadID / %TARGET_HEIGHT;
+
+	blockStartRow = endRow + (blkid * blockSize_x);
+	blockStartRow += threadID_X;
+
+	for(int i=threadID; i< %TRIANGLE_HEIGHT; i+=blockSize)
+	{
+		solved[i] = xnew[(startRow + i)*incx];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	solved_vec = solved;
+	accTemp = %INIT(0.0);
+	acc = %VMAKEVEC( accTemp);
+
+	if (blockStartRow < N)
+	{
+		for(int i=threadID_Y; i<(triangleHeight/V); i+=blockSize_y)
+		{
+			loadedAVec = %VLOAD(0, &A((startRow + i*V), (blockStartRow)));
+			%CONJUGATE(doConj, loadedAVec);
+			%VMAD(acc, solved_vec[i], loadedAVec); //startRow == startCol as well.
+		}
+		sacc = %REDUCE_SUM(acc);
+
+		// Put stuff in shared memory for final reduction
+		reduce[threadID_Y][threadID_X] = sacc;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if ( threadID < blockSize_x)
+	{
+		sacc = %INIT(0.0);
+		//#pragma unroll
+		for( int i=0; i < %TARGET_HEIGHT; i++)
+		{
+			%ADD(sacc, sacc, reduce[i][threadID]);
+		}
+
+		blockStartRow = endRow + (blkid * blockSize_x);
+		blockStartRow += threadID;
+		if ( blockStartRow < N)
+		{
+			%SUB(xnew[(blockStartRow)*incx], xnew[(blockStartRow)*incx], sacc);
+		}
+	}
+}
+";
+
+/*
+ * OpenCL kernel template held as a string; the %-prefixed tokens (%TYPE, %V,
+ * %BLOCKSIZE, ...) are placeholders, presumably expanded by the kernel
+ * generator before the source is compiled.
+ *
+ * Each work-group first caches %TRIANGLE_HEIGHT entries of xnew, starting at
+ * element startRow, in local memory. Every work-item then accumulates the
+ * product of those entries with A(startRow + i*V, blockStartRow), the partial
+ * sums are combined through the local array `reduce`, and the total is
+ * subtracted from xnew[blockStartRow]. Work-group `blkid` covers blockSize_x
+ * values of blockStartRow counting DOWN from startRow - 1 - blkid * blockSize_x.
+ *
+ * Because the index only decreases, the bound that matters is the lower one:
+ * both the accumulation phase and the final write-back are guarded by
+ * blockStartRow >= 0. (Comparing the signed index against the unsigned N
+ * would only reject negative indices through integer wraparound.)
+ */
+const char *trsv_CLT_ComputeRectangle_kernel="
+#ifdef DOUBLE_PRECISION
+    #ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+    #else
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+    #endif
+#endif
+#ifdef PACKED
+    #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
+#else
+    #define A(row, col) A[ (row) + (col) * lda]
+#endif
+__kernel void %PREFIXtrsv_CLT_ComputeRectangle_kernel( 	__global const %TYPE* _A,
+												__global %TYPE* _xnew,
+												uint N,
+												int incx,
+												int isUnity,
+												uint lda,
+												int doConj,
+												int startRow, int endRow, uint offa, uint offx)
+{
+
+	__global %TYPE* xnew;
+	__global %TYPE* A = _A + offa;
+	if ( incx < 0 ) // Goto end of vector
+	{
+		xnew	 = _xnew  + offx + ( N - 1) * abs(incx);
+	}
+	else
+	{
+		xnew 	= _xnew + offx;
+	}
+
+	int threadID = get_local_id(0);
+	int threadID_Y, threadID_X;
+	int blockSize = %BLOCKSIZE, blockSize_x, blockSize_y;
+	int blkid = get_group_id(0);
+	int V= %V;
+
+	__local %TYPE solved[%TRIANGLE_HEIGHT];
+	__local %TYPE reduce[%TARGET_HEIGHT][ %BLOCKSIZE / %TARGET_HEIGHT];
+	__local %TYPE%V *solved_vec;
+	int blockStartRow;
+	int triangleHeight;
+	%TYPE%V acc;
+	%TYPE%V loadedAVec;
+	%TYPE sacc;
+	%TYPE accTemp;
+
+	triangleHeight = endRow - startRow;
+
+	blockSize_y = %TARGET_HEIGHT;
+	blockSize_x = %BLOCKSIZE / %TARGET_HEIGHT;
+
+	threadID_Y = threadID % %TARGET_HEIGHT;
+	threadID_X = threadID / %TARGET_HEIGHT;
+
+	blockStartRow = startRow - 1 - (blkid * blockSize_x);
+	blockStartRow -= threadID_X;
+
+	for(int i=threadID; i< %TRIANGLE_HEIGHT; i+=blockSize)
+	{
+		solved[i] = xnew[(startRow + i)*incx];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	solved_vec = solved;
+	accTemp = %INIT(0.0);
+	acc = %VMAKEVEC( accTemp);
+
+	if (blockStartRow >= 0)
+	{
+		for(int i=threadID_Y; i<(triangleHeight/V); i+=blockSize_y)
+		{
+			loadedAVec = %VLOAD(0, &A((startRow+ i*V) , (blockStartRow)));
+			%CONJUGATE(doConj, loadedAVec);
+			%VMAD(acc, solved_vec[i], loadedAVec); //startRow == startCol as well.
+		}
+		sacc = %REDUCE_SUM(acc);
+
+		// Put stuff in shared memory for final reduction
+		reduce[threadID_Y][threadID_X] = sacc;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if ( threadID < blockSize_x)
+	{
+		sacc = %INIT(0.0);
+		//#pragma unroll
+		for( int i=0; i < %TARGET_HEIGHT; i++)
+		{
+			%ADD(sacc, sacc, reduce[i][threadID]);
+		}
+
+		blockStartRow = startRow - 1 - (blkid * blockSize_x);
+		blockStartRow -= threadID;
+		if ( blockStartRow >= 0)
+		{
+			%SUB(xnew[(blockStartRow)*incx], xnew[(blockStartRow)*incx], sacc);
+		}
+	}
+}
+";
+
diff --git a/src/library/blas/gens/copy_reg.cpp b/src/library/blas/gens/copy_reg.cpp
new file mode 100644
index 0000000..d9f7095
--- /dev/null
+++ b/src/library/blas/gens/copy_reg.cpp
@@ -0,0 +1,274 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * copy generator
+ */
+//#define DEBUG_COPY
+
+#define WORKGROUPS_PER_CU  32
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <copy.clT>
+#include <solution_seq.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+/*
+ * Report the solver flags of the copy pattern.
+ * The only flag returned is SF_WSPACE_1D.
+ */
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_COPY
+    printf("solverFlags called...\n");
+#endif
+
+    const SolverFlags copyFlags = SF_WSPACE_1D;
+    return copyFlags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initCopyRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+/*
+ * Operations table for the copy pattern. It wires up the callbacks defined
+ * in this file (generator, assignKargs, calcNrThreads, solverFlags,
+ * setBuildOpts, selectVectorization); the remaining slots are left NULL,
+ * i.e. this pattern supplies no callback for them.
+ */
+static SolverOps copyOps = {
+    generator,
+    assignKargs,
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,
+    NULL,
+    solverFlags,
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,
+	selectVectorization
+};
+
+/*
+ * Decide whether the copy kernel needs the KEXTRA_NO_COPY_VEC_A flag.
+ *
+ * args - really a CLBlasKargs *; only offBX and offCY are read.
+ * vlen - vector length the offsets are tested against.
+ *
+ * Returns KEXTRA_NO_COPY_VEC_A when either offset is not a multiple of
+ * vlen, KEXTRA_NO_FLAGS otherwise.
+ */
+static  KernelExtraFlags
+selectVectorization(
+	void *args,
+	unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+
+    if ((blasArgs->offBX % vlen) != 0) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    if ((blasArgs->offCY % vlen) != 0) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    return KEXTRA_NO_FLAGS;
+}
+
+/*
+ * Append the copy kernel's preprocessor defines to buildOptStr.
+ *
+ * buildOptStr - NUL-terminated option string that is appended to in place;
+ *               the caller must provide enough room (no size is passed in).
+ * args        - really a const SolutionStep *; its CLBlasKargs are consulted.
+ *
+ * -DDOUBLE_PRECISION is added for the two double data types,
+ * -DINCX_NONUNITY / -DINCY_NONUNITY when ldb.vector / ldc.vector is not 1.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+    const bool isDouble = (blasArgs->dtype == TYPE_DOUBLE) ||
+                          (blasArgs->dtype == TYPE_COMPLEX_DOUBLE);
+
+    if (isDouble) {
+        strcat(buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_COPY
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if (blasArgs->ldb.vector != 1) {
+        strcat(buildOptStr, " -DINCX_NONUNITY ");
+    }
+
+    if (blasArgs->ldc.vector != 1) {
+        strcat(buildOptStr, " -DINCY_NONUNITY ");
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Populate the memory pattern descriptor for the copy generator and fill in
+ * the data-type prefix table used when the kernel template is expanded.
+ *
+ * mempat - descriptor to fill in. Its `sops` and `extra` members are pointed
+ *          at the file-static objects copyOps and mpatExtra, so the
+ *          descriptor never owns them.
+ */
+extern "C"
+void initCopyRegisterPattern(MemoryPattern *mempat)
+{
+	#ifdef DEBUG_COPY
+	printf("initREgPattern called with mempat = 0x%p\n", mempat);
+	#endif
+
+	fflush(stdout);
+    // NOTE(review): the name says "swap" although this is the copy pattern;
+    // left unchanged in case the string is matched elsewhere -- confirm.
+    mempat->name = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    // Address of the operations table defined earlier in this file.
+    mempat->sops = &copyOps;
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    mempat->extra = &mpatExtra;
+
+	// One-letter BLAS prefix per data type, indexed by the TYPE_* value.
+	Prefix[TYPE_FLOAT] = 'S';
+	Prefix[TYPE_DOUBLE] = 'D';
+	Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+	Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+}
+
+/*
+ * Compute the global work size for the copy kernel.
+ *
+ * threads - output: threads[0] receives the total work-item count,
+ *           threads[1] is set to 1.
+ * subdims - unused.
+ * pgran   - work-group shape; wgSize[0] * wgSize[1] is the work-group size.
+ * args    - CLBlasKargs embedded in a SolutionStep (recovered with
+ *           container_of to reach the target device).
+ * _extra  - really a CLBLASKernExtra *; supplies vecLenA.
+ *
+ * One work-group is requested per (work-group size * vector length)
+ * elements of N, capped at WORKGROUPS_PER_CU groups per compute unit.
+ * If the compute-unit query fails, a single compute unit is assumed.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBLASKernExtra *kernExtra = (const CLBLASKernExtra *)_extra;
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    SolutionStep *step = container_of(kargs, args, SolutionStep);
+    TargetDevice *device = &(step->device);
+
+    cl_int err;
+    unsigned int numComputeUnits = deviceComputeUnits(device->id, &err);
+    if (err != CL_SUCCESS) {
+        numComputeUnits = 1;
+    }
+
+    const unsigned int vecLen = kernExtra->vecLenA;
+    const unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1];
+    const unsigned int wgLimit = numComputeUnits * WORKGROUPS_PER_CU;
+
+    unsigned int wgToSpawn = ((kargs->N - 1) / (blockSize * vecLen)) + 1;
+    if (wgLimit <= wgToSpawn) {
+        wgToSpawn = wgLimit;
+    }
+
+    threads[0] = wgToSpawn * blockSize;
+    threads[1] = 1;
+}
+
+/*
+ * Expand the copy kernel template into buf.
+ *
+ * buf     - destination for the generated source; when NULL the call is a
+ *           size query and nothing is generated.
+ * buflen  - not consulted when buf is non-NULL; the caller must supply at
+ *           least the size reported by the size query.
+ * subdims, pgran - unused.
+ * extra   - really a CLBLASKernExtra *; supplies dtype, vecLenA and flags.
+ *
+ * Returns 32 KiB in both modes (the fixed buffer size, not the length of
+ * the generated text).
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARGS_USAGE_2(pgran, subdims);
+    char tempTemplate[32*1024];
+
+    // Size query.
+    if (buf == NULL) {
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kernExtra = (CLBLASKernExtra *)extra;
+    const unsigned int vecLenA = kernExtra->vecLenA;
+    // Both vector-load switches of kprintf follow the KEXTRA_NO_COPY_VEC_A flag.
+    const bool doVLOAD = (kernExtra->flags & KEXTRA_NO_COPY_VEC_A) != 0;
+
+#ifdef DEBUG_COPY
+    printf("COPY GENERATOR called....\n");
+    printf("dataType : %c\n", Prefix[kernExtra->dtype]);
+    printf("Vector length used : %d\n\n", vecLenA);
+    if (doVLOAD) {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+#endif
+
+    // kprintf works on a private copy of the template.
+    strcpy(tempTemplate, (char*)copy_kernel);
+    kprintf kobj(Prefix[kernExtra->dtype], vecLenA, doVLOAD, doVLOAD);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+__kernel void %PREFIXcopy_kernel( __global %TYPE *_X, __global %TYPE *_Y,
+                                        uint N, uint offx, int incx, uint offy, int incy )
+
+*/
+/*
+ * Fill the kernel argument array for the copy kernel, in the order of the
+ * kernel signature quoted above: X, Y, N, offx, incx, offy, incy.
+ *
+ * args   - output array; entries 0..6 are written.
+ * params - really a CLBlasKargs *; X/Y come from its A/B members, the
+ *          offsets from offBX/offCY and the increments from ldb/ldc.
+ * The third parameter is unused.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    cl_int incx = kargs->ldb.vector;
+    cl_int incy = kargs->ldc.vector;
+
+    INIT_KARG(&args[0], kargs->A);
+    INIT_KARG(&args[1], kargs->B);
+    initSizeKarg(&args[2], kargs->N);
+    initSizeKarg(&args[3], kargs->offBX);
+    INIT_KARG(&args[4], incx);
+    initSizeKarg(&args[5], kargs->offCY);
+    INIT_KARG(&args[6], incy);
+}
diff --git a/src/library/blas/gens/decomposition.c b/src/library/blas/gens/decomposition.c
new file mode 100644
index 0000000..e9e5d3d
--- /dev/null
+++ b/src/library/blas/gens/decomposition.c
@@ -0,0 +1,163 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * This module contains implementation of API for checking
+ * decompositions and calculate granularity
+ */
+
+#include <sys/types.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+
+#include "blas_kgen.h"
+
+/*
+ * Return true when `size` is either the SUBDIM_UNUSED marker or a non-zero
+ * multiple of `step`. `step` must be non-zero whenever `size` is a real,
+ * non-zero value, because it is used as a modulo divisor.
+ */
+static __inline bool
+checkSizeStepRelation(size_t size, size_t step)
+{
+    if (size == SUBDIM_UNUSED) {
+        return true;
+    }
+    if (!size) {
+        return false;
+    }
+    return (size % step == 0);
+}
+
+/*
+ * Check whether a two-level decomposition is usable.
+ *
+ * subdims - two entries; [0] is tested against [1] for divisibility
+ *           (the "group block" and "subgroup block" of the comment below).
+ * minSize, maxSize - inclusive bounds for subdims[1].x, .y and .bwidth.
+ * maxRegs - upper limit for the estimated number of 16-byte registers.
+ * dtype   - element type; its size scales the register estimate.
+ * wholeA  - when true the estimate counts a full y * bwidth tile of A,
+ *           otherwise only max(y, bwidth) elements.
+ *
+ * Returns false if any x/y/bwidth is zero, if subdims[1].itemX or .itemY is
+ * zero, if a level-1 size is out of range, if a level-0 size is not a
+ * multiple of the matching level-1 step, if an itemX/itemY value is neither
+ * SUBDIM_UNUSED nor a non-zero multiple of its x/y, or if the register
+ * estimate exceeds maxRegs. Returns true otherwise.
+ */
+bool
+decompSanityCheck(
+    const SubproblemDim *subdims,
+    unsigned int minSize,
+    unsigned int maxSize,
+    unsigned int maxRegs,
+    DataType dtype,
+    bool wholeA)
+{
+    bool ret;
+
+    if( 0 == subdims[0].x ||
+        0 == subdims[0].y ||
+        0 == subdims[0].bwidth ||
+        0 == subdims[1].x ||
+        0 == subdims[1].y ||
+        0 == subdims[1].bwidth ){
+
+        return false;
+    }
+
+    // itemX/itemY of the second level are used as divisors below; a zero
+    // value can never form a valid decomposition and must not reach the
+    // modulo operation.
+    if( 0 == subdims[1].itemX ||
+        0 == subdims[1].itemY ){
+
+        return false;
+    }
+
+    if ( ((subdims[1].x < minSize) ||(subdims[1].x > maxSize)) ||
+         ((subdims[1].y < minSize) || (subdims[1].y > maxSize)) ||
+         ((subdims[1].bwidth < minSize) || (subdims[1].bwidth > maxSize)) ) {
+
+        return false;
+    }
+
+    // the group block must consist of integer number of subgroup blocks
+    if( subdims[0].x % subdims[1].itemX ||
+        subdims[0].y % subdims[1].itemY ||
+        subdims[0].bwidth % subdims[1].bwidth ){
+
+        return false;
+    }
+
+    ret = checkSizeStepRelation(subdims[0].itemX, subdims[0].x);
+    ret = ret && checkSizeStepRelation(subdims[0].itemY, subdims[0].y);
+    ret = ret && checkSizeStepRelation(subdims[1].itemX, subdims[1].x);
+    ret = ret && checkSizeStepRelation(subdims[1].itemY, subdims[1].y);
+    if (ret) {
+        size_t regUse;
+        size_t regsA;
+
+        if (wholeA) {
+            regsA = subdims[1].y * subdims[1].bwidth;
+        }
+        else {
+            regsA = szmax(subdims[1].y, subdims[1].bwidth);
+        }
+
+        // estimate register usage, drop
+        // inevitably slowed decompositions
+        regUse =
+            ( regsA +
+              subdims[1].bwidth * subdims[1].x +
+              subdims[1].x * subdims[1].y ) *
+             dtypeSize(dtype);
+
+        regUse /= 16; // 16 bytes per register
+        ret = (regUse <= maxRegs);
+    }
+
+    return ret;
+}
+
+/*
+ * Derive the work-group shape from the ratio of the level-0 block to the
+ * level-1 item sizes.
+ *
+ * pgran   - output; wgSize[] and wgDim are written.
+ * subdims - two entries; subdims[1].itemX / .itemY are divisors and must be
+ *           non-zero.
+ * xdim    - -1 for a one-dimensional work-group holding the X and Y counts
+ *           multiplied together; 0 or 1 to pick which wgSize[] slot holds
+ *           the X count (the other slot holds the Y count).
+ * level   - unused.
+ */
+void
+calcPgranDedicated(
+    PGranularity *pgran,
+    const SubproblemDim *subdims,
+    int xdim,
+    int level)
+{
+    unsigned int itemsX, itemsY;
+
+    DUMMY_ARG_USAGE(level);
+
+    assert((xdim >= -1) && (xdim <= 1));
+
+    itemsX = (unsigned int)(subdims[0].x / subdims[1].itemX);
+    itemsY = (unsigned int)(subdims[0].y / subdims[1].itemY);
+
+    if (xdim != -1) {
+        // Two-dimensional layout: X goes to slot xdim, Y to the other one.
+        pgran->wgSize[xdim] = itemsX;
+        pgran->wgSize[1 - xdim] = itemsY;
+        pgran->wgDim = 2;
+        return;
+    }
+
+    // One-dimensional layout.
+    pgran->wgSize[0] = itemsX * itemsY;
+    pgran->wgSize[1] = 1;
+    pgran->wgDim = 1;
+}
+
+/*
+ * Derive the work-group shape when X and Y may share or split dimensions
+ * and dimension 0 can be taken by the bwidth ratio.
+ *
+ * pgran   - output; wgSize[] and wgDim are written.
+ * subdims - two entries; subdims[1].itemX, .itemY and .bwidth are divisors
+ *           and must be non-zero.
+ * xdim, ydim - wgSize[] slots for the X and Y counts, each in 0..2. When
+ *           they are equal the slot receives the product of both counts.
+ *           Whenever either is above 0, wgSize[0] is overwritten with
+ *           subdims[0].bwidth / subdims[1].bwidth.
+ * level   - unused.
+ *
+ * wgDim is set to max(xdim, ydim) + 1.
+ */
+void
+calcPgranCooperative(
+    PGranularity *pgran,
+    const SubproblemDim *subdims,
+    int xdim,
+    int ydim,
+    int level)
+{
+    unsigned int xg, yg;
+
+    DUMMY_ARG_USAGE(level);
+
+    assert((xdim >= 0) && (xdim <= 2));
+    assert((ydim >= 0) && (ydim <= 2));
+    // Either both counts live in dimension 0 or neither does: a count
+    // placed in slot 0 next to a non-zero sibling would be overwritten by
+    // the bwidth ratio below. (The two halves were previously joined with
+    // "&&", a condition that can never be true.)
+    // NOTE(review): "||" is the reading consistent with the code below --
+    // confirm against the callers.
+    assert((xdim && ydim) || (!xdim && !ydim));
+    assert(!( ((xdim == 2) && (ydim == 0)) ||
+              ((ydim == 2) && (xdim == 0)) ));
+
+    xg = (unsigned int)(subdims[0].x / subdims[1].itemX);
+    yg = (unsigned int)(subdims[0].y / subdims[1].itemY);
+    if (xdim == ydim) {
+        pgran->wgSize[xdim] = xg * yg;
+    }
+    else {
+        pgran->wgSize[xdim] = xg;
+        pgran->wgSize[ydim] = yg;
+    }
+
+    if ((xdim > 0) || (ydim > 0)) {
+        pgran->wgSize[0] = (unsigned int)(subdims[0].bwidth / subdims[1].bwidth);
+    }
+
+    pgran->wgDim = umax(xdim, ydim) + 1;
+}
+
diff --git a/src/library/blas/gens/dot.cpp b/src/library/blas/gens/dot.cpp
new file mode 100644
index 0000000..3f68221
--- /dev/null
+++ b/src/library/blas/gens/dot.cpp
@@ -0,0 +1,303 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * dot generator
+ */
+//#define DEBUG_DOT
+
+#define WORKGROUPS_PER_CU  32
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <dot.clT>
+#include <solution_seq.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+/*
+ * Report the solver flags of the dot pattern.
+ * The only flag returned is SF_WSPACE_1D.
+ */
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_DOT
+    printf("solverFlags called...\n");
+#endif
+
+    const SolverFlags dotFlags = SF_WSPACE_1D;
+    return dotFlags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+    fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initDotRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+/*
+ * Operations table for the dot pattern. It wires up the callbacks defined
+ * in this file (generator, assignKargs, calcNrThreads, solverFlags,
+ * fixupArgs, setBuildOpts, selectVectorization); the remaining slots are
+ * left NULL, i.e. this pattern supplies no callback for them.
+ */
+static SolverOps dotOps = {
+    generator,
+    assignKargs,
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,
+    NULL,
+    solverFlags,
+	fixupArgs,
+	NULL,
+	NULL,
+	setBuildOpts,
+	selectVectorization
+};
+
+/*
+ * Decide whether the dot kernel needs the KEXTRA_NO_COPY_VEC_A flag.
+ *
+ * args - really a CLBlasKargs *; only offBX and offCY are read.
+ * vlen - vector length the offsets are tested against.
+ *
+ * Returns KEXTRA_NO_COPY_VEC_A when either offset is not a multiple of
+ * vlen, KEXTRA_NO_FLAGS otherwise.
+ */
+static  KernelExtraFlags
+selectVectorization(
+	void *args,
+	unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+
+    if ((blasArgs->offBX % vlen) != 0) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    if ((blasArgs->offCY % vlen) != 0) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    return KEXTRA_NO_FLAGS;
+}
+
+/*
+ * Append the dot kernel's preprocessor defines to buildOptStr.
+ *
+ * buildOptStr - NUL-terminated option string that is appended to in place;
+ *               the caller must provide enough room (no size is passed in).
+ * args        - really a const SolutionStep *; its CLBlasKargs are consulted.
+ *
+ * -DDOUBLE_PRECISION is added for the two double data types,
+ * -DINCX_NONUNITY / -DINCY_NONUNITY when ldb.vector / ldc.vector is not 1.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+    const bool isDouble = (blasArgs->dtype == TYPE_DOUBLE) ||
+                          (blasArgs->dtype == TYPE_COMPLEX_DOUBLE);
+
+    if (isDouble) {
+        strcat(buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_DOT
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if (blasArgs->ldb.vector != 1) {
+        strcat(buildOptStr, " -DINCX_NONUNITY ");
+    }
+
+    if (blasArgs->ldc.vector != 1) {
+        strcat(buildOptStr, " -DINCY_NONUNITY ");
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Populate the memory pattern descriptor for the dot generator and fill in
+ * the data-type prefix table used when the kernel template is expanded.
+ *
+ * mempat - descriptor to fill in. Its `sops` and `extra` members are pointed
+ *          at the file-static objects dotOps and mpatExtra, so the
+ *          descriptor never owns them.
+ */
+extern "C"
+void initDotRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_DOT
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+
+    // One-letter BLAS prefix per data type, indexed by the TYPE_* value.
+    Prefix[TYPE_FLOAT]          = 'S';
+    Prefix[TYPE_DOUBLE]         = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT]  = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    // Shared extra data for this pattern.
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+
+    // NOTE(review): the name says "swap" although this is the dot pattern --
+    // confirm whether the string is matched anywhere before renaming.
+    mempat->name     = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel  = 0;
+    mempat->thLevel  = 1;
+    mempat->sops     = &dotOps;
+    mempat->extra    = &mpatExtra;
+}
+
+/*
+ * Compute the global work size for the dot kernel.
+ *
+ * threads - output: threads[0] receives the total work-item count,
+ *           threads[1] is set to 1.
+ * subdims - unused.
+ * pgran   - work-group shape; wgSize[0] * wgSize[1] is the work-group size.
+ * args    - CLBlasKargs embedded in a SolutionStep (recovered with
+ *           container_of to reach the target device).
+ * _extra  - really a CLBLASKernExtra *; supplies vecLenA.
+ *
+ * One work-group is requested per (work-group size * vector length)
+ * elements of N, capped at WORKGROUPS_PER_CU groups per compute unit.
+ * If the compute-unit query fails, a single compute unit is assumed.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBLASKernExtra *kernExtra = (const CLBLASKernExtra *)_extra;
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    SolutionStep *step = container_of(kargs, args, SolutionStep);
+    TargetDevice *device = &(step->device);
+
+    cl_int err;
+    unsigned int numComputeUnits = deviceComputeUnits(device->id, &err);
+    if (err != CL_SUCCESS) {
+        numComputeUnits = 1;
+    }
+
+    const unsigned int vecLen = kernExtra->vecLenA;
+    const unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1];
+    const unsigned int wgLimit = numComputeUnits * WORKGROUPS_PER_CU;
+
+    unsigned int wgToSpawn = ((kargs->N - 1) / (blockSize * vecLen)) + 1;
+    if (wgLimit <= wgToSpawn) {
+        wgToSpawn = wgLimit;
+    }
+
+    threads[0] = wgToSpawn * blockSize;
+    threads[1] = 1;
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Expand the dot kernel template into buf.
+ *
+ * buf     - destination for the generated source; when NULL the call is a
+ *           size query and nothing is generated.
+ * buflen  - not consulted when buf is non-NULL; the caller must supply at
+ *           least the size reported by the size query.
+ * subdims - unused.
+ * pgran   - wgSize[0] is handed to kprintf as the block size; it is read
+ *           even for a size query, so pgran must always be valid.
+ * extra   - really a CLBLASKernExtra *; supplies dtype, vecLenA and flags.
+ *
+ * Returns 32 KiB in both modes (the fixed buffer size, not the length of
+ * the generated text).
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    size_t BLOCKSIZE = pgran->wgSize[0];
+    char tempTemplate[32*1024];
+
+    // Size query.
+    if (buf == NULL) {
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kernExtra = (CLBLASKernExtra *)extra;
+    const unsigned int vecLenA = kernExtra->vecLenA;
+    // Both vector-load switches of kprintf follow the KEXTRA_NO_COPY_VEC_A flag.
+    const bool doVLOAD = (kernExtra->flags & KEXTRA_NO_COPY_VEC_A) != 0;
+
+#ifdef DEBUG_DOT
+    printf("DOT GENERATOR called....\n");
+    printf("dataType : %c\n", Prefix[kernExtra->dtype]);
+    printf("Vector length used : %d\n\n", vecLenA);
+    if (doVLOAD) {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+#endif
+
+    // kprintf works on a private copy of the template.
+    strcpy(tempTemplate, (char*)dot_kernel);
+    kprintf kobj(Prefix[kernExtra->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+__kernel void %PREFIXdot_kernel( __global %TYPE *_X, __global %TYPE *_Y, __global %TYPE *scratchBuff,
+                                        uint N, uint offx, int incx, uint offy, int incy, int doConj )
+*/
+/*
+ * Fill the kernel argument array for the dot kernel, in the order of the
+ * kernel signature quoted above: X, Y, scratchBuff, N, offx, incx, offy,
+ * incy, doConj.
+ *
+ * args   - output array; entries 0..8 are written.
+ * params - really a CLBlasKargs *; X/Y/scratch come from its B/C/D members,
+ *          the offsets from offBX/offCY, the increments from ldb/ldc and
+ *          the conjugation switch from K.
+ * The third parameter is unused.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    cl_int incx = kargs->ldb.vector;
+    cl_int incy = kargs->ldc.vector;
+    cl_int doConj = kargs->K;
+
+    INIT_KARG(&args[0], kargs->B);
+    INIT_KARG(&args[1], kargs->C);
+    INIT_KARG(&args[2], kargs->D);
+    initSizeKarg(&args[3], kargs->N);
+    initSizeKarg(&args[4], kargs->offBX);
+    INIT_KARG(&args[5], incx);
+    initSizeKarg(&args[6], kargs->offCY);
+    INIT_KARG(&args[7], incy);
+    INIT_KARG(&args[8], doConj);
+}
+
+/**
+ * Store the work-group size of the current step in subdims->bwidth.
+ *
+ * SubproblemDim is part of the kernel key, so recording the work-group size
+ * there makes a different kernel get generated (and cached) whenever the
+ * work-group size changes. This matters because kprintf unrolls the
+ * reduction loop based on the work-group size.
+ *
+ * args    - CLBlasKargs embedded in a SolutionStep (recovered with
+ *           container_of to reach the step's granularity).
+ * subdims - its bwidth member is overwritten.
+ * extra   - unused.
+ **/
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    DUMMY_ARG_USAGE(extra);
+    CLBlasKargs *blasArgs = (CLBlasKargs*)args;
+    SolutionStep *solStep = container_of(blasArgs, args, SolutionStep);
+
+    subdims->bwidth = solStep->pgran.wgSize[0] * solStep->pgran.wgSize[1];
+}
+
diff --git a/src/library/blas/gens/fetch.c b/src/library/blas/gens/fetch.c
new file mode 100644
index 0000000..b3c3011
--- /dev/null
+++ b/src/library/blas/gens/fetch.c
@@ -0,0 +1,2190 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * COMMON DESCRIPTION:
+ *
+ * This module implements generation of fetches from memory to registers.
+ * It supports various optimization strategies depending on the addressing
+ * modes used, the size of tiles, etc. Such a strategy is provided by an
+ * object called an addressing agent.
+ *
+ * The module supports explicit statement reordering so as to group together
+ * scattered ALU and FETCH statements. The reordering is implemented by means
+ * of the statement batch. Scheme of priority assignment for statements put
+ * to the batch within the same call:
+ *      - Statements declaring and initializing variables have the highest
+ *        priority because all the subsequent ones depend on them.
+ *      - Fetch statements have a decreased priority if any preparative
+ *        statements have really been generated.
+ *      - Statements for updating variables have an even lower priority.
+ *      - If a variable-updating statement has been generated before the
+ *        full tile fetch has completed, the priority of the next fetch
+ *        statement is decreased so as not to disturb statement dependencies.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <clblas_stddef.h>
+#include <solution_seq.h>
+#include <trace_malloc.h>
+
+#include "blas_kgen.h"
+
+// NOTE(review): MAX_LENGTH is not referenced in the part of the file seen here
+#define MAX_LENGTH 4096
+// number of bits in an 'int', assuming 8-bit bytes
+#define BITS_INT (sizeof(int) * 8)
+
+// forward declaration, needed by the callback prototypes of the addressing agent
+struct FetchContext;
+
+/*
+ * Module-wide limits and constants
+ */
+enum {
+    // number of auxiliary variable names an addressing agent can keep
+    MAX_AUXILIARY_VARNUM = 32,
+    // capacity of the agent table kept in a fetch context
+    MAX_ADDR_AGENTS = 8,
+    // size in bytes of the private area reserved in each addressing agent
+    ADDR_AGENT_PRIVATE_SIZE = 64,
+    /*
+     * Buffer size sufficient to fit a declaration of a vectorized coordinate,
+     * expressions for all components, operators for building a correct
+     * syntax construction, and blanks between 2 adjacent component
+     * initializers
+     */
+    COORD_BUFSIZE = (MAX_OPENCL_VECTOR_LENGTH + 1) * (sizeof(Kstring) + 2) + 16,
+    /*
+     * Priority of all statements declaring and initializing some variables
+     */
+    PREPARE_VARS_STMT_PRIORITY = 0,
+    /*
+     * Optimization levels that are not specific to a particular addressing
+     * agent; agents mask them out when matching against the requested levels
+     */
+    GENERIC_OPT_LEVELS = FOPTLEV_PREFETCH |
+                         FOPTLEV_CAN_SHARE_TMP_AB |
+                         FOPTLEV_MERGE_FETCHES
+};
+
+/*
+ * Agent for some addressing scheme. Encapsulates creation and updating
+ * of auxiliary variables and building of offset expressions
+ */
+typedef struct AddrAgent {
+    // names of the auxiliary variables owned by the agent
+    Kstring vars[MAX_AUXILIARY_VARNUM];
+    // usage counters for A (index 0) and B (index 1)
+    int usageCount[2];
+    // loop preparation counters for A (index 0) and B (index 1)
+    int loopPrepCount[2];
+    // agent specific private data
+    char priv[ADDR_AGENT_PRIVATE_SIZE];
+
+    // Check whether the agent suits the passed fetch context
+    bool (*match)(const struct FetchContext*);
+    /*
+     * Generate code preparing needed variables. Must return 1 if some
+     * variables have actually been prepared, 0 otherwise
+     */
+    int (*prepareVars)(struct FetchContext*);
+    /*
+     * Generate code updating variables. Must return 1 if some variables
+     * have actually been updated, 0 otherwise.
+     * 'stmtPriority' is the priority that must be given to any statement
+     * the agent is going to add to the batch
+     */
+    int (*updateVars)(struct FetchContext*, unsigned int nextLine,
+                      unsigned int nextVec, int stmtPriority);
+    /*
+     * Print to the kstring the address offset expression for the tile
+     * element identified with the 'line' and 'vec' arguments
+     */
+    void (*sprintfAddrOffset)(Kstring*, struct FetchContext*,
+                              unsigned int line, unsigned int vec);
+} AddressingAgent;
+
+// Properties of the current operation of offset evaluation.
+struct OffsetEvalProps {
+    // global size K is in vectors
+    bool gkInVect;
+    // all coordinates are in vectors
+    bool coordInVect;
+    /*
+     * don't multiply the coordinate in the second physical dimension
+     * by the leading dimension, it is already done
+     */
+    bool ldNotMul;
+    /*
+     * Vector length of the linear component in the leading dimension.
+     * The number of linear coordinates in the leading dimension taken
+     * by an addressing agent at a time at offset evaluation must be
+     * equal to this number.
+     */
+    unsigned int leadVecLen;
+};
+
+/*
+ * State shared by all the steps of fetch code generation
+ */
+typedef struct FetchContext {
+    // addressing mode that should be used in fetch operations
+    FetchAddrMode addrMode;
+    // optimization levels of code generation
+    FetchOptLevel optLevels;
+    // table of available agents; an entry with NULL 'match' ends the table
+    AddressingAgent agents[MAX_ADDR_AGENTS];
+    // agent selected for the current fetch
+    AddressingAgent *currAgent;
+    // NOTE(review): presumably the agent selected for the previous fetch - confirm
+    AddressingAgent *prevAgent;
+    // generator settings
+    const BlasGenSettings *gset;
+    // options of the current fetch; 'mrole' selects between matrix A and B
+    const FetchOpts *fopts;
+    // statement batch used at the current generation
+    struct StatementBatch *batch;
+    // Respective physical tile in global memory
+    Tile physTile;
+    // physical dimension passed in the outer loop
+    int outerDim;
+    // properties of the current offset evaluation
+    struct OffsetEvalProps oevp;
+    bool isLoopPreparation;
+    // markers of context validity for matrix A (index 0) and B (index 1)
+    bool valid[2];
+} FetchContext;
+
+/*
+ * Components of a linear offset along one physical dimension.
+ * An empty string means that the component is absent. The resulting
+ * expression has the form "(base + offset) % bound".
+ */
+struct PhysOffsetComponents {
+    // base coordinate
+    Kstring base;
+    // offset relative to the base
+    Kstring offset;
+    // bound the sum is wrapped around; used for cyclical addressing
+    Kstring bound;
+};
+
+/*
+ * Raw leading dimension. This is a pair of a leading dimension
+ * expressed in number of elements and the value with which it
+ * should be scaled for correct addressing.
+ * Scale set to '0' means that the value in elements matches the
+ * value in vectors
+ */
+struct RawLD {
+    // expression (variable name or literal) of the leading dimension
+    Kstring str;
+    // scaling value; set to the vector length or to 0 in this file
+    unsigned int scale;
+};
+
+// characters used to build the ".sN" component suffix of a vector variable
+static const char *vectComponents = "0123456789abcdef";
+
+// defined below among the agents; needed earlier by genInitVectCoord()
+static void sprintfOffsetStateless(Kstring *expr, FetchContext *fctx,
+                                   unsigned int line, unsigned int vec);
+
+static void initStatelessAgent(AddressingAgent *agent);
+static void initTmpCoordAgent(AddressingAgent *agent);
+static void initPersCoordAgent(AddressingAgent *agent);
+
+// NULL-terminated table of initializers, one per supported addressing agent
+void (*initAgentsTable[])(AddressingAgent *agent) = {
+    initStatelessAgent,
+    initTmpCoordAgent,
+    initPersCoordAgent,
+    NULL
+};
+
+// Check if the kstring holds exactly the literal "1"
+static __inline bool
+isOne(const Kstring *kstr)
+{
+    const char *text = kstr->buf;
+
+    if (text[0] != '1') {
+        return false;
+    }
+
+    return (text[1] == '\0');
+}
+
+// Check if the kstring holds exactly the literal "0"
+static __inline bool
+isZero(const Kstring *kstr)
+{
+    const char *text = kstr->buf;
+
+    if (text[0] != '0') {
+        return false;
+    }
+
+    return (text[1] == '\0');
+}
+
+/*
+ * Check if the matrix selected with the 'mrole' field of the fetch
+ * options is stored in the local memory
+ */
+static __inline bool
+isLocalMemoryUsed(const FetchOpts *fopts)
+{
+    if (fopts->mrole == MATRIX_A) {
+        return (fopts->memA == CLMEM_LOCAL_MEMORY);
+    }
+    if (fopts->mrole == MATRIX_B) {
+        return (fopts->memB == CLMEM_LOCAL_MEMORY);
+    }
+
+    // neither A nor B is being fetched
+    return false;
+}
+
+// Number of vectors in one line of the tile
+static __inline unsigned int
+tileVecColsNum(const Tile *physTile)
+{
+    const Tile *tile = physTile;
+
+    return tile->nrCols / tile->vecLen;
+}
+
+// Check if the FOPTLEV_MERGE_FETCHES optimization level is enabled
+static __inline bool
+canBeFetchesMerged(const FetchContext *fctx)
+{
+    if (fctx->optLevels & FOPTLEV_MERGE_FETCHES) {
+        return true;
+    }
+
+    return false;
+}
+
+/*
+ * Tell whether the linear offsets along the dimension K can be
+ * shared between the tiles A and B.
+ *
+ * All the following must hold: tile A is not transposed while tile B is,
+ * both the matrices have the same vector length, the current agent is the
+ * same as the previous one, and the FOPTLEV_CAN_SHARE_TMP_AB level is set.
+ */
+static bool
+canBeKoffShared(const FetchContext *fctx)
+{
+    const BlasGenSettings *settings = fctx->gset;
+    const unsigned int lenA = getVecLen(settings, CLBLAS_GEMM, MATRIX_A);
+    const unsigned int lenB = getVecLen(settings, CLBLAS_GEMM, MATRIX_B);
+
+    if (settings->tileA.trans || !settings->tileBX.trans) {
+        return false;
+    }
+    if (lenA != lenB) {
+        return false;
+    }
+    if (fctx->currAgent != fctx->prevAgent) {
+        return false;
+    }
+
+    return ((fctx->optLevels & FOPTLEV_CAN_SHARE_TMP_AB) != 0);
+}
+
+// Get the destination tile matching the matrix being fetched
+static __inline
+const Tile* getDstTile(const FetchContext *fctx)
+{
+    const BlasGenSettings *settings = fctx->gset;
+
+    if (fctx->fopts->mrole != MATRIX_A) {
+        return &settings->tileBX;
+    }
+
+    return &settings->tileA;
+}
+
+// Read the validity marker for the matrix being fetched (0 - A, 1 - other)
+static __inline bool
+isFetchContextValid(const FetchContext *fctx)
+{
+    const bool *markers = fctx->valid;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        return markers[0];
+    }
+
+    return markers[1];
+}
+
+// Clear the validity marker for the matrix being fetched (0 - A, 1 - other)
+static __inline void
+invalidateFetchContext(FetchContext *fctx)
+{
+    if (fctx->fopts->mrole == MATRIX_A) {
+        fctx->valid[0] = false;
+    }
+    else {
+        fctx->valid[1] = false;
+    }
+}
+
+// Usage counter of the current agent for the matrix being fetched
+static __inline int
+agentUsageCount(const FetchContext *fctx)
+{
+    const AddressingAgent *agent = fctx->currAgent;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        return agent->usageCount[0];
+    }
+
+    return agent->usageCount[1];
+}
+
+// Increment the current agent's usage counter for the matrix being fetched
+static __inline void
+incAgentUsageCount(FetchContext *fctx)
+{
+    int *counters = fctx->currAgent->usageCount;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        counters[0]++;
+    }
+    else {
+        counters[1]++;
+    }
+}
+
+// Loop preparation counter of the current agent for the matrix being fetched
+static __inline int
+agentLoopPrepCount(const FetchContext *fctx)
+{
+    const AddressingAgent *agent = fctx->currAgent;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        return agent->loopPrepCount[0];
+    }
+
+    return agent->loopPrepCount[1];
+}
+
+/*
+ * Increment the current agent's loop preparation counter
+ * for the matrix being fetched
+ */
+static __inline void
+incAgentLoopPrepCount(FetchContext *fctx)
+{
+    int *counters = fctx->currAgent->loopPrepCount;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        counters[0]++;
+    }
+    else {
+        counters[1]++;
+    }
+}
+
+/*
+ * Get the physical dimension (0 or 1) the dimension K is mapped to for
+ * the tile being fetched. It is 1 for a transposed tile A or
+ * a non transposed tile B, and 0 in the remaining cases.
+ */
+static int
+bwidthPhysDimension(const FetchContext *fctx)
+{
+    const Tile *dst = getDstTile(fctx);
+    const int forA = (fctx->fopts->mrole == MATRIX_A) ? 1 : 0;
+    const int transposed = (dst->trans) ? 1 : 0;
+
+    return (forA == transposed) ? 1 : 0;
+}
+
+/*
+ * Translate tile multiplication flags to a fetch addressing mode:
+ * skewing or global cycling along A, B or K selects cyclical addressing
+ * for that dimension, and wrapping around the tail selects tail padding
+ * along K.
+ */
+static FetchAddrMode
+fetchAddrModeFromMulOpts(const TileMulOpts *mulOpts)
+{
+    const TileMulFlags flags = mulOpts->flags;
+    FetchAddrMode result = FETCH_ADDR_NORMAL;
+
+    if ((flags & (TILEMUL_SKEW_A | TILEMUL_GLOBAL_CYCLIC_A)) != 0) {
+        result |= FETCH_ADDR_A_CYCLICAL;
+    }
+
+    if ((flags & (TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_B)) != 0) {
+        result |= FETCH_ADDR_B_CYCLICAL;
+    }
+
+    if ((flags & (TILEMUL_SKEW_K | TILEMUL_GLOBAL_CYCLIC_K)) != 0) {
+        result |= FETCH_ADDR_K_CYCLICAL;
+    }
+
+    if ((flags & TILEMUL_WRAP_AROUND_TAIL) != 0) {
+        result |= FETCH_ADDR_TAILK_PADD;
+    }
+
+    return result;
+}
+
+/*
+ * Print the name of the n-th component of a variable having 'maxn'
+ * components. For a single component variable this is just the base name,
+ * otherwise the base name followed by the ".sN" suffix.
+ */
+static void
+sprintfVectorComponent(
+    Kstring *kstr,
+    const char *baseName,
+    unsigned int n,
+    unsigned int maxn)
+{
+    assert(n < maxn);
+
+    if (maxn != 1) {
+        ksprintf(kstr, "%s.s%c", baseName, vectComponents[n]);
+        return;
+    }
+
+    kstrcpy(kstr, baseName);
+}
+
+/*
+ * Print a base coordinate scaled so that it is expressed in vectors.
+ *
+ * The name is right-shifted by log2 of the tile vector length only for the
+ * physical dimension 0, when coordinates are not in vectors already and
+ * the shift is not zero. In all the other cases the name is copied as is.
+ */
+static void
+sprintfNormalizedBaseCoord(
+    Kstring *kstr,
+    const char *name,
+    int physDim,
+    FetchContext *fctx)
+{
+    const int log2Vlen = findHighestSetBit(fctx->physTile.vecLen);
+    const bool inElements = !(physDim || fctx->oevp.coordInVect);
+
+    if (inElements && (log2Vlen != 0)) {
+        ksprintf(kstr, "(%s >> %d)", name, log2Vlen);
+    }
+    else {
+        kstrcpy(kstr, name);
+    }
+}
+
+/*
+ * Print 'len' consecutive offsets starting from 'base': a plain number
+ * if 'len' is 1, or a "(uintN)(base, base + 1, ...)" vector literal
+ * otherwise.
+ */
+static void
+sprintfOffsetVector(Kstring *kstr, unsigned int base, unsigned int len)
+{
+    unsigned int step;
+
+    if (len == 1) {
+        ksprintf(kstr, "%u", base);
+        return;
+    }
+
+    // opening part with the first component
+    ksprintf(kstr, "(uint%u)(%u", len, base);
+
+    // remaining components
+    step = 1;
+    while (step < len) {
+        kstrcatf(kstr, ", %u", base + step);
+        step++;
+    }
+
+    kstrcatf(kstr, "%c", ')');
+}
+
+/*
+ * Print the linear offset expression along one physical dimension.
+ *
+ * The expression is built from the components in 'comp':
+ *      - an empty or "0" base/offset component is skipped;
+ *      - if both the base and the offset are present, they are summed;
+ *        'swapBaseOff' set puts the offset before the base in the sum;
+ *      - if the bound is not empty, the result is taken modulo the bound.
+ * If neither the base nor the offset contributes, 'expr' is left empty.
+ */
+static void
+sprintfLinearOffset(
+    Kstring *expr,
+    const struct PhysOffsetComponents *comp,
+    bool swapBaseOff)
+{
+    int cnt = 0;
+    // the last contributing component; used if there is only one of them
+    const Kstring *kstr = NULL;
+    bool isBounded;
+
+    expr->buf[0] = '\0';
+    if (!isKstringEmpty(&comp->base) && !isZero(&comp->base)) {
+        cnt++;
+        kstr = &comp->base;
+    }
+    if (!isKstringEmpty(&comp->offset) && !isZero(&comp->offset)) {
+        cnt++;
+        kstr = &comp->offset;
+    }
+
+    if (cnt == 0) {
+        return;
+    }
+
+    isBounded = !isKstringEmpty(&comp->bound);
+    if (cnt == 2) {
+        const Kstring *first = (swapBaseOff) ? &comp->offset : &comp->base;
+        const Kstring *second = (swapBaseOff) ? &comp->base : &comp->offset;
+
+        if (isBounded) {
+            /*
+             * "%s" requires a pointer to char, so the buffer itself
+             * is passed, not a pointer to the whole array
+             */
+            ksprintf(expr, "(%s + %s) %% %s",
+                     first->buf, second->buf, comp->bound.buf);
+        }
+        else {
+            ksprintf(expr, "%s + %s", first->buf, second->buf);
+        }
+    }
+    else {
+        if (isBounded) {
+            ksprintf(expr, "%s %% %s", kstr->buf, comp->bound.buf);
+        }
+        else {
+            kstrcpy(expr, kstr->buf);
+        }
+    }
+}
+
+/*
+ * Estimate whether address offset evaluation is cheap enough to do it
+ * without saving anything in advance.
+ * With zero 'kxy' the estimation is done for the coordinates along the
+ * dimension K, otherwise for the coordinates along rows of A or
+ * columns of B.
+ *
+ * Evaluation is considered cheap if addressing is relative, or if there
+ * are no more than 2 elements in this dimension and neither
+ * transformation to vectors (normalization) nor cycling is needed.
+ */
+static bool
+estimateOffsetEvalCheap(const FetchContext *fctx, int kxy)
+{
+    const int kdim = bwidthPhysDimension(fctx);
+    const Tile *tile = &fctx->physTile;
+    // shows if the estimated coordinates run over the tile rows
+    const bool overRows = (kxy) ? (kdim == 0) : (kdim != 0);
+    // normalization is only needed when running over vectors within a line
+    const bool needNorm = (tile->vecLen > 1) && !overRows;
+    unsigned int count;
+    FetchAddrMode relFlag, cycFlag;
+
+    if (overRows) {
+        count = tile->nrRows;
+    }
+    else {
+        count = tileVecColsNum(tile);
+    }
+
+    if (!kxy) {
+        relFlag = FETCH_ADDR_K_RELATIVE;
+        cycFlag = FETCH_ADDR_K_CYCLICAL;
+    }
+    else if (fctx->fopts->mrole == MATRIX_A) {
+        relFlag = FETCH_ADDR_A_RELATIVE;
+        cycFlag = FETCH_ADDR_A_CYCLICAL;
+    }
+    else {
+        relFlag = FETCH_ADDR_B_RELATIVE;
+        cycFlag = FETCH_ADDR_B_CYCLICAL;
+    }
+
+    if (fctx->addrMode & relFlag) {
+        return true;
+    }
+    if ((count > 2) || needNorm) {
+        return false;
+    }
+
+    return !(fctx->addrMode & cycFlag);
+}
+
+/*
+ * Predict whether register consumption gets high if the generator
+ * requests space for 'nrCoords' coordinates.
+ * 'isPers' tells if these are persistent coordinates. 'isSummary' tells if
+ * this is the summary number of coordinates for both the tiles rather than
+ * for just one of them; the limit is doubled in this case.
+ */
+static bool
+predictHighRegConsumption(
+    const FetchContext *fctx,
+    unsigned int nrCoords,
+    bool isPers,
+    bool isSummary)
+{
+    // TODO: take into account number of registers consumed by the tiles
+    unsigned int limit = (isPers) ? 12 : 16;
+
+    DUMMY_ARG_USAGE(fctx);
+
+    if (isSummary) {
+        limit *= 2;
+    }
+
+    return (nrCoords >= limit);
+}
+
+/*
+ * Print the leading dimension of the matrix being fetched, expressed in
+ * vectors. If the generator settings don't state that leading dimensions
+ * are in vectors already, and the tile vector length gives a non zero
+ * shift, the variable is right-shifted; otherwise it is printed as is.
+ */
+static void
+sprintfLeadingDimension(Kstring *ld, const FetchContext *fctx)
+{
+    const KernelVarNames *names = &fctx->gset->varNames;
+    const char *ldName;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        ldName = names->lda;
+    }
+    else {
+        ldName = names->ldb;
+    }
+
+    if (!(fctx->gset->flags & BGF_LD_IN_VECTORS)) {
+        const int log2Vlen = findHighestSetBit(fctx->physTile.vecLen);
+
+        if (log2Vlen != 0) {
+            ksprintf(ld, "(%s >> %d)", ldName, log2Vlen);
+            return;
+        }
+    }
+
+    kstrcpy(ld, ldName);
+}
+
+/*
+ * Fill the raw leading dimension for the matrix being fetched:
+ * the name of the leading dimension variable, and the scale which is 0
+ * if leading dimensions are in vectors already, or the tile vector
+ * length otherwise
+ */
+static void
+fillRawLD(
+    struct RawLD *ld,
+    const FetchContext *fctx)
+{
+    const KernelVarNames *names = &fctx->gset->varNames;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        kstrcpy(&ld->str, names->lda);
+    }
+    else {
+        kstrcpy(&ld->str, names->ldb);
+    }
+
+    if (fctx->gset->flags & BGF_LD_IN_VECTORS) {
+        ld->scale = 0;
+    }
+    else {
+        ld->scale = fctx->physTile.vecLen;
+    }
+}
+
+/*
+ * Print the bound for the K component in case of storing a matrix
+ * in the global memory.
+ *
+ * The size K variable is printed as is if K is mapped to the physical
+ * dimension 1, if the global size K is in vectors already, or if the
+ * vector length gives a zero shift. Otherwise it is right-shifted,
+ * being rounded up beforehand in the tail padding mode.
+ */
+static void
+sprintfGboundK(Kstring *kstr, const FetchContext *fctx)
+{
+    const char *sizeK = fctx->gset->varNames.sizeK;
+    const unsigned int vlen = fctx->physTile.vecLen;
+    const int log2Vlen = findHighestSetBit(vlen);
+    const int kdim = bwidthPhysDimension(fctx);
+
+    if (kdim || fctx->oevp.gkInVect || (log2Vlen == 0)) {
+        kstrcpy(kstr, sizeK);
+        return;
+    }
+
+    if (fctx->addrMode & FETCH_ADDR_TAILK_PADD) {
+        ksprintf(kstr, "((%s + %u) >> %d)", sizeK, vlen - 1, log2Vlen);
+    }
+    else {
+        ksprintf(kstr, "(%s >> %d)", sizeK, log2Vlen);
+    }
+}
+
+/*
+ * Select the addressing agent for the current fetch and store it to
+ * fctx->currAgent.
+ *
+ * The optimization levels are tried one bit at a time, from the highest
+ * bit to the lowest one, skipping the bits not requested in
+ * fctx->optLevels. After all the bits, one more attempt is made with no
+ * level bit set. At each attempt the agent table is scanned up to the
+ * first entry with NULL 'match', and the first matching agent is taken.
+ * fctx->optLevels is restored on return.
+ */
+static void
+selectAddrAgent(FetchContext *fctx)
+{
+    unsigned int level;
+    FetchOptLevel origLevels;
+    FetchOptLevel prefLev, mergeLev;
+    int i;
+    bool last = false;
+
+    prefLev = fctx->optLevels & FOPTLEV_PREFETCH;
+    /*
+     * The merge level doesn't affect addressing agents in any way.
+     * So, clear it for a time so as they wouldn't even know if it
+     * is used or not.
+     */
+    mergeLev = fctx->optLevels & FOPTLEV_MERGE_FETCHES;
+    origLevels = fctx->optLevels & ~FOPTLEV_MERGE_FETCHES;
+    fctx->currAgent = NULL;
+
+    /*
+     * Selecting criteria: Any of the agents supporting an optimization level
+     * as high as possible which is suitable for these generator settings.
+     *
+     * The initial mask is built from an unsigned one: shifting a signed
+     * one to the sign bit position is undefined behavior.
+     */
+    for (level = 1u << (BITS_INT - 1);
+         !last && (fctx->currAgent == NULL); level >>= 1) {
+
+        // the mask is exhausted; do the last attempt without any level bit
+        last = (level == 0);
+        if (!(last || (origLevels & level))) {
+            continue;
+        }
+
+        fctx->optLevels = (FetchOptLevel)level | prefLev;
+
+        for (i = 0; i < MAX_ADDR_AGENTS; i++) {
+            fctx->currAgent = &fctx->agents[i];
+            // end of the table
+            if (fctx->currAgent->match == NULL) {
+                fctx->currAgent = NULL;
+                break;
+            }
+            if (fctx->currAgent->match(fctx)) {
+                break;
+            }
+            fctx->currAgent = NULL;
+        }
+    }
+
+    fctx->optLevels = origLevels | mergeLev;
+
+    assert(fctx->currAgent != NULL);
+}
+
+/*
+ * Evaluate how many steps along the dimension K can be covered with
+ * a persistent vectorized coordinate of at most 'maxVarVecLen' components.
+ * Returns 0 if persistent coordinates must not be used.
+ */
+static unsigned int
+persVarDepthK(const FetchContext *fctx, unsigned int maxVarVecLen)
+{
+    const Tile *tile = &fctx->physTile;
+    const int kdim = bwidthPhysDimension(fctx);
+    unsigned int nrSaved;
+    unsigned int steps;
+    unsigned int allSteps;
+
+    // number of coordinates that fit in the variable
+    nrSaved = tileVectorsNum(tile);
+    nrSaved = umin(nrSaved, maxVarVecLen);
+
+    if (!kdim) {
+        steps = nrSaved / tile->nrRows;
+        allSteps = tileVecColsNum(tile);
+    }
+    else {
+        steps = nrSaved / tileVecColsNum(tile);
+        allSteps = tile->nrRows;
+    }
+
+    /*
+     * If the dimension K is traversed in the inner loop, and
+     * not all coordinates can be saved, then using persistent
+     * coordinates is prohibited because there is no chance to
+     * update the vectorized coordinate till the end of the whole
+     * tile fetch.
+     */
+    if ((fctx->outerDim == kdim) || (steps >= allSteps)) {
+        return steps;
+    }
+
+    return 0;
+}
+
+/*
+ * Generate initialization of a coordinate variable covering 'lenXY'
+ * coordinates along X/Y and 'depthK' coordinates along K, and put it to
+ * the statement batch with the variable preparation priority.
+ *
+ * name     name of the coordinate variable
+ * lenXY    number of covered coordinates along rows of A/columns of B
+ * depthK   number of covered coordinates along the dimension K
+ * decl     if set, the variable is declared along with its initialization,
+ *          otherwise its components are assigned one by one
+ * isConst  declare the variable with the 'const' qualifier;
+ *          takes effect only at declaration
+ *
+ * All the offsets are evaluated with the stateless agent.
+ */
+static void
+genInitVectCoord(
+    FetchContext *fctx,
+    const Kstring *name,
+    unsigned int lenXY,
+    unsigned int depthK,
+    bool decl,
+    bool isConst)
+{
+    const Tile *physTile;
+    char buf[COORD_BUFSIZE];
+    char *p = NULL;
+    unsigned int i, k, lenFull;
+    int kdim;
+    const char *declPref;
+    bool needVect;
+    Kstring aoff;
+    unsigned int vlen;
+    Kstring coordType;
+
+    kdim = bwidthPhysDimension(fctx);
+    physTile = &fctx->physTile;
+    lenFull = (kdim) ? tileVecColsNum(physTile) : physTile->nrRows;
+
+    /*
+     * Use vectorization at offset evaluation if it makes sense, to avoid
+     * extra casting of a coordinate in vectors to a coordinate in elements
+     */
+    needVect = decl &&
+               ( (!kdim && (depthK > 1) && (lenXY == 1)) ||
+                 (kdim && (depthK == 1) && (lenXY > 1)) );
+    vlen = lenXY * depthK;
+
+    // coordinate declarator
+    declPref = (isConst) ? "const " : "";
+    if (decl) {
+        if (vlen == 1) {
+            ksprintf(&coordType, "%suint", declPref);
+        }
+        else {
+            ksprintf(&coordType, "%suint%u", declPref, vlen);
+        }
+    }
+
+    // declaration + initialization
+    if (needVect || (decl && (vlen == 1))) {
+        // the whole variable is initialized with a single expression
+        if (needVect) {
+            fctx->oevp.leadVecLen = vlen;
+        }
+        sprintfOffsetStateless(&aoff, fctx, 0, 0);
+        kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY,
+                        "%s %s = %s;\n",
+                        coordType.buf, name->buf, aoff.buf);
+        fctx->oevp.leadVecLen = 1;
+    }
+    else {
+        // number of component initializers printed so far
+        unsigned int n = 0;
+
+        if (decl) {
+            p = buf + sprintf(buf, "%suint%u %s = {",
+                              declPref, vlen, name->buf);
+        }
+
+        for (k = 0; k < depthK; k++) {
+            for (i = 0; i < lenXY; i++) {
+                unsigned int line, vec;
+
+                line = (kdim) ? k : i;
+                vec = (kdim) ? i : k;
+                sprintfOffsetStateless(&aoff, fctx, line, vec);
+                if (decl) {
+                    const char *pref = (n % 3) ? ", " : "";
+
+                    p += sprintf(p, "%s%s", pref, aoff.buf);
+                    // split long lines
+                    n++;
+                    if (!(n % 3) && (n != vlen)) {
+                        p += sprintf(p, "%s", ",\n\t\t");
+                    }
+                }
+                else {
+                    kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY,
+                                    "%s.s%c = %s;\n",
+                                    name->buf, vectComponents[k * lenFull + i],
+                                    aoff.buf);
+                }
+            }
+        }
+
+        if (decl) {
+            // check for the room before writing the terminator, not after
+            assert(p + 4 < buf + COORD_BUFSIZE);
+            strcpy(p, "};\n");
+            kgenAddStmtToBatch(fctx->batch, PREPARE_VARS_STMT_PRIORITY, buf);
+        }
+    }
+}
+
+
+/**************** Implement different addressing agents *********************/
+
+/********** Stateless (without precomputing) memory addressing agent ********/
+
+/*
+ * The stateless agent matches only if no optimization level beyond
+ * the generic ones is requested
+ */
+static bool
+matchStateless(const FetchContext *fctx)
+{
+    return ((fctx->optLevels & ~GENERIC_OPT_LEVELS) == 0);
+}
+
+/*
+ * Print to 'expr' the offset expression for the tile element identified
+ * with 'line' and 'vec', without using any precomputed variables.
+ *
+ * Two linear offsets are built first: one relating to X or Y, and one
+ * relating to bwidth (the dimension K). Each of them consists of an
+ * optional base coordinate, a literal offset and an optional bound applied
+ * in the cyclical mode. Depending on the physical dimension K is mapped
+ * to, one of them becomes the component in the leading dimension and the
+ * other one the component in the second dimension. The latter is then
+ * combined with the leading dimension by means of sprintfFastScalarMad().
+ *
+ * fctx->oevp affects the evaluation: 'leadVecLen' greater than 1 requests
+ * a vectorized offset in the leading dimension, and 'ldNotMul' makes "1"
+ * be used in place of the leading dimension.
+ */
+static void
+sprintfOffsetStateless(
+    Kstring *expr,
+    FetchContext *fctx,
+    unsigned int line,
+    unsigned int vec)
+{
+    FetchAddrMode addrMode = fctx->addrMode;
+    bool isRel;     // shows if addressing is relative
+    const Tile *physTile;
+    bool useLocal;
+    int kdim;
+    unsigned int i, u;
+    struct PhysOffsetComponents comps;
+    // linear offsets in the leading and in the second physical dimension
+    Kstring leadStr, secStr;
+    struct RawLD leadDim;
+    // shows if the offset in the leading dimension is vectorized
+    bool vectLead;
+    bool swap;
+    Kstring *kstr;
+    const KernelVarNames *kvars = &fctx->gset->varNames;
+    unsigned int vecLen;
+    unsigned int offVlen;
+    const char *p;
+    FetchAddrMode amask;
+    MatrixRole mrole = fctx->fopts->mrole;
+    const SubproblemDim *subdim = fctx->gset->subdims;
+
+    emptyKstring(&secStr);
+    emptyKstring(&leadStr);
+
+    offVlen = fctx->oevp.leadVecLen;
+    vectLead = (offVlen > 1);
+    physTile = &fctx->physTile;
+    vecLen = physTile->vecLen;
+
+    kdim = bwidthPhysDimension(fctx);
+    useLocal = isLocalMemoryUsed(fctx->fopts);
+
+    // fill components relating to X or Y
+    memset(&comps, 0, sizeof(comps));
+    amask = (mrole == MATRIX_A) ? FETCH_ADDR_A_RELATIVE :
+                                  FETCH_ADDR_B_RELATIVE;
+    isRel = ((addrMode & amask) != 0);
+
+    // base
+    if (!isRel) {
+        p = (mrole == MATRIX_A) ? kvars->coordA : kvars->coordB;
+        sprintfNormalizedBaseCoord(&comps.base, p, 1 - kdim, fctx);
+    }
+    // offset
+    u = (kdim) ? vec : line;
+    i = (kdim) ? offVlen : 1;
+    // NOTE(review): 'i' is not zero unless 'leadVecLen' is, so this check
+    // looks always true - confirm
+    if (u || i) {
+        sprintfOffsetVector(&comps.offset, u, i);
+    }
+    // bound
+    amask = (mrole == MATRIX_A) ? FETCH_ADDR_A_CYCLICAL :
+                                  FETCH_ADDR_B_CYCLICAL;
+    if (addrMode & amask) {
+        if (useLocal || isRel) {
+            // the bound is the tile size in this dimension
+            u = (kdim) ? tileVecColsNum(physTile) : physTile->nrRows;
+            ksprintf(&comps.bound, "%u", u);
+        }
+        else {
+            // global bound
+            if (kdim) {
+                /*
+                 * For X and Y dimension the single task is to prevent
+                 * exceeding buffer bounds. Using leading dimension for
+                 * this is the easiest.
+                 */
+                 sprintfLeadingDimension(&comps.bound, fctx);
+            }
+            else {
+                const char *var = (fctx->fopts->mrole == MATRIX_A) ?
+                    fctx->gset->varNames.sizeM : fctx->gset->varNames.sizeN;
+
+                kstrcpy(&comps.bound, var);
+            }
+        }
+    }
+
+    // X/Y lies in the leading dimension if K is mapped to the dimension 1
+    kstr = (kdim) ? &leadStr : &secStr;
+    swap = kdim && vectLead;
+    sprintfLinearOffset(kstr, &comps, swap);
+
+
+    // fill components relating to bwidth
+    memset(&comps, 0, sizeof(comps));
+    isRel = ((addrMode & FETCH_ADDR_K_RELATIVE) != 0);
+
+    // base
+    if (!isRel) {
+        sprintfNormalizedBaseCoord(&comps.base, kvars->k, kdim, fctx);
+    }
+    // offset
+    u = (kdim) ? line : vec;
+    i = (kdim) ? 1 : offVlen;
+    if (u || i) {
+        sprintfOffsetVector(&comps.offset, u, i);
+    }
+    // bound
+    if (addrMode & (FETCH_ADDR_K_CYCLICAL)) {
+        if (useLocal || isRel) {
+            if (useLocal) {
+                u = (unsigned int)subdim->bwidth;
+            }
+            else {
+                u = (kdim) ? physTile->nrRows : tileVecColsNum(physTile);
+            }
+            ksprintf(&comps.bound, "%u", u);
+        }
+        else {
+            sprintfGboundK(&comps.bound, fctx);
+        }
+    }
+
+    // K lies in the leading dimension if it is mapped to the dimension 0
+    kstr = (kdim) ? &secStr : &leadStr;
+    swap = !kdim && vectLead;
+    sprintfLinearOffset(kstr, &comps, swap);
+
+    // select the leading dimension the second component is multiplied by
+    if (fctx->oevp.ldNotMul) {
+        kstrcpy(&leadDim.str, "1");
+        leadDim.scale = 0;
+    }
+    else if (useLocal) {
+        // for the local memory it is a literal taken from the subproblem
+        leadDim.scale = 0;
+        if (kdim) {
+            u = (unsigned int)((mrole == MATRIX_A) ? subdim->y : subdim->x);
+        }
+        else {
+            u = (unsigned int)subdim->bwidth;
+        }
+        ksprintf(&leadDim.str, "%u", u / vecLen);
+    }
+    else {
+        fillRawLD(&leadDim, fctx);
+    }
+
+    // Build the full expression
+    if (!isKstringEmpty(&leadStr) && vectLead) {
+        Kstring tmp;
+
+        /*
+         * The vectorized leading component is added explicitly rather
+         * than passed to the mad as the addend
+         */
+        sprintfFastScalarMad(&tmp, &secStr, &leadDim.str,
+                             leadDim.scale, NULL);
+        if (isZero(&tmp)) {
+            kstrcpy(expr, leadStr.buf);
+        }
+        else {
+            ksprintf(expr, "%s + %s", leadStr.buf, tmp.buf);
+        }
+    }
+    else {
+        sprintfFastScalarMad(expr, &secStr, &leadDim.str,
+                             leadDim.scale, &leadStr);
+    }
+}
+
+/*
+ * Set up the stateless agent. It provides only the matching and
+ * the offset printing callbacks; everything else is zeroed.
+ */
+static void
+initStatelessAgent(AddressingAgent *agent)
+{
+    memset(agent, 0, sizeof(*agent));
+
+    agent->sprintfAddrOffset = sprintfOffsetStateless;
+    agent->match = matchStateless;
+}
+
+/************* Addressing agent using temporary coordinates ****************/
+
+/*
+ * Common approach:
+ *
+ * Save base offsets along both the physical dimensions so as to have
+ * only one add operation per each further offset evaluation.
+ * Prediction of high register consumption is used to decide how many
+ * offsets can be saved for each dimension.
+ * 2 attempts are made. On the first one, allocation of the maximal number
+ * of offsets is tried. This number is equal to the number of tile lines
+ * or vectors in a line respectively. If this number is predicted to cause
+ * high register consumption, then allocation of only one offset is tried.
+ * If the situation repeats, then the offsets in this dimension are not saved
+ * at all.
+ *
+ * The next point is that only those offsets are precomputed that are
+ * estimated to take a lot of computing resources.
+ *
+ * In case of cyclical mode in the dimension K, the global size K
+ * in vectors is saved.
+ *
+ * Offsets for A and B along the dimension K are shared if the caller
+ * advises to do that and the number of them for A and B is the same.
+ */
+
+/*
+ * Indices of the temporary coordinate agent variables in the 'vars' array
+ * of the agent.
+ * NOTE(review): meaning of the entries is inferred from their names and
+ * from the TmpCoordInfo fields - confirm
+ */
+enum {
+    // coordinate of A along rows
+    TMP_COORD_AY,
+    // coordinate of A along the dimension K
+    TMP_COORD_AK,
+    // bound variable along K for A
+    TMP_A_VSIZEK,
+    // coordinate of B along columns
+    TMP_COORD_BX,
+    // coordinate of B along the dimension K
+    TMP_COORD_BK,
+    // bound variable along K for B
+    TMP_B_VSIZEK
+};
+
+/*
+ * The structure stores the lengths of the vectorized temporary variables
+ * holding offsets for matrices A and B along rows/columns and the
+ * dimension K. It is kept in the private area of the addressing agent.
+ */
+typedef struct TmpCoordInfo {
+    // vector length of the offset coordinate of A along rows
+    unsigned int yaVlen;
+    // vector length of the offset coordinate of A along the dimension K
+    unsigned int kaVlen;
+    // vector length of the offset coordinate of B along columns
+    unsigned int xbVlen;
+    // vector length of the offset coordinate of B along the dimension K
+    unsigned int kbVlen;
+    /*
+     * show if the respective coordinates are
+     * declared as constants or not
+     */
+    bool yaIsConst;
+    bool kaIsConst;
+    bool xbIsConst;
+    bool kbIsConst;
+
+    // force relative addressing along K for the matrix A
+    bool forceRelA;
+    // force relative addressing along K for the matrix B
+    bool forceRelB;
+} MAY_ALIAS TmpCoordInfo;
+
+/*
+ * Select how many temporary coordinates can be allocated in addition to
+ * 'currNum' already allocated ones. Returns 'reqNum' if that many
+ * coordinates are not predicted to cause high register consumption,
+ * otherwise 1 if a single coordinate is affordable, otherwise 0.
+ */
+static unsigned int
+selectTmpCoordsNum(
+    const FetchContext *fctx,
+    unsigned int currNum,
+    unsigned int reqNum,
+    bool canShare)
+{
+    // first attempt: the full requested number
+    if (!predictHighRegConsumption(fctx, currNum + reqNum,
+                                   false, canShare)) {
+        return reqNum;
+    }
+
+    // second attempt: just one coordinate
+    if (!predictHighRegConsumption(fctx, currNum + 1,
+                                   false, canShare)) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Check if such a number of temporary coordinates makes any sense,
+ * i. e. will eventually lead to more efficient evaluation.
+ *
+ * num          number of temporary coordinates going to be saved
+ * isConst      shows if the coordinates are constant values
+ * kxy          0 for the coordinates along the dimension K, non zero for
+ *              the coordinates along rows of A or columns of B
+ * isLoopPrep   loop preparation flag; checked only for constant
+ *              coordinates along X or Y
+ */
+static bool
+tmpNumSanityCheck(
+    unsigned int num,
+    bool isConst,
+    int kxy,
+    bool isLoopPrep,
+    const FetchContext *fctx)
+{
+    // tile sizes in the physical dimensions 0 and 1
+    unsigned int maxCoords[2];
+    int dim;
+    bool ret = true;
+    const Tile *physTile = &fctx->physTile;
+
+    maxCoords[0] = tileVecColsNum(physTile);
+    maxCoords[1] = physTile->nrRows;
+    // physical dimension the checked coordinates are mapped to
+    dim = bwidthPhysDimension(fctx);
+    if (kxy) {
+        dim = 1 - dim;
+    }
+
+    /*
+     * Believe it is not reasonable if it is not a constant value
+     * and is used few times. It is also right for constant values along
+     * X and Y if they are prepared within a loop rather than in advance
+     * because the compiler is not able to recognize that those values don't
+     * need to be reevaluated at each loop iteration. It is also not
+     * reasonable if only one constant value is precomputed which doesn't
+     * actually simplify evaluating linear coordinates in the same dimension:
+     * believe it is so, if there is no vectorization at fetching or addressing
+     * is cyclical, or this is a coordinate mapped to the second physical
+     * dimension (because nevertheless this assumes multiplication by the
+     * leading dimension)
+     */
+
+    if (!isConst) {
+        // the value is reused once per coordinate in the other dimension
+        ret = (maxCoords[1 - dim] > 2);
+    }
+    else {
+        FetchAddrMode cycMode;
+        bool isCycled;
+
+        if (!kxy) {
+            cycMode = FETCH_ADDR_K_CYCLICAL;
+        }
+        else {
+            ret = (isLoopPrep || (maxCoords[1 - dim] > 1));
+            cycMode = (fctx->fopts->mrole == MATRIX_A) ? FETCH_ADDR_A_CYCLICAL :
+                                                         FETCH_ADDR_B_CYCLICAL;
+        }
+
+        // in the second physical dimension all the coordinates must be saved
+        ret = ret && (!dim || (num == maxCoords[dim]));
+
+        isCycled = ((fctx->addrMode & cycMode) != 0);
+        if (!dim) {
+            ret = ret && ((num > 1) || (physTile->vecLen > 1) || isCycled);
+        }
+
+        // with cyclical addressing a partial set of coordinates is rejected
+        ret = ret && !(isCycled && (num < maxCoords[dim]));
+    }
+
+    return ret;
+}
+
+/*
+ * Force relative addressing along the dimension K (zero 'kxy'), or along
+ * X and Y (non zero 'kxy'). The respective cyclical flags are cleared.
+ * In the latter case the flags for both A and B are affected.
+ */
+static __inline void
+forceRelativeAddressing(FetchContext *fctx, int kxy)
+{
+    if (kxy) {
+        fctx->addrMode |= (FETCH_ADDR_A_RELATIVE |
+                           FETCH_ADDR_B_RELATIVE);
+        fctx->addrMode &= ~(FETCH_ADDR_A_CYCLICAL |
+                            FETCH_ADDR_B_CYCLICAL);
+        return;
+    }
+
+    fctx->addrMode |= FETCH_ADDR_K_RELATIVE;
+    fctx->addrMode &= ~FETCH_ADDR_K_CYCLICAL;
+}
+
+/*
+ * match callback of the agent based on temporary coordinates.
+ *
+ * The agent is selected if, apart from the generic levels, exactly
+ * FOPTLEV_TMP_COORD_PRECOMPUTING is enabled and the offset evaluation
+ * is estimated as not cheap in at least one of the two directions.
+ */
+static bool
+matchTmpCoordBased(const FetchContext *fctx)
+{
+    if ((fctx->optLevels & ~GENERIC_OPT_LEVELS) !=
+        FOPTLEV_TMP_COORD_PRECOMPUTING) {
+
+        return false;
+    }
+
+    // the second direction is examined only if the first one is cheap
+    if (!estimateOffsetEvalCheap(fctx, 0)) {
+        return true;
+    }
+
+    return !estimateOffsetEvalCheap(fctx, 1);
+}
+
+/*
+ * prepareVars callback of the agent based on temporary coordinates
+ * (installed in initTmpCoordAgent()).
+ *
+ * For the tile of the current matrix role it selects how many temporary
+ * coordinates are kept along X/Y and along K, stores the selected lengths
+ * and constness flags in the agent's TmpCoordInfo, and generates the needed
+ * declarations by means of genInitVectCoord() and kgenBatchPrintf().
+ * The addressing mode and generator settings of the context are substituted
+ * only while a declaration is generated and are put back right after that.
+ *
+ * Returns non zero if at least one declaration has been generated,
+ * and zero otherwise.
+ */
+static int
+prepareTmpCoords(FetchContext *fctx)
+{
+    FetchAddrMode addrMode = fctx->addrMode;
+    Kstring *vars = fctx->currAgent->vars;
+    MatrixRole mrole = fctx->fopts->mrole;
+    const Tile *physTile;
+    const Kstring *kstr;
+    TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv;
+    int kdim;
+    // for sure known summary number of allocated coordinates
+    unsigned int coordsNum = 0;
+    unsigned int n;
+    unsigned int prepared = 0;  // number of generated declarations
+    unsigned int maxCoords[2];
+    bool canShare;
+    bool isConst;
+    bool normBoundK;
+    Kstring *boundVars[2] = {&vars[TMP_A_VSIZEK], &vars[TMP_B_VSIZEK]};
+    int bvidx;  // bound variable index in the previously declared array
+    bool skip = false;
+
+    /*
+     * Believe that the number of previously allocated coordinates
+     * for the other tile is reliable if the caller advises to share
+     * possible variables
+     */
+    canShare = canBeKoffShared(fctx);
+    if (canShare) {
+        if (mrole == MATRIX_A) {
+            coordsNum = info->xbVlen + info->kbVlen;
+        }
+        else {
+            coordsNum = info->yaVlen + info->kaVlen;
+        }
+    }
+
+    kdim = bwidthPhysDimension(fctx);
+    physTile = &fctx->physTile;
+    maxCoords[0] = tileVecColsNum(physTile);
+    maxCoords[1] = physTile->nrRows;
+    /*
+     * The bound along K is precomputed only if bwidthPhysDimension()
+     * returned 0, local memory is not used, addressing along K is cyclical
+     * and the vector length of the tile is greater than 1
+     */
+    normBoundK = !kdim && !isLocalMemoryUsed(fctx->fopts) &&
+                 (fctx->addrMode & FETCH_ADDR_K_CYCLICAL) &&
+                 (physTile->vecLen > 1);
+
+    // Stage 1: coordinates along X/Y (variable "ay" for A, "bx" for B)
+    n = 0;
+    if (!estimateOffsetEvalCheap(fctx, 1)) {
+        n = selectTmpCoordsNum(fctx, coordsNum, maxCoords[1 - kdim], canShare);
+        isConst = (n == maxCoords[1 - kdim]) || (kdim == fctx->outerDim);
+        if (!tmpNumSanityCheck(n, isConst, 1, fctx->isLoopPreparation, fctx)) {
+            n = 0;
+        }
+
+        /*
+         * Variable coordinates cannot be prepared before the loop starts.
+         * If prepared before the loop, the coordinates are considered as
+         * persistent for more adequate prediction of register consumption.
+         * Check also if the coordinates for X or Y have been
+         * already prepared at the loop preparation stage
+         */
+        if (fctx->isLoopPreparation) {
+            skip = !isConst ||
+                   predictHighRegConsumption(fctx, coordsNum + n,
+                                             true, canShare);
+        }
+        else {
+            skip = isConst &&
+                   (agentLoopPrepCount(fctx) > agentUsageCount(fctx));
+        }
+
+        if (!skip) {
+            if (mrole == MATRIX_A) {
+                kstrcpy(&vars[TMP_COORD_AY], "ay");
+                kstr = &vars[TMP_COORD_AY];
+                info->yaIsConst = isConst;
+            }
+            else {
+                kstrcpy(&vars[TMP_COORD_BX], "bx");
+                kstr = &vars[TMP_COORD_BX];
+                info->xbIsConst = isConst;
+            }
+
+            if (n) {
+                /*
+                 * Only offsets along rows of A or columns of B are needed.
+                 * So, ensure that the other offset components for A and B
+                 * don't contribute to the final expression. Setting for them
+                 * relative and not cycled addressing guarantees that the
+                 * respective expression will be equal to zero
+                 */
+                forceRelativeAddressing(fctx, 0);
+                // fire immediate generating of coordinates declaration
+                genInitVectCoord(fctx, kstr, n, 1, true, isConst);
+                // restore original addressing mode
+                fctx->addrMode = addrMode;
+                prepared++;
+            }
+        }
+
+        coordsNum += n;
+    }
+
+    // the stored length is left untouched if this stage has been skipped
+    if (!skip) {
+        if (mrole == MATRIX_A) {
+            info->yaVlen = n;
+        }
+        else {
+            info->xbVlen = n;
+        }
+    }
+
+    // Stage 2: bound along K (variable "vKA" for A, "vKB" for B)
+    bvidx = (mrole == MATRIX_A) ? 0 : 1;
+    if (normBoundK) {
+        // global size K in vectors for the cyclical addressing
+        if (canShare) {
+            // reuse the name stored for the other matrix
+            kstrcpy(boundVars[bvidx], boundVars[1 - bvidx]->buf);
+        }
+        else if (fctx->isLoopPreparation ||
+                 (agentLoopPrepCount(fctx) <= agentUsageCount(fctx))) {
+
+            const char *name;
+            Kstring boundK;
+
+            name = (mrole == MATRIX_A) ? "vKA" : "vKB";
+            kstrcpy(boundVars[bvidx], name);
+            sprintfGboundK(&boundK, fctx);
+            kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY,
+                            "const uint %s = %s;\n",
+                            boundVars[bvidx]->buf, boundK.buf);
+            prepared++;
+        }
+    }
+    else {
+        // clear the bound because it may be already not actual
+        emptyKstring(boundVars[bvidx]);
+    }
+
+    // Stage 3: coordinates along K; not done at the loop preparation stage
+    if (!fctx->isLoopPreparation) {
+        n = 0;
+
+        if (!estimateOffsetEvalCheap(fctx, 0)) {
+            unsigned int maxn;
+
+            // Ignore sharing if number of needed variables is not equal
+            if (canShare) {
+                maxn = (mrole == MATRIX_A) ? info->kbVlen : info->kaVlen;
+            }
+            else {
+                maxn = maxCoords[kdim];
+            }
+            n = selectTmpCoordsNum(fctx, coordsNum, maxn, canShare);
+            if (n != maxn) {
+                canShare = false;
+            }
+
+            if (canShare) {
+                // take the name and the constness stored for the other matrix
+                if (mrole == MATRIX_A) {
+                    kstrcpy(&vars[TMP_COORD_AK], vars[TMP_COORD_BK].buf);
+                    info->kaIsConst = info->kbIsConst;
+                }
+                else {
+                    kstrcpy(&vars[TMP_COORD_BK], vars[TMP_COORD_AK].buf);
+                    info->kbIsConst = info->kaIsConst;
+                }
+            }
+            else {
+                // 'canShare' is false here; select the number once again
+                n = selectTmpCoordsNum(fctx, coordsNum,
+                                       maxCoords[kdim], canShare);
+                isConst = (n == maxCoords[kdim]) || (kdim != fctx->outerDim);
+                if (!tmpNumSanityCheck(n, isConst, 0, false, fctx)) {
+                    n = 0;
+                }
+
+                if (mrole == MATRIX_A) {
+                    kstrcpy(&vars[TMP_COORD_AK], "ak");
+                    kstr = &vars[TMP_COORD_AK];
+                    info->kaIsConst = isConst;
+                }
+                else {
+                    kstrcpy(&vars[TMP_COORD_BK], "bk");
+                    kstr = &vars[TMP_COORD_BK];
+                    info->kbIsConst = isConst;
+                }
+
+                if (n) {
+                    const BlasGenSettings *gset = fctx->gset;
+                    BlasGenSettings newGset;
+
+                    // substitute normalized bound K if it has been precomputed
+                    if (normBoundK) {
+                        int idx = (mrole == MATRIX_A) ? TMP_A_VSIZEK :
+                                                        TMP_B_VSIZEK;
+
+                        memcpy(&newGset, gset, sizeof(BlasGenSettings));
+                        newGset.varNames.sizeK = vars[idx].buf;
+                        fctx->gset = &newGset;
+                        fctx->oevp.gkInVect = true;
+                    }
+                    forceRelativeAddressing(fctx, 1);
+                    genInitVectCoord(fctx, kstr, 1, n, true, isConst);
+                    // put back the state substituted for the generation
+                    fctx->addrMode = addrMode;
+                    fctx->oevp.gkInVect = false;
+                    fctx->gset = gset;
+                    prepared++;
+                }
+            }
+        }
+
+        if (mrole == MATRIX_A) {
+            info->kaVlen = n;
+        }
+        else {
+            info->kbVlen = n;
+        }
+    }
+
+    return (prepared != 0);
+}
+
+/*
+ * updateVars callback of the agent based on temporary coordinates.
+ *
+ * If the agent keeps a single, non constant coordinate for the current
+ * matrix (only one coordinate for each matrix can be non constant),
+ * generate a statement assigning it the offset for the position given
+ * with 'nextLine' and 'nextVec'. Nothing is generated for a position
+ * lying outside the physical tile.
+ *
+ * Returns 1 if a statement has been put to the batch, and 0 otherwise.
+ */
+static int
+updateTmpCoords(
+    struct FetchContext *fctx,
+    unsigned int nextLine,
+    unsigned int nextVec,
+    int stmtPriority)
+{
+    const TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv;
+    Kstring *agvars = fctx->currAgent->vars;
+    const Tile *tile = &fctx->physTile;
+    const Kstring *target;
+    FetchAddrMode savedMode;
+    Kstring offset;
+    bool varyXY, varyK;
+    int idxXY, idxK;
+    int forcedDim;
+
+    if ((nextLine >= tile->nrRows) ||
+        (nextVec >= tileVecColsNum(tile))) {
+
+        return 0;
+    }
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        varyXY = (info->yaVlen == 1) && !info->yaIsConst;
+        varyK = (info->kaVlen == 1) && !info->kaIsConst;
+        idxXY = TMP_COORD_AY;
+        idxK = TMP_COORD_AK;
+    }
+    else {
+        varyXY = (info->xbVlen == 1) && !info->xbIsConst;
+        varyK = (info->kbVlen == 1) && !info->kbIsConst;
+        idxXY = TMP_COORD_BX;
+        idxK = TMP_COORD_BK;
+    }
+
+    // the coordinate along X/Y has precedence over the one along K
+    if (varyXY) {
+        target = &agvars[idxXY];
+        forcedDim = 0;
+    }
+    else if (varyK) {
+        target = &agvars[idxK];
+        forcedDim = 1;
+    }
+    else {
+        return 0;
+    }
+
+    /*
+     * The other offset component is excluded from the expression in the
+     * same way as at the coordinates initialization in prepareTmpCoords()
+     */
+    savedMode = fctx->addrMode;
+    forceRelativeAddressing(fctx, forcedDim);
+    sprintfOffsetStateless(&offset, fctx, nextLine, nextVec);
+    kgenBatchPrintf(fctx->batch, stmtPriority, "%s = %s;\n",
+                    target->buf, offset.buf);
+    fctx->addrMode = savedMode;
+
+    return 1;
+}
+
+/*
+ * sprintfAddrOffset callback of the agent based on temporary coordinates.
+ *
+ * Puts to 'expr' the offset expression for the position ('line', 'vec')
+ * of the physical tile. Precomputed coordinates are used where the agent
+ * has them, the remaining part is produced with sprintfOffsetStateless().
+ *
+ * The generator settings and the addressing mode of the context are
+ * substituted only for the time of the call: fctx->gset and fctx->addrMode
+ * are put back before returning, and the fctx->oevp flags touched here
+ * are reset to false.
+ */
+static void
+sprintfTmpCoordBasedOffset(
+    Kstring *expr,
+    FetchContext *fctx,
+    unsigned int line,
+    unsigned int vec)
+{
+    int kdim;
+    const TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv;
+    MatrixRole mrole = fctx->fopts->mrole;
+    const Kstring *agvars = fctx->currAgent->vars;
+    const Kstring *varK, *varXY;
+    unsigned int xy, k;
+    bool isConstK, isConstXY;
+    bool savedK, savedXY;
+    unsigned int maxK, maxXY;
+    unsigned int idxK, idxXY;
+    const BlasGenSettings *gset = fctx->gset;
+    BlasGenSettings newGset;
+    unsigned int phySizes[2];
+    Kstring tmpXY, tmpK;
+
+    // work on a private copy of the settings so that names can be replaced
+    memcpy(&newGset, gset, sizeof(BlasGenSettings));
+    fctx->gset = &newGset;
+
+    phySizes[0] = tileVecColsNum(&fctx->physTile);
+    phySizes[1] = fctx->physTile.nrRows;
+    kdim = bwidthPhysDimension(fctx);
+    xy = (kdim) ? vec : line;
+    k = (kdim) ? line : vec;
+
+    /*
+     * If the full set of precomputed coordinates for both the dimensions
+     * has been saved, then form the target expression simply as sum of the
+     * respective values in the dimensions. If the set is not full, e. g. only
+     * the coordinate for the top left tile corner is saved, or no coordinates
+     * are saved at all, then substitute kernel variables with respective
+     * precomputed values (if there is some for the dimension), select new line
+     * and vector accordingly, and invoke sprintf of the stateless agent.
+     * At invoking the stateless agent cyclical addressing is disabled for
+     * dimension having full set of precomputed coordinates because they
+     * already take this into account. Eventually, since precomputed coordinates
+     * for the second physical dimension already include multiplication on
+     * leading dimension, disable this step for the stateless agent
+     */
+
+    if (mrole == MATRIX_A) {
+        isConstXY = info->yaIsConst;
+        maxXY = info->yaVlen;
+        varXY = &agvars[TMP_COORD_AY];
+    }
+    else {
+        isConstXY = info->xbIsConst;
+        maxXY = info->xbVlen;
+        varXY = &agvars[TMP_COORD_BX];
+    }
+    // with maxXY == 0 the index is not used: see the checks of maxXY below
+    idxXY = umin(xy, maxXY - 1);
+    savedXY = maxXY && (!isConstXY ||
+                        (xy < maxXY));
+
+    if (mrole == MATRIX_A) {
+        isConstK = info->kaIsConst;
+        maxK = info->kaVlen;
+        varK = &agvars[TMP_COORD_AK];
+    }
+    else {
+        isConstK = info->kbIsConst;
+        maxK = info->kbVlen;
+        varK = &agvars[TMP_COORD_BK];
+    }
+    // with maxK == 0 the index is not used: see the checks of maxK below
+    idxK = umin(k, maxK - 1);
+    savedK = maxK && (!isConstK ||
+                      (k < maxK));
+
+    if (savedXY && savedK) {
+        // both the components are precomputed, just sum them up
+        sprintfVectorComponent(&tmpXY, varXY->buf, idxXY, maxXY);
+        sprintfVectorComponent(&tmpK, varK->buf, idxK, maxK);
+        ksprintf(expr, "%s + %s", tmpXY.buf, tmpK.buf);
+    }
+    else {
+        FetchAddrMode origMode = fctx->addrMode;
+        unsigned int newLine = line;
+        unsigned int newVec = vec;
+        KernelVarNames *kvars = &newGset.varNames;
+        const char **cname;
+
+        if (maxXY) {
+            // use the precomputed component in place of the A/B coordinate
+            cname = (mrole == MATRIX_A) ? &kvars->coordA : &kvars->coordB;
+            sprintfVectorComponent(&tmpXY, varXY->buf, idxXY, maxXY);
+            *cname = tmpXY.buf;
+            if ( savedXY && (!kdim || (maxXY == phySizes[1 - kdim])) ) {
+                if (mrole == MATRIX_A) {
+                    fctx->addrMode &= ~FETCH_ADDR_A_CYCLICAL;
+                }
+                else {
+                    fctx->addrMode &= ~FETCH_ADDR_B_CYCLICAL;
+                }
+            }
+
+            if (kdim) {
+                newVec = (savedXY) ? 0 : vec;
+                fctx->oevp.coordInVect = true;
+            }
+            else {
+                newLine = (savedXY) ? 0 : line;
+            }
+        }
+
+        if (maxK) {
+            // use the precomputed component in place of the K variable
+            sprintfVectorComponent(&tmpK, varK->buf, idxK, maxK);
+            newGset.varNames.k = tmpK.buf;
+            if ( savedK && (kdim || (maxK == phySizes[kdim])) ) {
+                fctx->addrMode &= ~FETCH_ADDR_K_CYCLICAL;
+            }
+
+            if (kdim) {
+                newLine = (savedK) ? 0 : line;
+            }
+            else {
+                newVec = (savedK) ? 0 : vec;
+                fctx->oevp.coordInVect = true;
+            }
+        }
+
+        // Substitute the bound along K if it's needed
+        if ((fctx->addrMode & FETCH_ADDR_K_CYCLICAL) &&
+            (maxK < phySizes[kdim])) {
+
+            varK = (mrole == MATRIX_A) ? &agvars[TMP_A_VSIZEK] :
+                                         &agvars[TMP_B_VSIZEK];
+            if (!isKstringEmpty(varK)) {
+                newGset.varNames.sizeK = varK->buf;
+                fctx->oevp.gkInVect = true;
+            }
+        }
+
+        // Finally disable multiplying on leading dimension
+        if ((maxXY && !kdim) || (maxK && kdim)) {
+            fctx->oevp.ldNotMul = true;
+        }
+
+        // let the stateless agent produce the rest of the expression
+        sprintfOffsetStateless(expr, fctx, newLine, newVec);
+
+        // restore original settings
+        fctx->oevp.coordInVect = false;
+        fctx->oevp.gkInVect = false;
+        fctx->oevp.ldNotMul = false;
+        fctx->addrMode = origMode;
+    }
+
+    fctx->gset = gset;
+}
+
+/*
+ * Set up the descriptor of the agent based on temporary coordinates:
+ * the descriptor is zeroed and then the four callbacks are installed
+ */
+static void
+initTmpCoordAgent(AddressingAgent *agent)
+{
+    memset(agent, 0, sizeof(*agent));
+
+    agent->sprintfAddrOffset = sprintfTmpCoordBasedOffset;
+    agent->updateVars = updateTmpCoords;
+    agent->prepareVars = prepareTmpCoords;
+    agent->match = matchTmpCoordBased;
+}
+
+/************* Addressing agent using persistent coordinates ***************/
+
+/*
+ * PERS_COORD_A and PERS_COORD_B index the agent variables holding
+ * the persistent coordinates for A and B respectively.
+ * MAX_PERS_COORD_VECLEN is the limit passed to persVarDepthK()
+ * when the depth along K is selected.
+ */
+enum {
+    PERS_COORD_A = 0,
+    PERS_COORD_B = 1,
+    MAX_PERS_COORD_VECLEN = 8
+};
+
+/*
+ * Private data of the agent using persistent coordinates. The lengths
+ * are set in preparePersCoords() and are used as the number of vector
+ * components when coordinates are updated or referred to.
+ */
+typedef struct PersCoordInfo {
+    // length of the vectorized coordinate for A
+    unsigned int vlenA;
+    // length of the vectorized coordinate for B
+    unsigned int vlenB;
+} MAY_ALIAS PersCoordInfo;
+
+/*
+ * Linear index of the coordinate matching the position ('line', 'vec')
+ * of the physical tile.
+ *
+ * A position standing right behind the last line or the last vector
+ * gives the total number of vectors in the tile. Otherwise positions are
+ * enumerated line by line if 'kdim' is not zero, and vector column by
+ * vector column if it is zero.
+ */
+static unsigned int
+persCoordIdx(
+    const Tile *physTile,
+    unsigned int line,
+    unsigned int vec,
+    int kdim)
+{
+    if (line == physTile->nrRows) {
+        return tileVectorsNum(physTile);
+    }
+    if (vec == tileVecColsNum(physTile)) {
+        return tileVectorsNum(physTile);
+    }
+
+    if (!kdim) {
+        return vec * physTile->nrRows + line;
+    }
+
+    return line * tileVecColsNum(physTile) + vec;
+}
+
+/*
+ * match callback of the agent using persistent coordinates.
+ *
+ * The agent is selected only if, apart from the generic levels, exactly
+ * FOPTLEV_PERS_COORD_PRECOMPUTING is enabled, the offset evaluation is
+ * estimated as not cheap in at least one direction, local memory is not
+ * used, addressing along K is neither relative nor cyclical, and the
+ * depth along K reported by persVarDepthK() is suitable (see below).
+ */
+static bool
+matchPersCoordBased(const FetchContext *fctx)
+{
+    unsigned int depthK, fullDepthK;
+    int kdim;
+    bool suitable;
+
+    if ((fctx->optLevels & ~GENERIC_OPT_LEVELS) !=
+            FOPTLEV_PERS_COORD_PRECOMPUTING) {
+
+        return false;
+    }
+
+    suitable = (!estimateOffsetEvalCheap(fctx, 0) ||
+                !estimateOffsetEvalCheap(fctx, 1)) &&
+               !isLocalMemoryUsed(fctx->fopts);
+    if (fctx->addrMode & (FETCH_ADDR_K_RELATIVE | FETCH_ADDR_K_CYCLICAL)) {
+        suitable = false;
+    }
+
+    /*
+     * The agent is of no use if even a single step along K cannot be
+     * covered. It is also rejected if K is passed in the inner loop
+     * while the maximum number of coordinates does not cover the entire
+     * tile size in this dimension.
+     */
+    depthK = persVarDepthK(fctx, MAX_PERS_COORD_VECLEN);
+    // any huge limit gives the maximum depth along K
+    fullDepthK = persVarDepthK(fctx, 16384);
+    kdim = bwidthPhysDimension(fctx);
+
+    if (!depthK) {
+        suitable = false;
+    }
+    else if ((depthK != fullDepthK) && (fctx->outerDim != kdim)) {
+        suitable = false;
+    }
+
+    return suitable;
+}
+
+/*
+ * prepareVars callback of the agent using persistent coordinates.
+ *
+ * Does nothing and returns 0 while the agent has been prepared for a loop
+ * more times than it has been used. Otherwise names the coordinate
+ * variable ("vca" for A, "vcb" for B) if it has no name yet, stores the
+ * length of the vectorized coordinate in the agent's private data,
+ * generates the coordinate initialization and returns 1. The variable
+ * is declared only when it has just been named.
+ */
+static int
+preparePersCoords(FetchContext *fctx)
+{
+    PersCoordInfo *info;
+    Kstring *coord;
+    unsigned int lines;
+    unsigned int depthK;
+    bool forA;
+    bool needDecl;
+
+    if (agentLoopPrepCount(fctx) > agentUsageCount(fctx)) {
+        return 0;
+    }
+
+    info = (PersCoordInfo*)fctx->currAgent->priv;
+    forA = (fctx->fopts->mrole == MATRIX_A);
+
+    coord = &fctx->currAgent->vars[forA ? PERS_COORD_A : PERS_COORD_B];
+    needDecl = isKstringEmpty(coord);
+    if (needDecl) {
+        kstrcpy(coord, forA ? "vca" : "vcb");
+    }
+
+    if (bwidthPhysDimension(fctx)) {
+        lines = tileVecColsNum(&fctx->physTile);
+    }
+    else {
+        lines = fctx->physTile.nrRows;
+    }
+    depthK = persVarDepthK(fctx, MAX_PERS_COORD_VECLEN);
+
+    if (forA) {
+        info->vlenA = lines * depthK;
+    }
+    else {
+        info->vlenB = lines * depthK;
+    }
+
+    genInitVectCoord(fctx, coord, lines, depthK, needDecl, false);
+
+    return 1;
+}
+
+/*
+ * updateVars callback of the agent using persistent coordinates.
+ *
+ * The coordinate is advanced only when the linear index of the next
+ * position is a multiple of the coordinate length; in this case an
+ * increment statement is put to the batch and 1 is returned. Otherwise
+ * nothing is generated and 0 is returned.
+ */
+static int
+updatePersCoords(
+    FetchContext *fctx,
+    unsigned int nextLine,
+    unsigned int nextVec,
+    int stmtPriority)
+{
+    const bool forA = (fctx->fopts->mrole == MATRIX_A);
+    const Kstring *coord = forA ? &fctx->currAgent->vars[PERS_COORD_A] :
+                                  &fctx->currAgent->vars[PERS_COORD_B];
+    const PersCoordInfo *info = (PersCoordInfo*)fctx->currAgent->priv;
+    const Tile *tile = &fctx->physTile;
+    const int kdim = bwidthPhysDimension(fctx);
+    const unsigned int vlen = forA ? info->vlenA : info->vlenB;
+    unsigned int step;
+    struct RawLD ld;
+    Kstring stepStr, incr;
+
+    if (persCoordIdx(tile, nextLine, nextVec, kdim) % vlen != 0) {
+        return 0;
+    }
+
+    // the number of positions along K covered by the coordinate
+    if (kdim) {
+        step = vlen / tileVecColsNum(tile);
+    }
+    else {
+        step = vlen / tile->nrRows;
+    }
+    if (fctx->addrMode & FETCH_ADDR_BW_STRIDE) {
+        step *= (unsigned int)fctx->gset->subdims[0].bwidth;
+    }
+
+    if (!kdim) {
+        kgenBatchPrintf(fctx->batch, stmtPriority, "%s += %u;\n",
+                        coord->buf, step);
+        return 1;
+    }
+
+    // the step is combined with the leading dimension described by 'ld'
+    fillRawLD(&ld, fctx);
+    ksprintf(&stepStr, "%u", step);
+    sprintfFastScalarMad(&incr, &stepStr, &ld.str, ld.scale, NULL);
+    kgenBatchPrintf(fctx->batch, stmtPriority, "%s += %s;\n",
+                    coord->buf, incr.buf);
+
+    return 1;
+}
+
+/*
+ * sprintfAddrOffset callback of the agent using persistent coordinates.
+ *
+ * The offset is a component of the vectorized coordinate variable: the
+ * linear index of the position ('line', 'vec') taken modulo the length
+ * of the coordinate stored for the current matrix.
+ */
+static void
+sprintfPersCoordBasedOffset(
+    Kstring *kstr,
+    FetchContext *fctx,
+    unsigned int line,
+    unsigned int vec)
+{
+    const PersCoordInfo *info = (PersCoordInfo*)fctx->currAgent->priv;
+    const unsigned int kdim = bwidthPhysDimension(fctx);
+    const Kstring *coord;
+    unsigned int vlen;
+    unsigned int pos;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        vlen = info->vlenA;
+    }
+    else {
+        vlen = info->vlenB;
+    }
+    pos = persCoordIdx(&fctx->physTile, line, vec, kdim);
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        coord = &fctx->currAgent->vars[PERS_COORD_A];
+    }
+    else {
+        coord = &fctx->currAgent->vars[PERS_COORD_B];
+    }
+
+    sprintfVectorComponent(kstr, coord->buf, pos % vlen, vlen);
+}
+
+/*
+ * Set up the descriptor of the agent using persistent coordinates:
+ * the descriptor is zeroed and then the four callbacks are installed
+ */
+static void
+initPersCoordAgent(AddressingAgent *agent)
+{
+    memset(agent, 0, sizeof(*agent));
+
+    agent->sprintfAddrOffset = sprintfPersCoordBasedOffset;
+    agent->updateVars = updatePersCoords;
+    agent->prepareVars = preparePersCoords;
+    agent->match = matchPersCoordBased;
+}
+
+/***************************************************************************/
+
+/*
+ * Fill fctx->physTile for the matrix the fetch is generated for.
+ *
+ * For matrix A without the BGF_WHOLE_A flag the sizes are taken from
+ * the subproblem dimension with index 1 ('y' and 'bwidth'), otherwise
+ * from the destination tile. In both the cases rows and columns are
+ * swapped if the destination tile is transposed. The vector length is
+ * obtained from getVecLen() and the base name is the kernel variable
+ * name of the matrix.
+ */
+static void
+initPhysTile(FetchContext *fctx)
+{
+    const BlasGenSettings *gset = fctx->gset;
+    const MatrixRole mrole = fctx->fopts->mrole;
+    Tile *physTile = &fctx->physTile;
+    const Tile *dstTile = getDstTile(fctx);
+    const bool trans = dstTile->trans;
+    unsigned int rows, cols;
+
+    memset(physTile, 0, sizeof(*physTile));
+
+    // sizes as they would be for a not transposed tile
+    if ((mrole != MATRIX_A) || (gset->flags & BGF_WHOLE_A)) {
+        rows = dstTile->nrRows;
+        cols = dstTile->nrCols;
+    }
+    else {
+        const SubproblemDim *dim = &gset->subdims[1];
+
+        rows = (unsigned int)dim->y;
+        cols = (unsigned int)dim->bwidth;
+    }
+
+    if (trans) {
+        physTile->nrRows = cols;
+        physTile->nrCols = rows;
+    }
+    else {
+        physTile->nrRows = rows;
+        physTile->nrCols = cols;
+    }
+
+    physTile->vecLen = getVecLen(gset, CLBLAS_GEMM, mrole);
+    if (mrole == MATRIX_A) {
+        physTile->baseName = gset->varNames.A;
+    }
+    else {
+        physTile->baseName = gset->varNames.B;
+    }
+}
+
+/*
+ * Put to 'elem' the expression "<pointer>[<offset>]" referring to the
+ * vector 'vec' of the line 'line' of the physical tile.
+ *
+ * The pointer is the kernel variable name of the matrix, extended with
+ * the pointer field name returned by getVectorTypeName() if the
+ * BGF_UPTRS flag is set. The offset is produced by the current
+ * addressing agent.
+ */
+static void
+sprintfPhysTileElement(
+    Kstring *elem,
+    FetchContext *fctx,
+    unsigned int line,
+    unsigned int vec)
+{
+    const BlasGenSettings *gset = fctx->gset;
+    const char *matName;
+    const char *fieldName;
+    Kstring base;
+    Kstring offset;
+
+    if (fctx->fopts->mrole == MATRIX_A) {
+        matName = gset->varNames.A;
+    }
+    else {
+        matName = gset->varNames.B;
+    }
+
+    if (!(gset->flags & BGF_UPTRS)) {
+        kstrcpy(&base, matName);
+    }
+    else {
+        getVectorTypeName(gset->kextra->dtype, fctx->physTile.vecLen,
+                          NULL, &fieldName);
+        ksprintf(&base, "%s.%s", matName, fieldName);
+    }
+
+    fctx->currAgent->sprintfAddrOffset(&offset, fctx, line, vec);
+    ksprintf(elem, "%s[%s]", base.buf, offset.buf);
+}
+
+/*
+ * Put to the batch an assignment loading one vector of the physical tile
+ * into the destination tile.
+ *
+ * The source is the vector 'vec' of the line 'line' + 'lineOffset'.
+ * The destination element is (line, vec * vecLen), or
+ * (vec * vecLen, line) if the destination tile is transposed.
+ */
+static void
+genHandLoad(
+    FetchContext *fctx,
+    const Tile *dstTile,
+    unsigned int lineOffset,
+    unsigned int line,
+    unsigned int vec,
+    unsigned int vecLen,
+    int stmtPriority)
+{
+    const unsigned int elemOff = vec * vecLen;
+    unsigned int row, col;
+    Kstring src, dst;
+
+    if (dstTile->trans) {
+        row = elemOff;
+        col = line;
+    }
+    else {
+        row = line;
+        col = elemOff;
+    }
+
+    sprintfPhysTileElement(&src, fctx, line + lineOffset, vec);
+    sprintfTileElement(&dst, dstTile, row, col, vecLen);
+    kgenBatchPrintf(fctx->batch, stmtPriority,
+                    "%s = %s;\n", dst.buf, src.buf);
+}
+
+/*
+ * Invoke the update variables method of the current agent if it is
+ * presented.
+ *
+ * 'nextLine' and 'nextVec' give the position the next fetch starts from,
+ * 'priority' is the last used priority level. A blank line is put to
+ * the batch at the tile end, and also between vectors of a single line
+ * fetch if the agent has updated nothing.
+ *
+ * Returns the priority that must be used for subsequent statements:
+ * it is 'priority' + 2 if the agent has generated an update and fetches
+ * can be merged, and 'priority' otherwise.
+ */
+static int
+checkGenUpdateVars(
+    FetchContext *fctx,
+    unsigned int nextLine,
+    unsigned int nextVec,
+    int priority)
+{
+    AddressingAgent *agent = fctx->currAgent;
+    const Tile *physTile = &fctx->physTile;
+    int nextPrio;
+    bool endTile;
+
+    /*
+     * 'nextVec' is counted in vectors (doGenFetch() iterates up to
+     * tileVecColsNum() and the agents compare it with the same value),
+     * so the tile end is detected against the number of vectors in a line
+     * rather than against the number of elements
+     */
+    endTile = (nextLine == physTile->nrRows) ||
+              (nextVec == tileVecColsNum(physTile));
+    if (endTile) {
+        kgenAddStmtToBatch(fctx->batch, priority, "\n");
+    }
+
+    // an update goes to its own level if fetches can be merged
+    nextPrio = canBeFetchesMerged(fctx) ? (priority + 1) : priority;
+
+    if (agent->updateVars &&
+        agent->updateVars(fctx, nextLine, nextVec, nextPrio)) {
+
+        if (canBeFetchesMerged(fctx)) {
+            priority += 2;
+        }
+    }
+    else if (!endTile && (fctx->fopts->linesNum == 1) &&
+             tileVecColsNum(physTile) > 1) {
+
+        kgenAddStmtToBatch(fctx->batch, priority, "\n");
+    }
+
+    return priority;
+}
+
+/*
+ * Generate loads for all the requested lines of the physical tile.
+ *
+ * If fctx->outerDim is not zero, lines are passed in the outer loop and
+ * vectors in the inner one; otherwise the loops are swapped. After each
+ * outer iteration checkGenUpdateVars() is invoked and the priority it
+ * returns is used for the following statements.
+ */
+static void
+doGenFetch(FetchContext *fctx)
+{
+    const FetchOpts *opts = fctx->fopts;
+    const unsigned int firstLine = opts->lineOffset;
+    const unsigned int nrLines = opts->linesNum;
+    const Tile *srcTile = &fctx->physTile;
+    const Tile *dstTile = getDstTile(fctx);
+    // length of vectors the tile will be fetched with
+    const unsigned int vlen = umin(dstTile->vecLen, srcTile->vecLen);
+    int prio = PREPARE_VARS_STMT_PRIORITY + 1;
+    unsigned int line, vec;
+
+    if (!fctx->outerDim) {
+        for (vec = 0; vec < tileVecColsNum(srcTile); vec++) {
+            for (line = 0; line < nrLines; line++) {
+                genHandLoad(fctx, dstTile, firstLine, line, vec, vlen,
+                            prio);
+            }
+            prio = checkGenUpdateVars(fctx, firstLine, vec + 1, prio);
+        }
+
+        return;
+    }
+
+    for (line = 0; line < nrLines; line++) {
+        for (vec = 0; vec < srcTile->nrCols / vlen; vec++) {
+            /*
+             * TODO: add ability to use load with vload() depending
+             *       on some option set
+             */
+            genHandLoad(fctx, dstTile, firstLine, line, vec, vlen,
+                        prio);
+        }
+        prio = checkGenUpdateVars(fctx, firstLine + line + 1, 0, prio);
+    }
+}
+
+
+/*
+ * Allocate and initialize a fetch context.
+ *
+ * The context is zeroed, the addressing mode is set to FETCH_ADDR_NORMAL,
+ * the only enabled optimization level is FOPTLEV_TMP_COORD_PRECOMPUTING,
+ * and every agent having an entry in initAgentsTable is initialized.
+ *
+ * Returns the new context which must be released with
+ * destroyFetchContext(), or NULL if memory cannot be allocated.
+ */
+struct FetchContext
+*createFetchContext(void)
+{
+    FetchContext *fctx;
+    int i = 0;
+
+    fctx = calloc(1, sizeof(FetchContext));
+    if (fctx == NULL) {
+        // nothing below may touch the context if the allocation failed
+        return NULL;
+    }
+
+    fctx->addrMode = FETCH_ADDR_NORMAL;
+    fctx->optLevels = FOPTLEV_TMP_COORD_PRECOMPUTING;
+
+    // init addressing agents
+    while (initAgentsTable[i] != NULL) {
+        initAgentsTable[i](&fctx->agents[i]);
+        i++;
+    }
+
+    fctx->oevp.leadVecLen = 1;
+    fctx->outerDim = 1;
+
+    return fctx;
+}
+
+/*
+ * Release a context obtained from createFetchContext().
+ * Only the context structure itself is freed here.
+ */
+void
+destroyFetchContext(struct FetchContext *fctx)
+{
+    free(fctx);
+}
+
+/* Return the optimization levels currently set in the context */
+FetchOptLevel
+getFetchOptLevels(struct FetchContext *fctx)
+{
+    const FetchOptLevel enabled = fctx->optLevels;
+
+    return enabled;
+}
+
+/* Add 'levels' to the optimization levels set in the context */
+void
+enableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels)
+{
+    FetchOptLevel current = fctx->optLevels;
+
+    current |= levels;
+    fctx->optLevels = current;
+}
+
+/* Remove 'levels' from the optimization levels set in the context */
+void
+disableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels)
+{
+    FetchOptLevel current = fctx->optLevels;
+
+    current &= ~levels;
+    fctx->optLevels = current;
+}
+
+/* Return the addressing mode currently set in the context */
+FetchAddrMode
+getFetchAddrMode(const struct FetchContext *fctx)
+{
+    const FetchAddrMode current = fctx->addrMode;
+
+    return current;
+}
+
+/* Replace the addressing mode of the context with 'mode' */
+void
+setFetchAddrMode(struct FetchContext *fctx, FetchAddrMode mode)
+{
+    fctx->addrMode = mode;
+}
+
+/*
+ * Derive the addressing mode from the kernel extra flags and store it
+ * in the context.
+ *
+ * Addressing for A (B) becomes cyclical if the kernel has lower tails
+ * along M (N) and the respective tail is not marked as raised in
+ * 'tailStatus'; otherwise it becomes relative. Addressing along K is
+ * relative unless the kernel has lower tails along K. Cyclical addressing
+ * along K and the tail padding flag are set if 'processTailK' is true and
+ * cleared otherwise. Finally all the flags listed in 'mask' are cleared.
+ *
+ * Returns the resulting mode.
+ */
+FetchAddrMode
+setDefaultFetchAddrMode(
+    struct FetchContext *fctx,
+    const BlasGenSettings *gset,
+    FetchAddrMode mask,
+    int tailStatus,
+    bool processTailK)
+{
+    FetchAddrMode mode = fctx->addrMode;
+    const KernelExtraFlags kflags = gset->kextra->flags;
+    bool cyclicA, cyclicB;
+
+    cyclicA = (kflags & KEXTRA_TAILS_M_LOWER) &&
+              !(tailStatus & TAIL_A_RAISED);
+    cyclicB = (kflags & KEXTRA_TAILS_N_LOWER) &&
+              !(tailStatus & TAIL_B_RAISED);
+
+    if (!cyclicA) {
+        mode &= ~FETCH_ADDR_A_CYCLICAL;
+        mode |= FETCH_ADDR_A_RELATIVE;
+    }
+    else {
+        mode &= ~FETCH_ADDR_A_RELATIVE;
+        mode |= FETCH_ADDR_A_CYCLICAL;
+    }
+
+    if (!cyclicB) {
+        mode &= ~FETCH_ADDR_B_CYCLICAL;
+        mode |= FETCH_ADDR_B_RELATIVE;
+    }
+    else {
+        mode &= ~FETCH_ADDR_B_RELATIVE;
+        mode |= FETCH_ADDR_B_CYCLICAL;
+    }
+
+    if (!(kflags & KEXTRA_TAILS_K_LOWER)) {
+        mode |= FETCH_ADDR_K_RELATIVE;
+    }
+    else {
+        mode &= ~FETCH_ADDR_K_RELATIVE;
+    }
+
+    if (!processTailK) {
+        mode &= ~(FETCH_ADDR_K_CYCLICAL | FETCH_ADDR_TAILK_PADD);
+    }
+    else {
+        mode |= FETCH_ADDR_K_CYCLICAL | FETCH_ADDR_TAILK_PADD;
+    }
+
+    mode &= ~mask;
+    fctx->addrMode = mode;
+
+    return mode;
+}
+
+/*
+ * Generate the variables which the addressing agents want to have
+ * prepared before a fetch loop starts.
+ *
+ * The agent is selected and its prepareVars method is invoked first for
+ * matrix B and then for matrix A, with the context marked as being at the
+ * loop preparation stage. If at least one agent has prepared something,
+ * the collected statements followed by a blank line are flushed to
+ * 'genCtx'.
+ *
+ * Returns 0 on success, -ENOMEM if the statement batch cannot be
+ * created, -EOVERFLOW if the generated code cannot be put to 'genCtx'.
+ */
+int
+prepareFetchLoop(
+    struct KgenContext *genCtx,
+    struct FetchContext *fetchCtx,
+    const BlasGenSettings *gset,
+    CLMemType memA,
+    CLMemType memB)
+{
+    AddressingAgent *agent, *saved;
+    FetchOpts fopts;
+    int i;
+    int ret = 0;
+    int cnt = 0;
+
+    memset(&fopts, 0, sizeof(FetchOpts));
+    fopts.memA = memA;
+    fopts.memB = memB;
+
+    /*
+     * NOTE(review): 'fopts' is a local object, so fetchCtx->fopts is left
+     * pointing to a dead object after return. The fetch generating
+     * functions in this file set it anew before use - confirm that
+     * nothing else reads it in between.
+     */
+    fetchCtx->fopts = &fopts;
+    fetchCtx->gset = gset;
+
+    fetchCtx->batch = createStmtBatch();
+    if (fetchCtx->batch == NULL) {
+        return -ENOMEM;
+    }
+
+    saved = fetchCtx->prevAgent;
+
+    fetchCtx->isLoopPreparation = true;
+    for (i = 0; i < 2; i++) {
+        fopts.mrole = (i) ? MATRIX_A : MATRIX_B;
+        initPhysTile(fetchCtx);
+        selectAddrAgent(fetchCtx);
+        agent = fetchCtx->currAgent;
+        if (agent->prepareVars) {
+            if (agent->prepareVars(fetchCtx)) {
+                cnt++;
+                incAgentLoopPrepCount(fetchCtx);
+                /*
+                 * Substitute the previous agent so that it could
+                 * know that some variables can be really shared
+                 * if it is selected again
+                 */
+                fetchCtx->prevAgent = agent;
+            }
+        }
+    }
+    fetchCtx->isLoopPreparation = false;
+
+    fetchCtx->prevAgent = saved;
+
+    if (cnt) {
+        // a failed flush is reported in the same way as genFetchInputTile()
+        ret = flushStmtBatch(genCtx, fetchCtx->batch);
+        if (!ret) {
+            ret = kgenAddBlankLine(genCtx);
+        }
+        if (ret) {
+            ret = -EOVERFLOW;
+        }
+    }
+
+    destroyStmtBatch(fetchCtx->batch);
+    fetchCtx->batch = NULL;
+
+    return ret;
+}
+
+/*
+ * Mark the context as valid for the given matrix (slot 0 is for A,
+ * slot 1 is for any other role). Nothing is done if no agent has been
+ * selected yet.
+ */
+void
+revalidateFetchContext(struct FetchContext *fctx, MatrixRole mrole)
+{
+    if (fctx->currAgent == NULL) {
+        return;
+    }
+
+    if (mrole == MATRIX_A) {
+        fctx->valid[0] = true;
+    }
+    else {
+        fctx->valid[1] = true;
+    }
+}
+
+/*
+ * Common part of the fetch generation for a tile.
+ *
+ * The addressing mode is taken from the multiplication options if they
+ * are supplied. If the context is not valid, the agent is selected anew
+ * and its variables are prepared. Then the fetch statements are
+ * generated, the agent usage is counted and the context is invalidated.
+ */
+static void
+genFetchCommon(struct FetchContext *fctx)
+{
+    AddressingAgent *agent;
+
+    if (fctx->fopts->mulOpts) {
+        fctx->addrMode = fetchAddrModeFromMulOpts(fctx->fopts->mulOpts);
+    }
+
+    // prepare needed variables
+    if (!isFetchContextValid(fctx)) {
+        fctx->prevAgent = fctx->currAgent;
+        selectAddrAgent(fctx);
+
+        agent = fctx->currAgent;
+        if (agent->prepareVars && agent->prepareVars(fctx)) {
+            // separate the prepared variables with a blank line
+            kgenAddStmtToBatch(fctx->batch, PREPARE_VARS_STMT_PRIORITY, "\n");
+        }
+    }
+
+    // fire fetch generation
+    revalidateFetchContext(fctx, fctx->fopts->mrole);
+    doGenFetch(fctx);
+    incAgentUsageCount(fctx);
+    invalidateFetchContext(fctx);
+}
+
+/*
+ * Generate fetching of an input tile and put the code to 'ctx'.
+ *
+ * A temporary statement batch is created for the time of the call.
+ *
+ * Returns 0 on success, -ENOMEM if the batch cannot be created,
+ * -EOVERFLOW if the batch cannot be flushed to 'ctx'.
+ */
+int
+genFetchInputTile(
+    struct KgenContext *ctx,
+    struct FetchContext *fctx,
+    const BlasGenSettings *gset,
+    const FetchOpts *fetchOpts)
+{
+    struct StatementBatch *batch;
+    int err;
+
+    batch = createStmtBatch();
+    fctx->batch = batch;
+    if (batch == NULL) {
+        return -ENOMEM;
+    }
+
+    fctx->fopts = fetchOpts;
+    fctx->gset = gset;
+    initPhysTile(fctx);
+
+    genFetchCommon(fctx);
+    err = flushStmtBatch(ctx, batch);
+
+    destroyStmtBatch(batch);
+    fctx->batch = NULL;
+
+    if (err) {
+        return -EOVERFLOW;
+    }
+
+    return 0;
+}
+
+/*
+ * Generate fetching of an input tile into a statement batch supplied
+ * by the caller. The batch is attached to the context only for the time
+ * of the call and is neither flushed nor destroyed here.
+ */
+void
+genFetchInputTileBatch(
+    struct StatementBatch *batch,
+    struct FetchContext *fctx,
+    const struct BlasGenSettings *gset,
+    const FetchOpts *fetchOpts)
+{
+    fctx->gset = gset;
+    fctx->fopts = fetchOpts;
+    initPhysTile(fctx);
+
+    fctx->batch = batch;
+    genFetchCommon(fctx);
+    fctx->batch = NULL;
+}
diff --git a/src/library/blas/gens/fetch.h b/src/library/blas/gens/fetch.h
new file mode 100644
index 0000000..fe33fe0
--- /dev/null
+++ b/src/library/blas/gens/fetch.h
@@ -0,0 +1,379 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef FETCH_H_
+#define FETCH_H_
+
+/**
+ * @internal
+ * @defgroup FETCH_GEN Generating fetches from memory
+ * @ingroup BLAS_GENERATORS
+ */
+
+/*@{*/
+
+/**
+ * @internal
+ * @brief Context for the fetch generator
+ */
+struct FetchContext;
+
+struct BlasGenSettings;
+//enum TailStatus;
+
+// FIXME: Deprecated. Throw later
+struct TileMulOpts;
+
+/**
+ * @internal
+ * @brief Optimization levels for the fetch generator with which the caller
+ *        can control some aspects of the code generation.
+ *
+ * !!NOTE: When expanding this list, the levels must be placed in ascending
+ *         order of their importance.
+ *
+ * @ingroup BLAS_MAJOR_SUBGENS
+ */
+typedef enum FetchOptLevel {
+    /** Expand the fetch loop in a way that provides a prefetch effect */
+    FOPTLEV_PREFETCH = 0x01,
+    /**
+     * Can share temporary coordinates for A and B. Usable in case when
+     * A and B fetches are fired sequentially and hence in some cases can
+     * share the same temporary coordinates. Must be set only if fetch
+     * has been already fired for one of the tiles. Otherwise result is
+     * undefined.
+     */
+    FOPTLEV_CAN_SHARE_TMP_AB = 0x02,
+    /**
+     * Reorder generated statements so that fetches are grouped
+     * all together
+     */
+    FOPTLEV_MERGE_FETCHES = 0x04,
+    /** Enable using of temporary precomputed coordinates */
+    FOPTLEV_TMP_COORD_PRECOMPUTING = 0x08,
+    /** Enable using of persistent precomputed coordinates */
+    FOPTLEV_PERS_COORD_PRECOMPUTING = 0x10
+} FetchOptLevel;
+
+/**
+ * @internal
+ * @brief Addressing modes for the fetch generator
+ */
+typedef enum FetchAddrMode {
+    /**
+     * Normal mode. Fetching is performed only with full vectors.
+     * Physical coordinates in memory are absolute for the matrices and
+     * evaluated only based on the logical coordinates along rows of the
+     * matrix \b A, columns of the matrix \b B and coordinate along K
+     */
+    FETCH_ADDR_NORMAL = 0,
+    /**
+     * Pointer for the matrix A is set at start of the tile panel.
+     * All resulting coordinates will be relative against this base.
+     * KernelVarNames::CoordA in the generator settings structure is not used
+     */
+    FETCH_ADDR_A_RELATIVE = 0x01,
+    /**
+     * Pointer for the matrix B is set at start of the tile panel.
+     * All resulting coordinates will be relative against this base.
+     * KernelVarNames::CoordB in the generator settings structure is not used
+     */
+    FETCH_ADDR_B_RELATIVE = 0x02,
+    /**
+     * Pointers for A and B match the current coordinate along dimension K and
+     * thus set at the beginning of the tile. All resulting coordinates will be
+     * relative against the current value of the pointers.
+     * KernelVarNames::CoordA, KernelVarNames::coordB and KernelVarNames
+     * accessible via the generator settings structure are not used
+     */
+    FETCH_ADDR_K_RELATIVE = 0x04,
+    /**
+     * Cyclical addressing along rows of \b A. That means subtracting
+     * number of rows from the coordinate in case of exceeding it.
+     */
+    FETCH_ADDR_A_CYCLICAL = 0x08,
+    /** Cyclical addressing along columns of B */
+    FETCH_ADDR_B_CYCLICAL = 0x10,
+    /** Cyclical addressing along K dimension */
+    FETCH_ADDR_K_CYCLICAL = 0x20,
+    /**
+     * Perform padding of the trailing part along dimension K.
+     * That allows performing a vectorized fetch of the tail including a piece
+     * being outside the size along K. It has an effect only if K expands
+     * along the leading dimension
+     */
+    FETCH_ADDR_TAILK_PADD = 0x40,
+    /**
+     * Expand loop with stride equal to width of the top level block
+     */
+    FETCH_ADDR_BW_STRIDE = 0x80
+} FetchAddrMode;
+
+
+/*
+ * FIXME: Deprecated and should be thrown away later.
+ * Two alternative string members; this union is the type of the deprecated
+ * FetchOpts::tmpYvar and FetchOpts::tmpXvar fields.
+ */
+union FetchTmpVarName {
+    const char *idx;
+    const char *uptr;
+};
+
+/**
+ * @internal
+ * @brief Specific settings for the fetching generator
+ * @ingroup BLAS_MAJOR_SUBGENS
+ */
+typedef struct FetchOpts {
+    MatrixRole mrole;           /**< matrix the fetch is generated for */
+    CLMemType memA;             /**< type of memory matrix A is located on */
+    CLMemType memB;             /**< type of memory matrix B is located on */
+    unsigned int lineOffset;
+    unsigned int linesNum;
+    const char *regName;        // TODO: the field is deprecated. Remove it
+
+    /*
+     * FIXME: one more kludge for backward compatibility; get addressing
+     *        mode from the options of tilemul
+     */
+    const struct TileMulOpts *mulOpts;
+
+    // TODO: All the following fields are deprecated. Remove them
+    union FetchTmpVarName tmpYvar;
+    union FetchTmpVarName tmpXvar;
+    const char *alvM;     /**< vecLen-aligned M in vectors */
+    const char *alvN;     /**< vecLen-aligned N in vectors */
+    const char *alvKA;    /**< vecLen-aligned K in vectors of A */
+    const char *avlKB;    /**< vecLen-aligned K in vectors of B (name is spelled "avl", unlike alvKA) */
+    const char *ax;       /**< matrix A x coordinate, in vectors */
+    const char *ay;       /**< matrix A y coordinate */
+    const char *bx;       /**< matrix B x coordinate, in vectors */
+    const char *by;       /**< matrix B y coordinate */
+    const char *ldav;     /**< matrix A leading dimension, in vectors */
+    const char *ldbv;     /**< matrix B leading dimension, in vectors */
+    const char *skewArow; /**< matrix A rows skew */
+    const char *skewAcol; /**< matrix A columns skew, in vectors */
+    const char *skewBrow; /**< matrix B rows skew */
+    const char *skewBcol; /**< matrix B columns skew, in vectors */
+} FetchOpts;
+
+
+/**
+ * @internal
+ * @brief Create context for the fetch generator
+ *
+ * After creation there are enabled optimization levels relating
+ * to precomputing with storing to temporary coordinates.
+ * Addressing mode is set to ::FETCH_ADDR_NORMAL
+ *
+ * @return pointer to a new context object on success, NULL otherwise
+ */
+struct FetchContext
+*createFetchContext(void);
+
+/**
+ * @internal
+ * @brief Destroy fetch generator context
+ *
+ * @param[out] fctx            Fetch generator context to destroy
+ */
+void
+destroyFetchContext(struct FetchContext *fctx);
+
+/**
+ * @internal
+ * @brief Get current fetch optimization levels
+ *
+ * @param[in] fctx              Fetch context
+ *
+ * @return Optimization levels currently set in the context, typed as
+ *         ::FetchOptLevel (presumably a bitwise OR of its values; confirm
+ *         against the definition)
+ */
+FetchOptLevel
+getFetchOptLevels(struct FetchContext *fctx);
+
+/**
+ * @internal
+ * @brief Enable needed code optimization levels for the fetch generator
+ *
+ * @param[out] fctx             Fetch context
+ * @param[in] levels            Optimization levels to enable
+ */
+void
+enableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels);
+
+/**
+ * @internal
+ * @brief Disable unneeded code optimization levels for the fetch generator
+ *
+ * @param[out] fctx             Fetch context
+ * @param[in] levels            Optimization levels to disable
+ */
+void
+disableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels);
+
+/**
+ * @internal
+ * @brief Get current addressing mode used by the fetch generator
+ *
+ * @param[in] fctx              Fetch context
+ *
+ * @return Addressing mode currently set in the context, typed as
+ *         ::FetchAddrMode
+ */
+FetchAddrMode
+getFetchAddrMode(const struct FetchContext *fctx);
+
+/**
+ * @internal
+ * @brief Set addressing mode for the fetch generator
+ *
+ * @param[out] fctx             Fetch context
+ * @param[in]  mode             Addressing mode to set
+ */
+void
+setFetchAddrMode(struct FetchContext *fctx, FetchAddrMode mode);
+
+/**
+ * @internal
+ * @brief Set default fetch addressing mode based on the problem specifics
+ *
+ * @param[out] fctx             Fetch context
+ * @param[in]  gset             Generator settings
+ * @param[in]  mask             Addressing mode mask
+ * @param[in]  tailStatus       Tails handling status
+ * @param[in]  processTailK     Flag showing if the tail part along the
+ *                              dimension K is picked up or not.
+ *
+ * Primarily, the function checks if there are tails along rows of A,
+ * columns of B, dimension K and if some tails are raised or not.
+ * Based on this info and also taking into account the fetch vector length,
+ * it sets an appropriate addressing mode so as not to exceed matrix bounds
+ * during the fetch operations. If there are no "small" tails for rows of A
+ * and columns of B, it selects relative addressing for them. If there are no
+ * "small" tails along K, it selects relative addressing for this dimension
+ * as well.
+ *
+ * The addressing mode mask passed via the \b mask parameter is used to
+ * avoid setting addressing modes not suitable for callers. The resulting
+ * addressing mode is the bitwise AND of a default value selected
+ * by the function and the bitwise negated value of the mask
+ *
+ * \b tailStatus is a bit mask of values from the #TailStatus enumeration.
+ *
+ * @return Addressing mode the function set during the last call.
+ */
+FetchAddrMode
+setDefaultFetchAddrMode(
+    struct FetchContext *fctx,
+    const struct BlasGenSettings *gset,
+    FetchAddrMode mask,
+    int tailStatus,
+    bool processTailK);
+
+/**
+ * @internal
+ * @brief Prepare the fetch generator to generate efficient fetches
+ *        within the K loop
+ *
+ * @param[out] genCtx           Generator context
+ * @param[out] fetchCtx         Fetch context
+ * @param[in] gset              Generator settings
+ * @param[in] memA              Type of memory the matrix A is stored in
+ * @param[in] memB              Type of memory the matrix B is stored in
+ *
+ * Basically, the function lets the fetch generator declare everything it
+ * needs for its work. If a user relies on efficient fetching within the
+ * tilemul loop, they should call the function before generating that loop.
+ * If it is not invoked, the fetch generator produces code in some default
+ * way which may be far from efficient. The stuff prepared with the function is
+ * valid only for one fetch call. If the user needs to use the same once again,
+ * they may use revalidateFetchContext().
+ */
+int
+prepareFetchLoop(
+    struct KgenContext *genCtx,
+    struct FetchContext *fetchCtx,
+    const struct BlasGenSettings *gset,
+    CLMemType memA,
+    CLMemType memB);
+
+/**
+ * @internal
+ * @brief Revalidate fetch context
+ *
+ * @param[out] fctx             Fetch context
+ * @param[in]  mrole            Matrix to revalidate the context for
+ *
+ * Enable the fetch generator to use the stuff produced by the last call
+ * of prepareFetchLoop() once again.
+ */
+void
+revalidateFetchContext(struct FetchContext *fctx, MatrixRole mrole);
+
+/**
+ * @internal
+ * @brief Tile fetching generator
+ *
+ * @param[out] genCtx         Generator context
+ * @param[in]  fetchCtx       FetchContext
+ * @param[in]  gset           Generator settings
+ * @param[in]  fetchOpts      Fetch-specific generator options
+ *
+ * This function generates code which fetches tile a or b from global or local
+ * memory into private memory.\n
+ * Generated code fetches tiles by vectors using coordinate values in vectors
+ * from @ref FetchOpts.
+ * Complex types and conjugated tiles are supported. Global cycling is supported
+ * for global memory fetching - this means that if the tile overlaps the matrix
+ * the tail of tile will be fetched from the beginning instead of accessing
+ * memory outside the matrix.\n
+ * Second level of subdimensions is used for tile sizes.\n
+ * Tile can be fetched from global memory or from local memory.
+ * If tile is fetched from local memory then leading dimensions for local
+ * memory area are taken from first level subdimensions.\n
+ * Post-fetch callback generator function can be called after fetching tile
+ * for zeroing tails or setting diagonal elements to one. This function is
+ * provided by caller in @ref TileMulOpts.postFetch.\n
+ * After the function completes its work it invalidates the fetch context, and
+ * all the stuff that has been prepared before, will not be used in the next
+ * fetch transaction.
+ *
+ * @return 0 on success
+ * @return -ENOMEM if the intermediate statement batch cannot be created
+ * @return -EOVERFLOW on source buffer overflowing
+ */
+int
+genFetchInputTile(
+    struct KgenContext *genCtx,
+    struct FetchContext *fetchCtx,
+    const struct BlasGenSettings *gset,
+    const FetchOpts *fetchOpts);
+
+/**
+ * @internal
+ * @brief Fetch input tile
+ *
+ * @param[out] batch                    Statement batch
+ * @param[out] fctx                     Fetch context
+ * @param[in]  gset                     Generator settings
+ * @param[in]  fetchOpts                Fetch Options
+ *
+ * The function has the same effect and semantics as genFetchInputTile(),
+ * but puts the code to the intermediate statement batch rather than a target
+ * generator context.
+ */
+void
+genFetchInputTileBatch(
+    struct StatementBatch *batch,
+    struct FetchContext *fctx,
+    const struct BlasGenSettings *gset,
+    const FetchOpts *fetchOpts);
+
+/*@}*/
+
+#endif /* FETCH_H_ */
diff --git a/src/library/blas/gens/gbmv.cpp b/src/library/blas/gens/gbmv.cpp
new file mode 100644
index 0000000..115ffbc
--- /dev/null
+++ b/src/library/blas/gens/gbmv.cpp
@@ -0,0 +1,482 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * gbmv generator
+ */
+//#define DEBUG_GBMV
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <gbmv.clT>
+#include <solution_seq.h>
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+static int
+getDefaultDecomposition(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs);
+
+/*
+ * Solver flags of the gbmv memory pattern: only SF_WSPACE_1D is reported.
+ */
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_GBMV
+    printf("solverFlags callen......\n");
+#endif
+
+    const SolverFlags flags = SF_WSPACE_1D;
+
+    return flags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initGbmvRegisterPattern(MemoryPattern *mempat);
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static SolverOps gbmvOps = {
+    generator,                  // produces the kernel source text
+    assignKargs,                // fills the kernel argument array
+    isFitToLDS,                 // checks a decomposition against the LDS size
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,              // computes the global work size
+    NULL,                       // slot not used by gbmv
+    solverFlags,                // reports SF_WSPACE_1D
+	NULL,                       // slot not used by gbmv
+    getDefaultDecomposition,    // default work-group size and subproblem dims
+	NULL,                       // slot not used by gbmv
+	setBuildOpts,               // appends -D build options
+	NULL                        // slot not used by gbmv
+};
+
+/*
+ * Append to buildOptStr the preprocessor definitions selecting the kernel
+ * variant for the call described by args.
+ *
+ * buildOptStr - NUL-terminated option string, extended in place
+ * args        - pointer to the SolutionStep of the call
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *step = (const SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+
+    const bool isDouble = (kargs->dtype == TYPE_DOUBLE) ||
+                          (kargs->dtype == TYPE_COMPLEX_DOUBLE);
+    const bool isGbmv = (kargs->pigFuncID == CLBLAS_GBMV);
+    const bool isTbmv = (kargs->pigFuncID == CLBLAS_TBMV);
+    const bool isSbmv = (kargs->pigFuncID == CLBLAS_SBMV);
+    const bool isHbmv = (kargs->pigFuncID == CLBLAS_HBMV);
+    const bool isColMajor = (kargs->order == clblasColumnMajor);
+
+    if (isDouble) {
+        strcat(buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_GBMV
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if (isTbmv) {
+        strcat(buildOptStr, " -DTBMV_ONLY ");
+        if (kargs->diag == clblasUnit) {
+            strcat(buildOptStr, " -DUNIT_DIAG ");
+        }
+    }
+
+    if ((isGbmv || isTbmv) && (kargs->transA == clblasConjTrans)) {
+        strcat(buildOptStr, " -DDO_CONJ ");
+    }
+
+    // the rest concerns the symmetric/hermitian banded routines only
+    if (!isSbmv && !isHbmv) {
+        return;
+    }
+
+    // for column-major input the given triangle is taken as the opposite one
+    bool isUpper = (kargs->uplo == clblasUpper);
+    if (isColMajor) {
+        isUpper = !isUpper;
+    }
+    strcat(buildOptStr,
+           isUpper ? " -DGIVEN_SHBMV_UPPER " : " -DGIVEN_SHBMV_LOWER ");
+
+    if (isHbmv) {
+        strcat(buildOptStr, " -DHBMV_ONLY ");
+        // Since routine calls Row-major, the whole matrix has to be
+        // conjugated while loading
+        if (isColMajor) {
+            strcat(buildOptStr, " -DDO_CONJ ");
+        }
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Fill in the memory pattern descriptor of the register accumulation based
+ * gbmv and set up the type prefix table used by the generator.
+ */
+extern "C"
+void initGbmvRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_GBMV
+    printf("initGBMVREgPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+
+    // pattern specific extra information
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; // For "x" vector
+
+    // the pattern descriptor itself
+    mempat->name = "Register accumulation based gbmv";
+    mempat->sops = &gbmvOps;
+    mempat->extra = &mpatExtra;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+
+    // BLAS type letter for every supported data type
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+}
+
+/*
+ * Compute the global work size of the gbmv kernel.
+ *
+ * threads - receives the global size; threads[1] is always 1
+ * subdims - subdims->x is used as the H value of the kernel
+ * pgran   - work-group size; both components are multiplied together
+ * args    - CLBlasKargs of the call
+ * _extra  - CLBLASKernExtra holding the order and transposition flags
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const CLBLASKernExtra *extra = (const CLBLASKernExtra *)_extra;
+
+    int wgItems = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    size_t rows = kargs->M;
+    size_t cols = kargs->N;
+
+    clblasTranspose trans;
+    if (extra->flags & KEXTRA_TRANS_A) {
+        trans = clblasTrans;
+    }
+    else if (extra->flags & KEXTRA_CONJUGATE_A) {
+        trans = clblasConjTrans;
+    }
+    else {
+        trans = clblasNoTrans;
+    }
+
+    // a column-major problem is processed as the transposed row-major one
+    if (extra->flags & KEXTRA_COLUMN_MAJOR) {
+        rows = kargs->N;
+        cols = kargs->M;
+        trans = (trans == clblasNoTrans) ? clblasTrans : clblasNoTrans;
+    }
+
+    // Only NT kernel is used
+    if ((kargs->pigFuncID == CLBLAS_SBMV) ||
+        (kargs->pigFuncID == CLBLAS_HBMV)) {
+        trans = clblasNoTrans;
+    }
+
+    size_t H = subdims->x;
+    size_t TARGET_ROWS = wgItems / H;
+    size_t blocks = (trans == clblasNoTrans)
+                        ? ((rows - 1) / TARGET_ROWS) + 1
+                        : ((cols - 1) / H) + 1;
+
+    threads[0] = blocks * wgItems;
+    threads[1] = 1;
+
+#ifdef DEBUG_GBMV
+    printf("calcNrThreads called from gbmv.cpp\n");
+    printf("BLOCKSIZE : %d, subdims->x : %d\n", wgItems, H);
+    printf("blocks : %d\n", blocks);
+    printf("pgran-wgSize[0] : %d, globalthreads[0]  : %d\n", pgran->wgSize[0], threads[0]);
+#endif
+
+}
+
+/*
+ * Kernel source generator of the banded matrix-vector routines.
+ *
+ * buf     - destination for the generated source; when NULL only the
+ *           required buffer size is returned
+ * buflen  - size of buf. NOTE(review): it is not checked against the size
+ *           of the expanded template; callers are expected to pass at
+ *           least the size returned for buf == NULL
+ * subdims - subdims->x is used as the H value of the kernel
+ * pgran   - work-group size; must be embedded in a SolutionStep because the
+ *           function ID is reached through container_of()
+ * extra   - CLBLASKernExtra with flags, data type and vector length
+ *
+ * Returns the size of the source buffer (64 KB), or 0 when the work-group
+ * size is not a multiple of H.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    enum { TEMPLATE_BUF_SIZE = 64 * 1024 };
+
+    size_t BLOCKSIZE  = pgran->wgSize[0];
+    size_t H = subdims->x;
+    char tempTemplate[TEMPLATE_BUF_SIZE];
+    // large enough for any value printed with "%d", sign and NUL included
+    char def_target_rows[16], def_h[16];
+
+    SolutionStep *step = container_of( pgran , pgran, SolutionStep);    // NOTE: using container_of() to get pigFuncID
+    CLBlasKargs* kargs = (CLBlasKargs*) &(step->args);
+
+    (void)buflen;
+
+    if ( buf == NULL) // return buffer size
+    {
+        return (ssize_t)(TEMPLATE_BUF_SIZE * sizeof(char));
+    }
+
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+
+    clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
+
+    // a column-major problem is processed as the transposed row-major one
+    if ( extraFlags->flags & KEXTRA_COLUMN_MAJOR )
+    {
+        if ( trans == clblasNoTrans)
+        {
+            trans = clblasTrans;
+        }
+        else // clblasTrans or clblasConjTrans
+        {
+            trans = clblasNoTrans;
+        }
+    }
+    if( (kargs->pigFuncID == CLBLAS_SBMV) || (kargs->pigFuncID == CLBLAS_HBMV) )    // Only NT kernel is used
+    {
+        trans = clblasNoTrans;
+    }
+
+    // H == 0 would make the modulo and the division below undefined
+    if ((H == 0) || ((BLOCKSIZE % H) != 0))
+    {
+        printf("WARNING: GBMV: generator: Invalid Block Size\n");
+        return 0;
+    }
+    size_t TARGET_ROWS =  BLOCKSIZE / H;
+
+    if ( trans == clblasNoTrans)
+    {
+        strcpy(tempTemplate, (char*)gbmv_RNT_kernel);
+    }
+    else // Transpose cases...
+    {
+        strcpy(tempTemplate, (char*)gbmv_RT_kernel);
+    }
+
+    unsigned int vecLenA = extraFlags->vecLenA;
+
+    bool doVLOAD = false;       // Always scalar load for banded matrices
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
+
+    sprintf( def_target_rows, "%d", (int)TARGET_ROWS );
+    sprintf( def_h, "%d", (int)H );
+
+    #ifdef DEBUG_GBMV
+        printf("GBMV GENERATOR called....\n");
+        if((( extraFlags->flags &  KEXTRA_TRANS_A) || ( extraFlags ->flags & KEXTRA_CONJUGATE_A )))
+        {
+            printf("A is trans or CONJ-TRANS\n");
+        }
+        else
+        {
+            printf("A is noTrans...\n");
+        }
+        printf("TARGET ROWS = %s\n", def_target_rows);
+        printf("H = %s\n", def_h);
+        printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+    #endif
+
+    // substitute the template placeholders and emit the final source
+    kobj.put("%DEF_H", (const char *)def_h);
+    kobj.put("%DEF_TARGET_ROWS", (const char *)def_target_rows);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return (TEMPLATE_BUF_SIZE * sizeof(char));
+}
+
+/*
+__kernel void %PREFIXgbmv_RNT_kernel( __global const %TYPE * _A, __global %TYPE * _y_vector, __global %TYPE const* restrict _x_vector,
+                                        uint M, uint N, uint KL, uint KU, uint lda, int incx, int incy, uint offa, uint offx, uint offy
+ifndef TBMV_ONLY
+                                    ,%TYPE alpha, %TYPE beta
+endif
+*/
+/*
+ * Fill the kernel argument array in the order of the kernel signature:
+ * A, y, x, M, N, KL, KU, lda, incx, incy, offa, offx, offy and, except for
+ * TBMV, alpha and beta.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+
+    // M, N, KL, KU gets swapped for column-major input
+    const bool swapDims = (blasArgs->order == clblasColumnMajor);
+    size_t fM  = swapDims ? blasArgs->N  : blasArgs->M;
+    size_t fN  = swapDims ? blasArgs->M  : blasArgs->N;
+    size_t fKL = swapDims ? blasArgs->KU : blasArgs->KL;
+    size_t fKU = swapDims ? blasArgs->KL : blasArgs->KU;
+
+    // buffers
+    INIT_KARG(&args[0], blasArgs->A);       //A - input matrix - argument
+    INIT_KARG(&args[1], blasArgs->C);       //y - y vector
+    INIT_KARG(&args[2], blasArgs->B);       //x - actual x vector argument
+
+    // problem sizes
+    initSizeKarg(&args[3], fM);
+    initSizeKarg(&args[4], fN);
+    initSizeKarg(&args[5], fKL);
+    initSizeKarg(&args[6], fKU);
+
+    // leading dimension and vector increments
+    initSizeKarg(&args[7], blasArgs->lda.matrix);
+    cl_int incx = blasArgs->ldb.vector;
+    INIT_KARG(&args[8], incx);
+    cl_int incy = blasArgs->ldc.vector;
+    INIT_KARG(&args[9], incy);
+
+    // offsets
+    initSizeKarg(&args[10], blasArgs->offa);
+    initSizeKarg(&args[11], blasArgs->offBX);
+    initSizeKarg(&args[12], blasArgs->offCY);
+
+    // For GBMV, SBMV, HBMV both alpha and beta has to be passed.
+    switch (blasArgs->pigFuncID) {
+    case CLBLAS_GBMV:
+    case CLBLAS_SBMV:
+    case CLBLAS_HBMV:
+        assignScalarKarg(&args[13], &(blasArgs->alpha), blasArgs->dtype);
+        assignScalarKarg(&args[14], &(blasArgs->beta), blasArgs->dtype);
+        break;
+    default:
+        break;
+    }
+
+#ifdef DEBUG_GBMV
+    printf("KL %d\tKU %d\n", fKL, fKU);
+#endif
+}
+
+/*
+ * Check whether the top level subproblem fits into local memory.
+ *
+ * dim        - subproblem dimensions; only dim[0] is inspected
+ * dtype      - element data type
+ * ldsSize    - available local memory, in bytes
+ * kernelArgs - not used
+ *
+ * Returns true when (x + 1) * y elements of the given type need no more
+ * than ldsSize bytes.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    (void)kernelArgs; // To remove warnings
+
+    // The element size comes from dtypeSize(); sizeof(dtype) would only
+    // give the size of the DataType enumeration itself.
+    cl_ulong elemCount = (cl_ulong)( (dim[0].x+1) * dim[0].y );
+    cl_ulong maxSize = elemCount * dtypeSize(dtype);
+
+    return ( maxSize <= ldsSize );
+}
+
+/*
+ * Select the default work-group size and subproblem dimensions for gbmv.
+ *
+ * pgran      - receives a 1D work-group of wgX * wgY items; must be embedded
+ *              in a SolutionStep since the device is reached through
+ *              container_of()
+ * subdims    - receives up to two levels of subproblem dimensions
+ * subdimsNum - number of entries available in subdims
+ * pArgs      - not used
+ *
+ * Returns 0.
+ */
+static int
+getDefaultDecomposition(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs)
+{
+    SolutionStep *step = container_of( pgran , pgran, SolutionStep);
+    size_t maxWorkGroupSize = 0;
+    cl_device_id devID = step->device.id;
+    size_t wgX, wgY;
+    (void)pArgs;
+
+    // If the query fails the value must not be left undefined: fall back
+    // to the smallest configuration.
+    if (clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                        sizeof(size_t), &maxWorkGroupSize, NULL) != CL_SUCCESS)
+    {
+        maxWorkGroupSize = 0;
+    }
+
+    wgX = 32;
+    if (maxWorkGroupSize >= 256)
+    {
+        wgY = 8;
+    } else if (maxWorkGroupSize >= 128)
+    {
+        wgY = 4;
+    } else {
+        //
+        // PENDING: What if maxWorkGroupSize < 64 ????
+        //
+        wgY = 2;
+    }
+
+    pgran->wgDim = 1; //1D blocking
+    pgran->wgSize[0] = (unsigned int)(wgX * wgY);
+    pgran->wgSize[1] = 1;
+
+    if(subdimsNum > 0)
+    {
+        subdims[0].y = wgY ;
+        subdims[0].x = wgX ;
+        subdims[0].itemX = subdims[0].x;
+        subdims[0].itemY = subdims[0].y;
+        subdims[0].bwidth = 1;
+    }
+    if(subdimsNum > 1)
+    {
+        subdims[1].itemY = 1;
+        subdims[1].itemX = 1;
+        subdims[1].y = subdims[1].itemY;
+        subdims[1].x = subdims[1].itemX;
+        subdims[1].bwidth = 1;
+    }
+
+    return 0;
+}
diff --git a/src/library/blas/gens/gemm.c b/src/library/blas/gens/gemm.c
new file mode 100644
index 0000000..25cf499
--- /dev/null
+++ b/src/library/blas/gens/gemm.c
@@ -0,0 +1,1447 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Cached global buffers based gemm generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+
+#include "blas_kgen.h"
+#include "blas_subgroup.h"
+#include "gen_helper.h"
+
+/*
+ * Solver private data. subgGen() reads it by reinterpreting the solverPriv
+ * field of CLBLASKernExtra as this structure.
+ */
+typedef struct {
+    /*
+     * Feeds the per-work-group starting offset along K that subgGen() puts
+     * into the generated kernel: get_group_id(0) is multiplied by this value
+     * rounded down to a multiple of the A tile vector length.
+     */
+    size_t staggered;
+} MAY_ALIAS extraData_t;
+
+/* NOTE(review): presumably attached to the memory patterns registered by this file - confirm */
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Forward declarations of the callbacks referenced by the blockSOps and
+ * subgSOps tables below. Functions prefixed with "block" belong to the
+ * block kernel, those prefixed with "subg" to the subgroup kernel; the rest
+ * are shared by both tables.
+ */
+
+/* Kernel source generator of the block kernel */
+static ssize_t
+blockGen(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+/* Kernel source generator of the subgroup kernel */
+static ssize_t
+subgGen(
+    char *pBuf,
+    size_t buflen,
+    const struct SubproblemDim *pSubDims,
+    const struct PGranularity *pPGran,
+    void *pExtra );
+
+/* Kernel argument setup for the block kernel */
+static void
+assignBlockKargs(
+    KernelArg *args,
+    const void *params,
+    const void *extra);
+
+/* Decomposition check / granularity calculation for the block kernel */
+static bool
+blockCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check);
+
+static int
+blockGetPerf(
+    unsigned int kflags,
+    const void *args);
+
+/* Kernel argument setup for the subgroup kernel */
+static void
+assignSubgKargs(
+    KernelArg *args,
+    const void *params,
+    const void *extra);
+
+static SolverFlags
+solverFlags(void);
+
+static DecompositionAxis
+innerDecompositionAxis(const void *args);
+
+static int
+gemmSubgGetDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void * pArgs);
+
+static bool
+subgCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check);
+
+static void
+subgCalcGlobalThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra
+);
+
+static int
+subgGetPerf(
+    unsigned int kflags,
+    const void *args);
+
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+/*
+ * Callback table of the block GEMM kernel. The initializer is positional,
+ * so the order of the entries must follow the field order of SolverOps.
+ * Entries set to NULL are callbacks this kernel does not provide.
+ */
+static SolverOps blockSOps = {
+    blockGen,
+    assignBlockKargs,
+    NULL,
+    blockGetPerf,
+    innerDecompositionAxis,
+    NULL,
+    NULL,
+    solverFlags,
+    NULL,// fixup kargs
+    NULL, //blockGetDefaultDecomp,
+    blockCheckCalcDecomp,
+    NULL,
+    NULL
+};
+
+/*
+ * Callback table of the subgroup GEMM kernel. The initializer is positional,
+ * so the order of the entries must follow the field order of SolverOps.
+ * Unlike blockSOps, this table also supplies callbacks for calculating the
+ * global thread count, fixing up the arguments and choosing a default
+ * decomposition.
+ */
+static SolverOps subgSOps = {
+    subgGen,
+    assignSubgKargs,
+    NULL,
+    subgGetPerf,
+    innerDecompositionAxis,
+    subgCalcGlobalThreads,
+    NULL,
+    solverFlags,
+    fixupArgs,// fixup kargs
+    gemmSubgGetDefaultDecomp,
+    subgCheckCalcDecomp,
+    NULL,
+    NULL
+};
+
+//*****************************************************************************
+//-----------------------------------------------------------------------------
+
+/*
+ * Emit the statement(s) that advance the A or B pointer of the generated
+ * kernel to the tile processed by the current work item.
+ *
+ * ctx   - generator context the statements are added to
+ * gset  - generator settings; subdims[1], flags and kextra are read
+ * mrole - MATRIX_A to set up pointer 'A'; any other value sets up 'B'
+ *
+ * For a matrix accessed column-major the pointer is advanced by the global
+ * id, scaled by the number of vectors per tile when that number is > 1.
+ * For a matrix accessed row-major it is advanced by
+ * tileWidth * global id * leading dimension.
+ *
+ * The column-major MATRIX_B path assigns the kernel variables kif,
+ * get_group_id_<n> and get_global_id_<n>; blockGen() declares them before
+ * calling this function.
+ */
+static void
+genSetupItemPtr(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    MatrixRole mrole)
+{
+    char tmp[1024];
+    unsigned int vecLen;
+    char ldv[64];
+    int shift;
+    char ptrLit;
+    char shiftMul[128];
+    size_t tileWidth;
+    int widx;
+    KernelExtraFlags kflags = gset->kextra->flags;
+
+    /*
+     * Matrix B is made the inner one if every thread would access its
+     * elements with a large stride while the elements of matrix A are
+     * accessed sequentially; this provides more coalesced memory accesses.
+     * Otherwise, matrix A is made the inner one.
+     */
+    widx = (!isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A) &&
+            isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) ? 1 : 0;
+
+    vecLen = getVecLen(gset, CLBLAS_GEMM, mrole);
+    shift = findHighestSetBit(vecLen);
+    if (mrole == MATRIX_A) {
+        tileWidth = gset->subdims[1].y;
+        ptrLit = 'A';
+        // Leading dimension in vectors, unless it is already passed that way
+        if ((shift > 0) && !(gset->flags & BGF_LD_IN_VECTORS)) {
+            sprintf(ldv, "(lda >> %d)", shift);
+        }
+        else {
+            strcpy(ldv, "lda");
+        }
+    }
+    else {
+        tileWidth = gset->subdims[1].x;
+        ptrLit = 'B';
+        if ((shift > 0) && !(gset->flags & BGF_LD_IN_VECTORS)) {
+            sprintf(ldv, "(ldb >> %d)", shift);
+        }
+        else {
+            strcpy(ldv, "ldb");
+        }
+        // B uses the work-item dimension not taken by A
+        widx = 1 - widx;
+    }
+
+    if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, mrole)) {
+        if (tileWidth / vecLen > 1) {
+            // size_t has no portable match for %lu, hence the cast
+            sprintf(shiftMul, " * %lu", (unsigned long)(tileWidth / vecLen));
+        }
+        else {
+            shiftMul[0] = '\0';
+        }
+        // Alternative calculation of the global thread id to eliminate
+        // channel conflicts.
+        if (mrole == MATRIX_B) {
+            int bankSize = 2048;
+            int dataSize = 0;
+            int grShift;
+
+            DataType dtype = gset->kextra->dtype;
+            switch (dtype) {
+            case TYPE_FLOAT:           dataSize = 4; break;
+            case TYPE_COMPLEX_DOUBLE:  dataSize = 16; break;
+            default:                   dataSize = 8; break;
+            }
+
+            // Number of elements covered by bankSize bytes
+            grShift = bankSize/ dataSize;
+
+            // Group id rotated by the group id of the other dimension
+            sprintf(tmp,
+                "get_group_id_%d = (get_group_id(0) + get_group_id(1))"
+                    "%% get_num_groups(%d);\n", widx, widx);
+            kgenAddStmt(ctx, tmp);
+
+            sprintf(tmp,
+                "get_global_id_%d = get_group_id_%d * get_local_size(%d) "
+                    "+ get_local_id(%d);\n",widx, widx, widx, widx);
+            kgenAddStmt(ctx, tmp);
+
+
+            // The rotated id is kept only if N is a multiple of grShift;
+            // otherwise the regular global id is used.
+            sprintf(tmp,
+                "kif = (N %% %d != 0);\n"
+                "get_global_id_%d = (kif*(uint)get_global_id(%d)) + "
+                    "((1-kif)*get_global_id_%d);\n",grShift, widx, widx, widx);
+            kgenAddStmt(ctx, tmp);
+
+            // NOTE(review): unlike the other statements, this one has no
+            // trailing "\n" - confirm whether that is intended.
+            sprintf(tmp,
+                "%c += get_global_id_%d%s;",
+                    ptrLit, widx, shiftMul);
+        }
+        else {
+            sprintf(tmp, "%c += (uint)get_global_id(%d)%s;\n",
+                    ptrLit, widx, shiftMul);
+        }
+
+    }
+    else {
+        sprintf(tmp, "%c += %luu * (uint)get_global_id(%d) * %s;\n",
+                ptrLit, (unsigned long)tileWidth, widx, ldv);
+    }
+    kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Emit "<ptr> += <offset>;" for each of the A, B and C pointers whose
+ * "offset is not zero" flag is set in kflags.
+ *
+ * ctx            - generator context the statements are added to
+ * gset           - generator settings, used to query the vector lengths
+ * kflags         - kernel extra flags selecting the pointers to shift
+ * vectorizedPtrs - if true and the vector length of the matrix is > 1,
+ *                  the offset is divided by that vector length
+ */
+static void
+genShiftPointers(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    KernelExtraFlags kflags,
+    bool vectorizedPtrs)
+{
+    /* one entry per kernel pointer: enabling flag, names and matrix role */
+    const struct {
+        unsigned int flag;
+        char ptrName;
+        const char *offName;
+        MatrixRole mrole;
+    } shifts[3] = {
+        {KEXTRA_A_OFF_NOT_ZERO,  'A', "offA", MATRIX_A},
+        {KEXTRA_BX_OFF_NOT_ZERO, 'B', "offB", MATRIX_B},
+        {KEXTRA_CY_OFF_NOT_ZERO, 'C', "offC", MATRIX_C}
+    };
+    char stmt[1024];
+    int n;
+
+    for (n = 0; n < 3; n++) {
+        unsigned int vecLen;
+
+        if (!(kflags & shifts[n].flag)) {
+            continue;
+        }
+
+        vecLen = getVecLen(gset, CLBLAS_GEMM, shifts[n].mrole);
+
+        if (vectorizedPtrs && (vecLen > 1)) {
+            /* the pointer is typed in vectors, so is the offset */
+            sprintf(stmt, "%c += %s / %u;\n",
+                    shifts[n].ptrName, shifts[n].offName, vecLen);
+        }
+        else {
+            sprintf(stmt, "%c += %s;\n",
+                    shifts[n].ptrName, shifts[n].offName);
+        }
+        kgenAddStmt(ctx, stmt);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Build the tail of the kernel parameter list declaring the offset
+ * arguments. For every "offset is not zero" flag set in kflags the matching
+ * declaration (offA, offB, offC - in this order) is appended to str.
+ * str is an empty string if none of the flags is set. The caller must
+ * provide a buffer large enough for all three declarations.
+ */
+static void
+sprintfOffABC(
+    char *str,
+    KernelExtraFlags kflags)
+{
+    const struct {
+        unsigned int flag;
+        const char *decl;
+    } offArgs[3] = {
+        {KEXTRA_A_OFF_NOT_ZERO,  ",\n    const uint offA"},
+        {KEXTRA_BX_OFF_NOT_ZERO, ",\n    const uint offB"},
+        {KEXTRA_CY_OFF_NOT_ZERO, ",\n    const uint offC"}
+    };
+    char *cursor = str;
+    int n;
+
+    *cursor = '\0';
+    for (n = 0; n < 3; n++) {
+        if (kflags & offArgs[n].flag) {
+            cursor += sprintf(cursor, "%s", offArgs[n].decl);
+        }
+    }
+}
+
+/*
+ * Declare the generated GEMM kernel: the reqd_work_group_size attribute,
+ * the kernel name "<type prefix>gemm<nameSuffix>" and its parameter list.
+ *
+ * ctx        - generator context the declaration is added to
+ * gset       - generator settings; kextra (data type, flags) and pgran
+ *              (work group size) are read
+ * nameSuffix - string appended to the kernel name
+ *
+ * A, B and C are declared as pointers to the vector types matching the
+ * vector length of the respective matrix; alpha and beta use the builtin
+ * type of the data type. Offset arguments are added by sprintfOffABC().
+ */
+static void
+declareKernel(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const char *nameSuffix)
+{
+    const PGranularity *pgran = gset->pgran;
+    const DataType dtype = gset->kextra->dtype;
+    const char *typeA;
+    const char *typeB;
+    const char *typeC;
+    const char *scalarType;
+    char prefix;
+    char offArgs[1024];
+    char decl[4096];
+
+    prefix = dtypeToBlasPrefix(dtype);
+    scalarType = dtypeBuiltinType(dtype);
+
+    getVectorTypeName(dtype, getVecLen(gset, CLBLAS_GEMM, MATRIX_A),
+                      &typeA, NULL);
+    getVectorTypeName(dtype, getVecLen(gset, CLBLAS_GEMM, MATRIX_B),
+                      &typeB, NULL);
+
+    // FIXME - take into account flag BGF_LD_IN_VECTORS
+    getVectorTypeName(dtype, getVecLen(gset, 0, MATRIX_C), &typeC, NULL);
+
+    sprintfOffABC(offArgs, gset->kextra->flags);
+
+    sprintf(decl, "__attribute__((reqd_work_group_size(%u, %u, 1)))\n"
+                  "void __kernel\n"
+                  "%cgemm%s(\n"
+                  "    uint M,\n"
+                  "    uint N,\n"
+                  "    uint K,\n"
+                  "    const %s alpha,\n"
+                  "    const %s beta,\n"
+                  "    const __global %s *restrict A,\n"
+                  "    const __global %s *restrict B,\n"
+                  "    __global %s *C,\n"
+                  "    uint lda,\n"
+                  "    uint ldb,\n"
+                  "    uint ldc%s)\n",
+            pgran->wgSize[0], pgran->wgSize[1], prefix, nameSuffix,
+            scalarType, scalarType, typeA, typeB, typeC, offArgs);
+
+    kgenDeclareFunction(ctx, decl);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Emit an early return for work items whose tile coordinates lie outside
+ * the matrix.
+ *
+ * ctx    - generator context the statements are added to
+ * kflags - kernel extra flags; KEXTRA_TAILS_M enables the check of coord.y
+ *          against M, KEXTRA_TAILS_N the check of coord.x against N
+ *
+ * Nothing is generated if neither flag is set.
+ */
+static void
+genHitMatrixCheck(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags)
+{
+    /*
+     * tails of upper level blocks; compared against 0 explicitly so that
+     * the result does not depend on how wide the bool type is
+     */
+    const bool tailsM = ((kflags & KEXTRA_TAILS_M) != 0);
+    const bool tailsN = ((kflags & KEXTRA_TAILS_N) != 0);
+    const char *cond;
+
+    if (!(tailsM || tailsN)) {
+        return;
+    }
+
+    if (tailsM && tailsN) {
+        cond = "if ((coord.y >= M) || (coord.x >= N)) {\n";
+    }
+    else if (tailsM) {
+        cond = "if (coord.y >= M) {\n";
+    }
+    else {
+        cond = "if (coord.x >= N) {\n";
+    }
+
+    kgenAddStmt(ctx, cond);
+    kgenAddStmt(ctx, "    return;\n}\n\n");
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generator of the "Block" GEMM kernel.
+ *
+ * buf     - buffer receiving the kernel source
+ * buflen  - size of buf
+ * subdims - subproblem dimensions; copied into the generator settings
+ * pgran   - work group granularity, used for the kernel declaration
+ * extra   - pointer to the CLBLASKernExtra of the kernel
+ *
+ * The kernel accumulates a tile of C in a loop over K, optionally followed
+ * by a loop over the K tail, and then updates the result.
+ *
+ * Returns the size of the generated source plus one on success, -ENOMEM if
+ * a generator context cannot be created and -EOVERFLOW if a generation step
+ * reports a negative status.
+ */
+static ssize_t
+blockGen(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    bool isRelA, isRelB;
+    bool tailsK = ((kflags & KEXTRA_TAILS_K_LOWER) != 0);
+    DataType dtype = kextra->dtype;
+    char tmp[2048];
+    bool doubleBased = isDoubleBasedType(dtype);
+    BlasGenSettings gset;
+    KernelVarNames *vnames = &gset.varNames;
+    TileMulOpts mulOpts;
+    ssize_t ret;
+    char globalIdB[64];
+    const char *alignedK;
+    FetchAddrMode addrMode, addrMask = 0;
+    FetchOpts fopts;
+    TilePostFetchPrivate pfPriv;
+    TailStatus tailStatus;
+    UpdateResultFlags upFlags;
+    unsigned int i;
+    unsigned int vecLen;
+    int isColMajA;
+    int isColMajB;
+
+    memset(&gset, 0, sizeof(gset));
+    memset(&mulOpts, 0, sizeof(mulOpts));
+    memset(&pfPriv, 0, sizeof(pfPriv));
+    memset(&fopts, 0, sizeof(fopts));
+
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.flags = BGF_DISTINCT_VECLEN | BGF_LD_IN_VECTORS;
+
+    // Upper limit of y + x of the level 1 tile for enabling BGF_WHOLE_A
+    // FIXME: throw the explicit constant away
+    switch (dtype) {
+    case TYPE_FLOAT:
+//        i = 12;
+        i = 16;
+        break;
+    case TYPE_COMPLEX_DOUBLE:
+        i = 6;
+        break;
+    default:
+        i = 8;
+        break;
+    }
+
+    if (subdims[1].y + subdims[1].x <= i) {
+        gset.flags |= BGF_WHOLE_A;
+    }
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+    //avoid [0].bw loop
+    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
+
+    mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) &&
+                    (dtype != TYPE_COMPLEX_FLOAT)) ? TILEMUL_MAD
+                                                   : TILEMUL_MULADD;
+    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
+    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
+    mulOpts.fctx = createFetchContext();
+    if (mulOpts.fctx == NULL) {
+        return -ENOMEM;
+    }
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        destroyFetchContext(mulOpts.fctx);
+        return -ENOMEM;
+    }
+
+    isColMajA = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A);
+    isColMajB = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B);
+
+    alignedK = (tailsK) ? "Kbase" : "K";
+
+    // setup kernel variables
+    vnames->A = "A";
+    vnames->B = "B";
+    vnames->C = "C";
+    vnames->coordA = "coord.y";
+    vnames->coordB = "coord.x";
+    vnames->k = "coord.z";
+    vnames->sizeK = alignedK;
+    vnames->sizeM = "M";
+    vnames->sizeN = "N";
+    vnames->lda = "lda";
+    vnames->ldb = "ldb";
+    vnames->ldc = "ldc";
+    vnames->alpha = "alpha";
+    vnames->beta = "beta";
+
+    // at first, generate needed declarations
+    ret = kgenDeclareUptrs(ctx, doubleBased);
+
+    declareKernel(ctx, &gset, "Block");
+    ret = kgenBeginFuncBody(ctx);
+
+    if (tailsK) {
+        // Kbase is K rounded down to a multiple of the level 1 block width
+        sprintf(tmp, "const uint Ktail = K %% %lu;\n"
+                     "const uint Kbase = K - Ktail;\n",
+                (unsigned long)subdims[1].bwidth);
+        kgenAddStmt(ctx, tmp);
+        alignedK = "Kbase";
+    }
+    else {
+        alignedK = "K";
+    }
+
+    initDefaultTiles(&gset, CLBLAS_GEMM, 0, PRIV_STORAGE_VARIABLE_SET);
+    declareTileStorages(ctx, &gset);
+    kgenAddStmt(ctx, "uint4 coord = 0u; /* contains coordB, coordA, k */\n");
+    kgenAddBlankLine(ctx);
+
+    // the leading dimensions are used in vectors inside the kernel
+    vecLen = getVecLen(&gset, CLBLAS_GEMM, MATRIX_A);
+    if (vecLen > 1) {
+        kgenPrintf(ctx, "lda /= %u;\n", vecLen);
+    }
+    vecLen = getVecLen(&gset, CLBLAS_GEMM, MATRIX_B);
+    if (vecLen > 1) {
+        kgenPrintf(ctx, "ldb /= %u;\n", vecLen);
+    }
+
+    /*
+     * Matrix B is made the inner one if every thread would access its
+     * elements with a large stride while the elements of matrix A are
+     * accessed sequentially; this provides more coalesced memory accesses.
+     * Otherwise, matrix A is made the inner one.
+     * From here on 'i' is the work-item dimension used for A, and
+     * '1 - i' the one used for B.
+     */
+    i = (!isColMajA && isColMajB) ? 1 : 0;
+
+    // no context is passed: only the tail status is of interest here
+    tailStatus = checkGenAdjustTailCoords(NULL, CLBLAS_GEMM, &gset, NULL);
+
+    if (tailStatus & TAIL_A_RAISED) {
+        addrMask |= FETCH_ADDR_A_RELATIVE;
+    }
+    if (tailStatus & TAIL_B_RAISED) {
+        addrMask |= FETCH_ADDR_B_RELATIVE;
+    }
+
+    enableFetchOptLevels(mulOpts.fctx, FOPTLEV_MERGE_FETCHES);
+    addrMode = setDefaultFetchAddrMode(mulOpts.fctx, &gset, addrMask,
+                                       tailStatus, false);
+    isRelA = ((addrMode & FETCH_ADDR_A_RELATIVE) != 0);
+    isRelB = ((addrMode & FETCH_ADDR_B_RELATIVE) != 0);
+
+    // Alternative calculation of the global thread id to eliminate channel
+    // conflicts; the variables declared here are assigned by
+    // genSetupItemPtr()
+    if (isRelB &&
+        isMatrixAccessColMaj(CLBLAS_GEMM, gset.kextra->flags, MATRIX_B)) {
+
+        sprintf(globalIdB, "get_global_id_%d", 1-i);
+        sprintf(tmp,
+                "uint kif;\n"
+                "uint get_group_id_%d;\n"
+                "uint get_global_id_%d;\n",1-i, 1-i);
+        kgenAddStmt(ctx, tmp);
+    }
+    else {
+        sprintf(globalIdB, "get_global_id(%d)", 1-i);
+    }
+
+    if (!(isColMajA || isColMajB)) {
+        size_t tsize;
+
+        tsize = dtypeSize(dtype);
+        // size_t has no portable match for %lu, hence the cast
+        sprintf(tmp, "coord.z = (get_local_id(0) %% 2 * %lu) %% %s;\n",
+                (unsigned long)(sizeof(cl_float8) / tsize), alignedK);
+        kgenAddStmt(ctx, tmp);
+
+        /*
+         * Adjust the fetch addressing mode. Staggered access is used, which
+         * means there is a starting offset along K and hence addressing
+         * in this dimension must be cyclical.
+         */
+        addrMode &= ~FETCH_ADDR_K_RELATIVE;
+        addrMode |= FETCH_ADDR_K_CYCLICAL;
+        setFetchAddrMode(mulOpts.fctx, addrMode & ~addrMask);
+    }
+
+    if (isRelA) {
+        genSetupItemPtr(ctx, &gset, MATRIX_A);
+    }
+    if (isRelB) {
+        genSetupItemPtr(ctx, &gset, MATRIX_B);
+    }
+
+    /*
+     * Setup coordinates and check that they don't exceed the matrix
+     */
+
+    sprintf(tmp, "\n"
+                 "coord.y = %luu * (uint)get_global_id(%d);\n"
+                 "coord.x = %luu * (uint)%s;\n",
+            (unsigned long)subdims[1].y, i,
+            (unsigned long)subdims[1].x, globalIdB);
+    kgenAddStmt(ctx, tmp);
+
+    genHitMatrixCheck(ctx, kflags);
+    genShiftPointers(ctx, &gset, kflags, true);
+    genZeroTile(ctx, &gset.tileCY);
+
+    tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_GEMM, &gset, NULL);
+
+    // NOTE(review): this overrides the core selected above, which kept
+    // TYPE_COMPLEX_FLOAT away from TILEMUL_MAD - confirm which is intended
+    mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0)
+            ? TILEMUL_MAD
+            : TILEMUL_MULADD;
+
+    mulOpts.flags |= TILEMUL_EXTERN_RDECL;
+    mulOpts.flags |= kextraToTilemulFlags(CLBLAS_GEMM, kflags);
+
+    sprintf(tmp, "for (uint k1 = 0; k1 < %s; k1 += %lu)",
+            alignedK, (unsigned long)subdims[1].bwidth);
+
+    prepareFetchLoop(ctx, mulOpts.fctx, &gset, CLMEM_GLOBAL_MEMORY,
+                     CLMEM_GLOBAL_MEMORY);
+
+    kgenBeginBranch(ctx, tmp);
+    ret = tileMulGen(ctx, &gset, &mulOpts);
+    if (ret != 0) {
+        goto out;
+    }
+    kgenEndBranch(ctx, NULL); // 0..K loop
+    kgenAddBlankLine(ctx);
+
+    //Optionally handle tails along K
+    if (tailsK) {
+        setDefaultFetchAddrMode(mulOpts.fctx, &gset, addrMask,
+                                tailStatus, true);
+
+        // the tail loop runs up to the real K and post-processes fetches
+        vnames->sizeK = "K";
+        pfPriv.fetchNumA = 0;
+        pfPriv.wholeA = 0;
+        pfPriv.funcID = CLBLAS_GEMM;
+        pfPriv.gset = &gset;
+        mulOpts.postFetch = defaultTilePostFetch;
+        mulOpts.postFetchPriv = &pfPriv;
+
+        if (!(isColMajA || isColMajB)) {
+            kgenAddStmt(ctx, "coord.z = Kbase;\n");
+        }
+
+        sprintf(tmp, "for (uint k1 = 0u; k1 < Ktail; k1 += %luu)",
+                (unsigned long)subdims[1].bwidth);
+        kgenBeginBranch(ctx, tmp);
+        ret = tileMulGen(ctx, &gset, &mulOpts);
+        if (ret != 0) {
+            goto out;
+        }
+        kgenEndBranch(ctx, NULL); // 0..Ktail loop
+        kgenAddBlankLine(ctx);
+    }
+
+    gset.kextra = kextra;
+    checkGenRestoreTailCoords(ctx, &gset, tailStatus);
+
+    upFlags = kextraToUpresFlags(CLBLAS_GEMM, kflags);
+    upFlags |= tailStatusToUpresFlags(tailStatus);
+    upFlags |= UPRES_INDEXING_WITH_CONSTANTS;
+    genResultUpdateWithFlags(ctx, CLBLAS_GEMM, &gset, upFlags,
+                             NULL, NULL, NULL);
+
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        // the extra byte accounts for the terminating null character
+        // NOTE(review): presumed meaning of the "+ 1" - confirm
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+out:
+    destroyFetchContext(mulOpts.fctx);
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generator of the "Subgroup" GEMM kernel, i.e. the generator for the
+ * subgroup access pattern (used when the A and B matrices are accessed
+ * row-major).
+ *
+ * pBuf     - buffer receiving the kernel source
+ * buflen   - size of pBuf
+ * pSubDims - subproblem dimensions; copied into the generator settings
+ * pPGran   - work group granularity
+ * pExtra   - pointer to the CLBLASKernExtra of the kernel; its solverPriv
+ *            field is read as extraData_t
+ *
+ * In the generated kernel the items of a subgroup (local id 0) stride over
+ * K, each accumulating a partial tile; the partial tiles are then merged
+ * and the result is updated through mergeUpdateResult().
+ *
+ * Returns the size of the generated source plus one on success, -ENOMEM if
+ * a generator context cannot be created and -EOVERFLOW if a generation step
+ * reports a negative status.
+ */
+static ssize_t
+subgGen(
+    char *pBuf,
+    size_t buflen,
+    const struct SubproblemDim *pSubDims,
+    const struct PGranularity *pPGran,
+    void *pExtra )
+{
+    struct KgenContext *pCtx;
+    CLBLASKernExtra *pKExtra = (CLBLASKernExtra*)pExtra;
+    KernelExtraFlags kflags = pKExtra->flags;
+    DataType dtype = pKExtra->dtype;
+    size_t staggered = ((extraData_t*)&pKExtra->solverPriv)->staggered;
+    char tmp[2048];
+    BlasGenSettings gset;
+    TileMulOpts mulOpts;
+    ssize_t ret;
+    FetchOpts fopts;
+    TilePostFetchPrivate pfPriv;
+    UpdateResultFlags upResFlags = 0;
+    TailStatus tailStatus;
+    FetchAddrMode addrMode;
+    Kstring exprK;
+    SubgVarNames subVNames;
+
+    KernelVarNames *vnames = NULL;
+    const char *alignedK;
+
+    unsigned int vecLenA;
+
+    bool isDoubleBased = isDoubleBasedType(dtype);
+
+    bool tailsLowerK = ( (kflags & KEXTRA_TAILS_K_LOWER) != 0 );
+    bool tailsM = ( (kflags & KEXTRA_TAILS_M) != 0 );
+    bool tailsN = ( (kflags & KEXTRA_TAILS_N) != 0 );
+    bool tailsLowerM = ( (kflags & KEXTRA_TAILS_M_LOWER) != 0 );
+    bool tailsLowerN = ( (kflags & KEXTRA_TAILS_N_LOWER) != 0 );
+
+    unsigned int subgroupsA = 0;
+    unsigned int subgroupsB = 0;
+
+    memset(&gset, 0, sizeof(gset));
+    memset(&mulOpts, 0, sizeof(mulOpts));
+    memset(&pfPriv, 0, sizeof(pfPriv));
+    memset(&fopts, 0, sizeof(fopts));
+
+    memcpy( gset.subdims, pSubDims, sizeof(gset.subdims) );
+    gset.pgran  = pPGran;
+    gset.flags  = BGF_DISTINCT_VECLEN | BGF_WHOLE_A | BGF_LD_IN_VECTORS;
+    gset.kextra = pKExtra;
+
+    vnames = &gset.varNames;
+    // setting the basic names for kernel variables
+    vnames->A = "A";
+    vnames->B = "B";
+    vnames->C = "C";
+    vnames->LDS = "scratch";
+    vnames->sizeM = "M";
+    vnames->sizeN = "N";
+    vnames->lda = "lda";
+    vnames->ldb = "ldb";
+    vnames->ldc = "ldc";
+
+    vnames->alpha = "alpha";
+    vnames->beta = "beta";
+
+    vnames->vectCoordA = "vca";
+    vnames->vectCoordB = "vcb";
+    vnames->k = exprK.buf;
+
+    // number of level 1 tiles per level 0 block along M and along N
+    subgroupsA = (unsigned int)(gset.subdims[0].y/gset.subdims[1].y);
+    subgroupsB = (unsigned int)(gset.subdims[0].x/gset.subdims[1].x);
+
+    initDefaultTiles(&gset, CLBLAS_GEMM, 0, PRIV_STORAGE_VARIABLE_SET);
+
+    vecLenA = gset.tileA.vecLen;
+
+    // channel offset based coordinate; the offset is rounded down to a
+    // multiple of the vector length of tile A
+    ksprintf(&exprK, "( get_group_id(0)*%lu + k )",
+             (unsigned long)(staggered/vecLenA*vecLenA));
+
+    // starting code generation--------------------------------------------------
+    pCtx = createKgenContext(pBuf, buflen, true);
+    if ( pCtx == NULL) {
+        return -ENOMEM;
+    }
+
+    //define required macros
+    /* B_BLK_H should be one of common vector sizes,
+    as matrix C is accessed by vectors of this length*/
+    sprintf(tmp,"#define A_BLK_H %lu\n",(unsigned long)gset.subdims[1].y);
+    kgenAddStmt(pCtx,tmp);
+    sprintf(tmp,"#define B_BLK_H %lu\n",(unsigned long)gset.subdims[1].x);
+    kgenAddStmt(pCtx,tmp);
+    sprintf(tmp,"#define SUBG_ITEMS %d\n",pPGran->wgSize[0]);
+    kgenAddStmt(pCtx,tmp);
+
+    sprintf(tmp,"#define SUBG_A %d\n",subgroupsA);
+    kgenAddStmt(pCtx,tmp);
+    sprintf(tmp,"#define SUBG_B %d\n",subgroupsB);
+    kgenAddStmt(pCtx,tmp);
+
+    kgenAddBlankLine(pCtx);
+
+    sprintf(
+        tmp,
+        "#define K_VLEN_A %u\n"
+        "#define K_VLEN_B %u\n",
+        getVecLen(&gset, CLBLAS_GEMM, MATRIX_A),
+        getVecLen(&gset, CLBLAS_GEMM, MATRIX_B));
+
+    kgenAddStmt(pCtx,tmp);
+    kgenAddBlankLine(pCtx);
+
+    // Declare pointer unions
+    kgenDeclareUptrs(pCtx, isDoubleBased);
+    kgenAddBlankLine(pCtx);
+
+    // declaring kernel function
+    declareKernel( pCtx, &gset, "Subgroup" );
+    ret = kgenBeginFuncBody( pCtx );
+    // kernel generation steps:
+
+    // register variables declarations-----------------------------------------
+
+    // K tail
+    // if postfetch should be engaged, generate tail code for
+    // whole subgroup, otherwise tail is handled by main cycle.
+    if( tailsLowerK ){
+        sprintf(tmp,
+            "uint Ktail = K %% %lu;\n"
+            "uint Kbase = K - Ktail;\n",
+            (unsigned long)pSubDims[0].bwidth);
+
+        kgenAddStmt(pCtx, tmp);
+        alignedK = "Kbase";
+    }
+    else {
+        alignedK = "K";
+    }
+    vnames->sizeK = alignedK;
+
+    declareTileStorages(pCtx, &gset);
+
+    // scaling leading dims
+    // If lower-K tails need to be handled, vectorized access is disabled
+    // scaling is performed by factor 1
+    sprintf(tmp, "%s /= K_VLEN_A;\n", vnames->lda);
+    kgenAddStmt(pCtx, tmp);
+    sprintf(tmp, "%s /= K_VLEN_B;\n", vnames->ldb);
+    kgenAddStmt(pCtx, tmp);
+
+    //declare variables for subgroup mode
+    subVNames.itemId = "itemId";
+
+    kgenAddBlankLine( pCtx );
+
+    kgenPrintf( pCtx, "int2 %s;\n", subVNames.itemId );
+
+    // item id
+    kgenPrintf( pCtx,
+        "%s.x = get_local_id(0);\n",
+        subVNames.itemId );
+
+    // subgroup id
+    kgenPrintf( pCtx,
+        "%s.y = get_local_id(1);\n",
+        subVNames.itemId );
+
+    kgenAddBlankLine( pCtx );
+
+    // coordinate variables
+    vnames->coordA = "coordY";
+    vnames->coordB = "coordX";
+
+    // generate offsets
+    genShiftPointers( pCtx, &gset, kflags, true );
+
+    // FIXME add new subgroup variables support
+    sprintf(tmp, "int %s = "
+                    "A_BLK_H*( "
+                        "get_group_id(1)*SUBG_A + "
+                        "get_local_id(1)/SUBG_B );\n",
+            vnames->coordA);
+    kgenAddStmt(pCtx, tmp);
+
+    sprintf(tmp, "int %s = "
+                    "B_BLK_H*( "
+                        "get_group_id(0)*SUBG_B + "
+                        "get_local_id(1)%%SUBG_B );\n",
+            vnames->coordB);
+
+    kgenAddStmt(pCtx, tmp);
+    kgenAddBlankLine(pCtx);
+
+    // Block M N tails. Drop excess blocks ------------------------------------
+    kgenAddStmt(pCtx,"uint skipTileMul = 0;\n");
+    //M
+    if( tailsM ){
+
+        kgenAddStmt(pCtx,"//M block tail\n");
+
+        sprintf(tmp,
+            "if( %s >= %s )",
+            vnames->coordA,
+            vnames->sizeM);
+
+        kgenBeginBranch( pCtx,tmp );
+        kgenAddStmt(pCtx,"skipTileMul = 1;\n");
+        kgenEndBranch(pCtx,NULL);
+
+    }
+
+    //N
+    if( tailsN ){
+
+        kgenAddStmt(pCtx,"//N block tail\n");
+
+        sprintf(tmp,
+            "if( %s >= %s )",
+            vnames->coordB,
+            vnames->sizeN);
+
+        kgenBeginBranch( pCtx,tmp );
+        kgenAddStmt(pCtx,"skipTileMul = 1;\n");
+        kgenEndBranch(pCtx,NULL);
+
+    }
+    kgenAddBlankLine(pCtx);
+
+    //"Lower" tails
+    if( tailsLowerM || tailsLowerN ){
+        kgenAddStmt(pCtx, "//Raising \"Lower\" M N tails\n");
+    }
+    tailStatus = checkGenAdjustTailCoords(pCtx, CLBLAS_GEMM, &gset, NULL);
+
+    // A, B pointers-----------------------------------------------------------
+
+    sprintf(tmp,
+            "A += %s*%s;\n",
+            vnames->lda,
+            vnames->coordA);
+
+    kgenAddStmt(pCtx, tmp);
+
+    sprintf(tmp,
+        "B += %s*%s;\n",
+        vnames->ldb,
+        vnames->coordB);
+
+    kgenAddStmt(pCtx, tmp);
+
+    // calculated in vectors, C access is aligned to.
+    // if a row of the C-block is split into smaller vectors -
+    // multiply offset by number of these vectors
+
+    kgenAddBlankLine(pCtx);
+
+    genZeroTile( pCtx, &gset.tileCY );
+
+    kgenAddBlankLine(pCtx);
+    kgenAddBlankLine(pCtx);
+
+    mulOpts.fctx = createFetchContext();
+    if (mulOpts.fctx == NULL) {
+        destroyKgenContext(pCtx);
+        return -ENOMEM;
+    }
+
+    enableFetchOptLevels(mulOpts.fctx,
+                         FOPTLEV_CAN_SHARE_TMP_AB);
+
+    addrMode = setDefaultFetchAddrMode(mulOpts.fctx,
+                                       &gset,
+                                       FETCH_ADDR_K_RELATIVE,
+                                       tailStatus,
+                                       false);
+
+    // A and B were advanced to the tile above, K starts at an offset
+    addrMode |= FETCH_ADDR_A_RELATIVE |
+                FETCH_ADDR_B_RELATIVE |
+                FETCH_ADDR_K_CYCLICAL;
+
+    setFetchAddrMode(mulOpts.fctx, addrMode);
+    prepareFetchLoop(pCtx,
+                     mulOpts.fctx,
+                     &gset,
+                     CLMEM_GLOBAL_MEMORY,
+                     CLMEM_GLOBAL_MEMORY);
+
+    if( tailsM || tailsN ){
+        kgenBeginBranch(pCtx,"if( !skipTileMul )");
+    }
+
+    // every item handles vecLenA elements of K per step, the whole
+    // subgroup handles vecLenA * SUBG_ITEMS of them
+    sprintf(tmp,
+            "for(int k = %u*get_local_id(0); k < %s; k += %u*SUBG_ITEMS)",
+            vecLenA,
+            alignedK,
+            vecLenA);
+    kgenBeginBranch( pCtx, tmp );
+
+    // tiles multiplier--------------------------------------------------------
+
+    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
+    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
+
+    mulOpts.core    = ((kflags & KEXTRA_ENABLE_MAD) != 0) ? TILEMUL_MAD :
+                                                            TILEMUL_MULADD;
+
+    mulOpts.flags = kextraToTilemulFlags( CLBLAS_GEMM, kflags );
+    mulOpts.flags |= TILEMUL_EXTERN_RDECL;
+    mulOpts.flags |= TILEMUL_NOT_INC_K;
+    mulOpts.flags |= TILEMUL_BW_STRIDE;
+    /* both matrices are accessed row - major */
+    mulOpts.flags |= TILEMUL_TRB;
+
+    ret = tileMulGen( pCtx, &gset, &mulOpts );
+    if (ret != 0) {
+        goto out;
+    }
+
+    kgenEndBranch(pCtx, NULL);
+    kgenAddBlankLine(pCtx);
+
+    // K - Tail
+    if ( tailsLowerK ) {
+        setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_TAILK_PADD);
+
+        // the tail uses a plain 'k' variable and the real K as the limit
+        vnames->sizeK    = "K";
+        vnames->k        = "k";
+
+        kgenPrintf(pCtx,
+                   "uint %s = %s + get_local_id(0)*%u;\n",
+                   vnames->k,
+                   alignedK,
+                   vecLenA);
+
+        pfPriv.fetchNumA = 0;
+        pfPriv.wholeA = 0;
+        pfPriv.funcID = CLBLAS_GEMM;
+        pfPriv.gset = &gset;
+        mulOpts.postFetch = defaultTilePostFetch;
+        mulOpts.postFetchPriv = &pfPriv;
+
+        kgenBeginBranch(pCtx, NULL);
+        ret = tileMulGen(pCtx, &gset, &mulOpts);
+        if (ret != 0) {
+            goto out;
+        }
+        kgenEndBranch(pCtx, NULL);
+    }
+
+    if( tailsM || tailsN ){
+        kgenEndBranch(pCtx, NULL);          // skip tilemul condition
+    }
+    kgenAddBlankLine(pCtx);
+
+    upResFlags = kextraToUpresFlags(CLBLAS_GEMM, kflags) |
+                 tailStatusToUpresFlags(tailStatus);
+    // restore coordinates, if tail was raised
+    checkGenRestoreTailCoords(pCtx, &gset, tailStatus);
+    // merge and update result
+    mergeUpdateResult( pCtx,
+        CLBLAS_GEMM,
+        &gset,
+        &subVNames,
+        upResFlags |
+        UPRES_EXCEED_PROBLEM_CONDITION |
+        UPRES_INDEXING_WITH_CONSTANTS,
+        (UpresProcPtr)genResultUpdateWithFlags );
+    kgenEndFuncBody(pCtx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(pCtx) + 1;
+    }
+
+out:
+    destroyFetchContext(mulOpts.fctx);
+    destroyKgenContext(pCtx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Fill the argument list of the block kernel in the order of its
+ * parameters: M, N, K, alpha, beta, A, B, C, lda, ldb, ldc, followed by
+ * offA, offB and offC - each only if the respective "offset is not zero"
+ * flag is set in the kernel extra flags.
+ *
+ * args   - kernel arguments to fill
+ * params - pointer to the CLBlasKargs of the call
+ * extra  - pointer to the CLBLASKernExtra of the kernel
+ */
+static void
+assignBlockKargs(KernelArg *args, const void *params, const void *extra)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
+    KernelArg *offArg;
+
+    /* problem sizes */
+    initSizeKarg(&args[0], blasArgs->M);
+    initSizeKarg(&args[1], blasArgs->N);
+    initSizeKarg(&args[2], blasArgs->K);
+
+    /* scalars */
+    assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype);
+    assignScalarKarg(&args[4], &(blasArgs->beta), blasArgs->dtype);
+
+    /* matrices */
+    INIT_KARG(&args[5], blasArgs->A);
+    INIT_KARG(&args[6], blasArgs->B);
+    INIT_KARG(&args[7], blasArgs->C);
+
+    /* leading dimensions */
+    initSizeKarg(&args[8], blasArgs->lda.matrix);
+    initSizeKarg(&args[9], blasArgs->ldb.matrix);
+    initSizeKarg(&args[10], blasArgs->ldc.matrix);
+
+    /* optional offsets are packed right after the leading dimensions */
+    offArg = &args[11];
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        initSizeKarg(offArg, blasArgs->offA);
+        offArg++;
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        initSizeKarg(offArg, blasArgs->offBX);
+        offArg++;
+    }
+    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
+        initSizeKarg(offArg, blasArgs->offCY);
+    }
+}
+
+/*
+ * Either validate a decomposition for the block kernel or calculate the
+ * work group granularity for it.
+ *
+ * pgran      - granularity; checked in the PGRAN_CHECK mode, filled by
+ *              calcPgranDedicated() otherwise
+ * subdims    - subproblem dimensions
+ * subdimsNum - unused
+ * dtype      - data type of the problem
+ * check      - PGRAN_CHECK to validate, any other value to calculate
+ *
+ * In the check mode the decomposition is accepted only if it passes
+ * decompSanityCheck(), both levels use the same bwidth and the work group
+ * consists of exactly 64 items. In the calculation mode true is always
+ * returned.
+ */
+static bool
+blockCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check)
+{
+    unsigned int minSize, maxSize;
+
+    DUMMY_ARG_USAGE(subdimsNum);
+
+    if (check != PGRAN_CHECK) {
+        calcPgranDedicated(pgran, subdims, 1, 3);
+        return true;
+    }
+
+    /* size limits passed to the sanity check depend on the data type */
+    if (dtype == TYPE_COMPLEX_DOUBLE) {
+        maxSize = 4;
+        minSize = 1;
+    }
+    else {
+        maxSize = 8;
+        minSize = 2;
+    }
+
+    if (!decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true)) {
+        return false;
+    }
+    if (subdims[0].bwidth != subdims[1].bwidth) {
+        return false;
+    }
+
+    return (pgran->wgSize[0] * pgran->wgSize[1] == 64);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Fill the kernel argument list for the subgroup GEMM kernel.
+ *
+ * The layout matches the block kernel: M, N, K, alpha, beta, A, B, C,
+ * lda, ldb, ldc in slots 0..10, then the offsets of A, B and C, each
+ * present only when its KEXTRA_*_OFF_NOT_ZERO flag is set.
+ *
+ * args   - kernel argument array to fill
+ * params - CLBlasKargs describing the problem
+ * extra  - CLBLASKernExtra the kernel was generated for
+ */
+static void
+assignSubgKargs(KernelArg *args, const void *params, const void *extra)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)extra;
+    KernelExtraFlags flags = kextra->flags;
+    /* slot that receives the next optional offset */
+    KernelArg *cursor = &args[11];
+
+    initSizeKarg(&args[0], kargs->M);
+    initSizeKarg(&args[1], kargs->N);
+    initSizeKarg(&args[2], kargs->K);
+    assignScalarKarg(&args[3], &(kargs->alpha), kargs->dtype);
+    assignScalarKarg(&args[4], &(kargs->beta), kargs->dtype);
+    INIT_KARG(&args[5], kargs->A);
+    INIT_KARG(&args[6], kargs->B);
+    INIT_KARG(&args[7], kargs->C);
+    initSizeKarg(&args[8], kargs->lda.matrix);
+    initSizeKarg(&args[9], kargs->ldb.matrix);
+    initSizeKarg(&args[10], kargs->ldc.matrix);
+
+    if (flags & KEXTRA_A_OFF_NOT_ZERO) {
+        initSizeKarg(cursor++, kargs->offA);
+    }
+    if (flags & KEXTRA_BX_OFF_NOT_ZERO) {
+        initSizeKarg(cursor++, kargs->offBX);
+    }
+    if (flags & KEXTRA_CY_OFF_NOT_ZERO) {
+        initSizeKarg(cursor++, kargs->offCY);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Choose the inner decomposition axis from the storage order and the
+ * transposition flags of A and B.
+ *
+ * Returns DECOMP_AXIS_X only when the flag computed for B is set and the
+ * one computed for A is clear; DECOMP_AXIS_Y in every other case.
+ */
+static DecompositionAxis
+innerDecompositionAxis(const void *args)
+{
+    const CLBlasKargs *kargs = args;
+    /*
+     * Each flag is the exclusive-or of "stored in the given order" and
+     * "transposed"; for 0/1 operands that is the same as inequality.
+     */
+    const int flagA = (kargs->order == clblasColumnMajor) !=
+                      (kargs->transA != clblasNoTrans);
+    const int flagB = (kargs->order == clblasRowMajor) !=
+                      (kargs->transB != clblasNoTrans);
+
+    /*
+     * Matrix B becomes the inner one when each thread would access its
+     * elements with a large stride while the elements of matrix A are
+     * accessed sequentially; this gives more coalesced memory accesses.
+     */
+    if (flagB && !flagA) {
+        return DECOMP_AXIS_X;
+    }
+
+    return DECOMP_AXIS_Y;
+}
+
+//-----------------------------------------------------------------------------
+
+/* Solver flags of these patterns: only SF_WSPACE_2D is reported. */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags flags = SF_WSPACE_2D;
+
+    return flags;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Set the "staggered" value stored in the solver private data of the
+ * kernel extra, based on K, the data type and the bwidth of subdims[1].
+ *
+ * When K is a multiple of (wideChans * nChans / sizeType[dtype]) the value
+ * is roundUp(bwidth * sizeType[dtype], wideChans / sizeType[dtype]);
+ * otherwise it is 0.
+ */
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    const size_t nChans = 8; /* !!!DEVICE DEPENDENT!!! */
+    const size_t wideChans = 64; /* !!!DEVICE DEPENDENT!!! */
+    /* per-type factor indexed by DataType (presumably the element size in
+     * 4-byte units -- TODO confirm) */
+    const size_t sizeType[] = {1,2,2,4};
+
+    CLBlasKargs *kargs = (CLBlasKargs*)args;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    extraData_t *priv = (extraData_t*)&kextra->solverPriv;
+    const size_t typeFactor = sizeType[kargs->dtype];
+    const size_t period = wideChans * nChans / typeFactor;
+
+    if (kargs->K % period != 0) {
+        priv->staggered = 0;
+    }
+    else {
+        priv->staggered = roundUp(subdims[1].bwidth * typeFactor,
+                                  wideChans / typeFactor);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Initialize the memory pattern descriptor of the cached block GEMM.
+ *
+ * The file-level mpatExtra is set up for A and B kept in plain buffers at
+ * the L1 memory level and is attached to the pattern together with the
+ * block solver operations.
+ */
+void
+InitGEMMCachedBlockPattern(MemoryPattern *mempat)
+{
+    /* pattern specific extra information shared through mpatExtra */
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+
+    /* generic pattern description */
+    mempat->name = "Cached global memory based block gemm";
+    mempat->sops = &blockSOps;
+    mempat->extra = &mpatExtra;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Performance estimate of the block pattern for the given kernel flags.
+ *
+ * Returns PPERF_GOOD when at least one of A and B is accessed in column
+ * major fashion, PPERF_AVERAGE when neither is. args is not used.
+ */
+static int
+blockGetPerf(
+    unsigned int kflags,
+    const void *args)
+{
+    (void)args;
+
+    if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A) ||
+        isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) {
+
+        return PPERF_GOOD;
+    }
+
+    return PPERF_AVERAGE;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Initialize the memory pattern descriptor of the cached subgroup GEMM.
+ *
+ * Uses the same file-level mpatExtra as the block pattern (A and B in
+ * plain buffers at the L1 memory level) and attaches the subgroup solver
+ * operations.
+ */
+void
+InitGEMMCachedSubgroupPattern(MemoryPattern *mempat)
+{
+    /* pattern specific extra information shared through mpatExtra */
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+
+    /* generic pattern description */
+    mempat->name = "Cached global memory based subgroup gemm";
+    mempat->sops = &subgSOps;
+    mempat->extra = &mpatExtra;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Default decomposition for the subgroup GEMM pattern.
+ *
+ * Marks the work group as two-dimensional and delegates the rest to
+ * subgGetDefaultDecomp(), whose result is returned unchanged.
+ * subdimsNum is not used.
+ */
+static int
+gemmSubgGetDefaultDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs )
+{
+    int status;
+
+    DUMMY_ARG_USAGE(subdimsNum);
+
+    pgran->wgDim = 2;
+    status = subgGetDefaultDecomp(pgran, subdims, pArgs);
+
+    return status;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Validate a two-level decomposition for the subgroup GEMM pattern and
+ * either check or compute the matching work-group granularity.
+ *
+ * subdims[0] describes the block handled by a work group, subdims[1] the
+ * block handled by a subgroup. The decomposition is rejected when
+ *  - fewer than two levels are given or a pointer is NULL,
+ *  - any x, y or bwidth is zero,
+ *  - x/y of level 1 differ from its itemX/itemY,
+ *  - the group block is not a whole number of subgroup blocks,
+ *  - a level 1 dimension is below 2 (except for double complex) or above 8,
+ *  - the estimated register usage reaches 50 registers of 16 bytes.
+ *
+ * With check == PGRAN_CHECK, pgran must additionally match the
+ * decomposition, have at least 2 items per subgroup and 64 items in total.
+ * Otherwise pgran->wgSize is computed from the decomposition.
+ * On success pgran->wgDim is set to 2 and true is returned.
+ */
+static bool
+subgCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check)
+{
+    const SubproblemDim *grp;   /* work group level */
+    const SubproblemDim *subg;  /* subgroup level */
+    unsigned int nSubgA;
+    unsigned int nSubgB;
+    unsigned int subgItems;
+    unsigned int regs;
+
+    /* EINVAL */
+    if (subdimsNum < 2 || pgran == NULL || subdims == NULL) {
+        return false;
+    }
+    grp = &subdims[0];
+    subg = &subdims[1];
+
+    /* no dimension may be empty */
+    if (!grp->x || !grp->y || !grp->bwidth ||
+        !subg->x || !subg->y || !subg->bwidth) {
+        return false;
+    }
+
+    if (subg->x != subg->itemX || subg->y != subg->itemY) {
+        return false;
+    }
+
+    /* the group block must consist of integer number of subgroup blocks */
+    if ((grp->x % subg->x) || (grp->y % subg->y) ||
+        (grp->bwidth % subg->bwidth)) {
+        return false;
+    }
+
+    /* only double complex may use subgroup dimensions smaller than 2 */
+    if (!isDoubleBasedType(dtype) || !isComplexType(dtype)) {
+        if (subg->x < 2 || subg->y < 2 || subg->bwidth < 2) {
+            return false;
+        }
+    }
+
+    /* upper limit for the subgroup dimensions */
+    if (subg->x > 8 || subg->y > 8 || subg->bwidth > 8) {
+        return false;
+    }
+
+    /*
+     * Estimate the register usage (tiles of A, B and C held by one item,
+     * 16 bytes per register) and drop decompositions that are bound to
+     * be slow.
+     */
+    regs = (subg->bwidth * subg->x +
+            subg->bwidth * subg->y +
+            subg->x * subg->y) * dtypeSize(dtype);
+    if (regs / 16 >= 50) {
+        return false;
+    }
+
+    /* derive the work group shape implied by the decomposition */
+    subgItems = grp->bwidth / subg->bwidth;
+    nSubgA = grp->y / subg->y;
+    nSubgB = grp->x / subg->x;
+
+    if (check != PGRAN_CHECK) {
+        /* PGranularity should be calculated */
+        pgran->wgSize[0] = subgItems;
+        pgran->wgSize[1] = nSubgA * nSubgB;
+    }
+    else {
+        /* passed PGranularity should be checked */
+        if (pgran->wgSize[0] != subgItems ||
+            pgran->wgSize[1] != nSubgA * nSubgB) {
+            return false;
+        }
+
+        /* filter subgroups with poor performance (less than 2 items) */
+        if (pgran->wgSize[0] < 2) {
+            return false;
+        }
+
+        /* drop groups consisting of number of items other than 64 */
+        if (pgran->wgSize[0] * pgran->wgSize[1] != 64) {
+            return false;
+        }
+    }
+
+    pgran->wgDim = 2;
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Compute the two-dimensional global work size for the subgroup pattern.
+ *
+ * The problem is covered by blocks of subdims[0].x columns (N direction)
+ * and subdims[0].y rows (M direction); a partial block at the end of either
+ * direction counts as one more group. Dimension 0 gets wgSize[0] items per
+ * group along N, dimension 1 gets wgSize[1] items per group along M.
+ *
+ * Nothing is written to threads when any pointer argument is NULL.
+ */
+static void
+subgCalcGlobalThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra
+)
+{
+    const CLBlasKargs *kargs;
+    size_t groupsN;
+    size_t groupsM;
+
+    /* EINVAL */
+    if (!subdims || !pgran || !args || !extra) {
+        return;
+    }
+    kargs = (const CLBlasKargs*)args;
+
+    /* full groups plus one tail group when the size is not a multiple */
+    groupsN = kargs->N / subdims[0].x;
+    if (kargs->N % subdims[0].x) {
+        groupsN++;
+    }
+    groupsM = kargs->M / subdims[0].y;
+    if (kargs->M % subdims[0].y) {
+        groupsM++;
+    }
+
+    threads[0] = groupsN * pgran->wgSize[0];
+    threads[1] = groupsM * pgran->wgSize[1];
+}
+
+//-----------------------------------------------------------------------------
+/*
+ * Performance estimate of the subgroup pattern for the given kernel flags.
+ *
+ * Returns PPERF_GOOD when neither A nor B is accessed in column major
+ * fashion and PPERF_NOT_SUPPORTED otherwise. args is not used.
+ */
+static int
+subgGetPerf(
+    unsigned int kflags,
+    const void *args)
+{
+    DUMMY_ARG_USAGE(args);
+
+    if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A) ||
+        isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) {
+
+        return PPERF_NOT_SUPPORTED;
+    }
+
+    return PPERF_GOOD;
+}
diff --git a/src/library/blas/gens/gemm_cached.cpp b/src/library/blas/gens/gemm_cached.cpp
new file mode 100644
index 0000000..09231f9
--- /dev/null
+++ b/src/library/blas/gens/gemm_cached.cpp
@@ -0,0 +1,503 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Cached global buffers based gemm generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <kprintf.hpp>
+#include <gemm.clT>
+#include <gemm_helper.clT>
+#include <symm_helper.clT>
+#include <solution_seq.h>
+#include "tuned_numbers.h"
+
+//#define DEBUG_GEMM_2
+// Pattern specific extra data; filled in and attached to the memory pattern
+// by initGemmV2CachedPattern().
+static CLBLASMpatExtra mpatExtra;
+
+// Per data type prefix character ('S', 'D', 'C', 'Z'), indexed by DataType.
+// Populated in initGemmV2CachedPattern() and handed to kprintf in generator().
+static char Prefix[4];
+
+/* Function, finding default decomposition */
+static int
+getDefaultDecomposition(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs);
+
+// Produces the kernel source into buf; with buf == NULL only reports the
+// required buffer size.
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+// Fills the kernel argument array from the CLBlasKargs passed as params.
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+// Reports the solver flags of this pattern.
+static SolverFlags
+solverFlags(void);
+
+// Appends the -D build options required by the kernel to buildOptStr.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args);
+
+// Computes the global work size for the kernel launch.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+// Solver operations table of the cached GEMM pattern.
+// The initializer is positional, so the order of the entries must follow the
+// member order of SolverOps (declared elsewhere). Entries left NULL are
+// operations this pattern does not provide; the ones provided are generator,
+// assignKargs, calcNrThreads, solverFlags, getDefaultDecomposition and
+// setBuildOpts.
+static SolverOps gemmSops = {
+    generator,
+    assignKargs,
+    NULL,
+    NULL,
+   	NULL,
+    calcNrThreads,
+    NULL,
+    solverFlags,
+    NULL,
+	getDefaultDecomposition,
+	NULL,
+	setBuildOpts,
+	NULL
+};
+
+/*
+ * Compute the global work size for the cached GEMM kernel.
+ *
+ * threads[1] is always based on 1. When both subdims->x and subdims->y are
+ * in use, threads[0] is based on the number of work groups needed to cover
+ * the M x N problem with blocks of subdims->y rows and subdims->x columns
+ * (partial blocks count as a whole group). For CLBLAS_HERK only
+ * nrGroupsY * (nrGroupsY + 1) / 2 groups are launched, i.e. the blocks of one
+ * triangle including the diagonal. Both entries are finally multiplied by
+ * the work group size when pgran is given.
+ *
+ * threads - receives the global work size
+ * subdims - decomposition; only the first level is read
+ * pgran   - work group granularity, may be NULL
+ * args    - CLBlasKargs of the problem
+ * extra   - unused
+ *
+ * NOTE(review): when either subdimension is SUBDIM_UNUSED, threads[0] is not
+ * assigned here and is only scaled by pgran->wgSize[0]; the caller has to
+ * provide its initial value in that case -- confirm against the callers.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const size_t M = kargs->M;
+    const size_t N = kargs->N;
+
+    (void)extra;
+
+    threads[1] = 1;
+
+    if ((subdims->x != SUBDIM_UNUSED) &&
+        (subdims->y != SUBDIM_UNUSED)) {
+
+        const size_t groupWorkX = subdims->x;
+        const size_t groupWorkY = subdims->y;
+        // round up so that the tail blocks get their own work group
+        const size_t nrGroupsX = N / groupWorkX + ((N % groupWorkX) ? 1 : 0);
+        const size_t nrGroupsY = M / groupWorkY + ((M % groupWorkY) ? 1 : 0);
+
+        if (kargs->pigFuncID == CLBLAS_HERK)
+        {
+            // only one triangle of the result (diagonal included) is computed
+            threads[0] = (nrGroupsY * (nrGroupsY + 1)) / 2;
+        }
+        else
+        {
+            threads[0] = nrGroupsX * nrGroupsY;
+        }
+    }
+
+    if (pgran != NULL) {
+        threads[0] *= pgran->wgSize[0];
+        threads[1] *= pgran->wgSize[1];
+    }
+}
+
+/*
+ * Append the preprocessor defines needed to build the cached GEMM kernel
+ * to buildOptStr.
+ *
+ * args points to the SolutionStep being built. The options depend on the
+ * data type, on whether the device specific block size needs a barrier,
+ * on M/N not being multiples of the first level subproblem dimensions, on
+ * the conjugation flags and on the function the kernel is generated for
+ * (SYMM/HEMM family or HERK).
+ *
+ * buildOptStr must already hold a NUL-terminated string and be large enough
+ * for the appended options: strcat() does no bounds checking.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    SolutionStep *step = (SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+    const SubproblemDim *dims = step->subdims;
+    KernelExtraFlags kflags = step->extraFlags;
+
+    blockSizes bestSize = bestBlockSizeForDevice( step );
+
+    // data type dependent options
+    if ((kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE))
+    {
+        strcat(buildOptStr, " -DDOUBLE_PRECISION ");
+    }
+    if (isComplexType(kargs->dtype))
+    {
+        strcat(buildOptStr, " -DCOMPLEX ");
+    }
+
+    // device dependent option
+    if (bestSize.useBarrier == 1)
+    {
+        strcat(buildOptStr, " -DGEMM_NEEDS_BARRIER ");
+    }
+
+    // problem size does not fit the block size
+    if ((kargs->M % dims->y) != 0)
+    {
+        strcat(buildOptStr, " -DM_TAIL_PRESENT ");
+    }
+    if ((kargs->N % dims->x) != 0)
+    {
+        strcat(buildOptStr, " -DN_TAIL_PRESENT ");
+    }
+
+    // conjugation requests
+    if (kflags & KEXTRA_CONJUGATE_A)
+    {
+        strcat(buildOptStr, " -DCONJUGATE_A ");
+    }
+    if (kflags & KEXTRA_CONJUGATE_B)
+    {
+        strcat(buildOptStr, " -DCONJUGATE_B ");
+    }
+
+    // options specific to the function this kernel is built for
+    const bool symmFamily =
+        (kargs->pigFuncID == CLBLAS_HEMM) ||
+        (kargs->pigFuncID == CLBLAS_SYMM) ||
+        (kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) ||
+        (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL);
+
+    if (symmFamily)
+    {
+        #ifdef DEBUG_GEMM_2
+        printf("GEMM2: setBuildOpts: Setting options for SYMM\n");
+        #endif
+
+        if (kargs->side == clblasLeft)
+        {
+            strcat(buildOptStr, " -D__SYMM_LEFT__ ");
+        }
+        else if (kargs->side == clblasRight)
+        {
+            strcat(buildOptStr, " -D__SYMM_RIGHT__ ");
+        }
+
+        if (kargs->uplo == clblasLower)
+        {
+            strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+        }
+        else if (kargs->uplo == clblasUpper)
+        {
+            strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+        }
+
+        // Define the order for Legacy sake.
+        strcat(buildOptStr,
+               (kargs->order == clblasColumnMajor) ? " -D__SYMM_COLMAJOR__ "
+                                                   : " -D__SYMM_ROWMAJOR__ ");
+
+        if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) ||
+            (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
+        {
+            strcat(buildOptStr, " -D__SYMM_DIAGONAL__ ");
+        }
+        if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)
+        {
+            strcat(buildOptStr, " -D__HEMM__ ");
+        }
+    }
+    else if (kargs->pigFuncID == CLBLAS_HERK)
+    {
+        strcat(buildOptStr, " -DHERK");
+        if (kargs->uplo == clblasLower)
+        {
+            strcat(buildOptStr, " -DHERK_LOWER_TRIANGLE");
+        }
+        else if (kargs->uplo == clblasUpper)
+        {
+            strcat(buildOptStr, " -DHERK_UPPER_TRIANGLE");
+        }
+    }
+
+    #ifdef DEBUG_GEMM_2
+    printf("buildStr: %s\n", buildOptStr);
+    #endif
+}
+
+/*
+ * Generate the OpenCL source of the cached GEMM kernel.
+ *
+ * buf     - destination buffer; NULL turns the call into a size query
+ * buflen  - size of buf in bytes
+ * subdims - decomposition; y, x, itemY and itemX of the first level are used
+ * pgran   - work group granularity (only read for the debug trace)
+ * extra   - CLBLASKernExtra describing the kernel; its vecLen fields are
+ *           overwritten with 1 when the geometry is incompatible with the
+ *           requested vector length (see the FIXME below)
+ *
+ * Returns the fixed buffer size of 64K, both for the size query and after
+ * generating the kernel, or 0 when row major order is requested (not
+ * supported here).
+ *
+ * The template is expanded by kprintf::spit() without a length argument, so
+ * buf has to provide the full 64K reported by the size query. The unused
+ * remainder of the buffer is zeroed, limited to buflen.
+ *
+ * NOTE(review): for the transA && transB combination the template is emptied,
+ * so an empty kernel source is produced and the full size is still returned.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    // size of the source buffer reported to and expected from the caller
+    const size_t kernelBufSize = 64*1024*sizeof(char);
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    DataType dtype = kextra->dtype;
+    char tempTemplate[64*1024]; //PENDING: Is it safe to have 64K in stack for threadSafety?
+    // 24 bytes hold any unsigned long printed in decimal plus the terminator
+    char itemx[24], itemy[24], width[24], itemy_by_width[24], itemx_by_width[24];
+    char bwidth[24], panel_by_v[24];
+    size_t Y, BLOCKSIZE, ITEMX, ITEMY;
+    bool doVLOAD = false;
+    unsigned int veclen;
+
+    if (buf == NULL)
+    {
+        return (ssize_t)kernelBufSize;
+    }
+
+    //
+    // PENDING: Add Support for Row Major
+    //
+    if ((kflags & KEXTRA_COLUMN_MAJOR) == 0)
+    {
+        return 0;
+    }
+
+    if ((kflags & KEXTRA_NO_COPY_VEC_A) || (kflags & KEXTRA_NO_COPY_VEC_B) || (kflags  & KEXTRA_NO_COPY_VEC_C))
+    {
+        #ifdef DEBUG_GEMM_2
+        printf("GEMM2: Doing un-aligned access\n");
+        #endif
+        doVLOAD = true;
+    } else {
+        #ifdef DEBUG_GEMM_2
+        printf("GEMM2: Doing Aligned access\n");
+        #endif
+    }
+
+    BLOCKSIZE = pgran->wgSize[0];
+    #ifdef DEBUG_GEMM_2
+    printf("GEMM2- generator(): Blocksize passed = %lu, subdimy = %lu, subdimx = %lu, veclen = %d \n",
+                                (unsigned long)BLOCKSIZE, (unsigned long)subdims->y,
+                                (unsigned long)subdims->x, kextra->vecLen);
+    #endif
+
+    veclen = kextra->vecLen;
+
+    ITEMY = subdims->itemY;
+    ITEMX = subdims->itemX;
+    // number of work items along Y; substituted for %WIDTH in the template
+    Y = subdims->y / ITEMY;
+
+    //
+    // Handle in-compatible subdims and workgroup sizes
+    // We will use "veclen" of 1 as our shield against these in-compatible
+    // geometries.
+    //
+    if ( (ITEMY % kextra->vecLen) || ((ITEMX % kextra->vecLen) && (kflags & KEXTRA_TRANS_B)) )
+    {
+        //
+        // FIXME:
+        // This kernel must be stored against vecLen of 1 in Kernel Cache.
+        // This needs change in EXTRA structure. However, this is against the API.
+        // We are going against the API by changing fields in EXTRA structure.
+        // One alternate FIX is to return an error.
+        //
+        kextra->vecLen = kextra->vecLenA = kextra->vecLenB = kextra->vecLenC = 1;
+
+        doVLOAD = true;
+        veclen = 1;
+    }
+
+    //
+    // PENDING: Selective Vectorization for A, B and C access has to be added
+    //          in KPRINTF module (VLOADA, VLOADB, VLOADC, VSTOREC)
+    //
+    kprintf kobj(Prefix[dtype], veclen, doVLOAD, doVLOAD); // Only Vectored Access
+
+    // size_t is not unsigned long on every platform (e.g. 64-bit Windows),
+    // so the values are converted explicitly to match the "%lu" conversion.
+    sprintf(width, "%lu", (unsigned long)Y);
+    sprintf(itemy, "%lu", (unsigned long)ITEMY);
+    sprintf(itemx, "%lu", (unsigned long)ITEMX);
+    sprintf(itemy_by_width, "%lu", (unsigned long)(ITEMY/veclen));
+    sprintf(itemx_by_width, "%lu", (unsigned long)(ITEMX/veclen));
+    //sprintf(bwidth, "%lu", subdims->bwidth);
+    //sprintf(panel_by_v, "%lu", (subdims->bwidth / veclen));
+    sprintf(bwidth, "%lu", (unsigned long)veclen);
+    sprintf(panel_by_v, "%lu", (unsigned long)1);
+
+    kobj.put("%WIDTH", width);
+    kobj.put("%ITEMX", itemx);
+    kobj.put("%ITEMY", itemy);
+    kobj.put("%ITEMY_BY_V", itemy_by_width);
+    kobj.put("%ITEMX_BY_V", itemx_by_width);
+    kobj.put("%PANEL", bwidth);
+    kobj.put("%PANEL_BY_V", panel_by_v);
+    #ifdef DEBUG_GEMM_2
+    printf("ColMajor GEMM - WIDTH = %s, PANEL = %lu, ITEMX = %s, ITEMY = %s, Veclen = %lu\n", width, (unsigned long)subdims->bwidth, itemx, itemy, (unsigned long)veclen);
+    #endif
+
+    // assemble the template: helpers first, then the kernel for the
+    // requested transposition combination
+    strcpy(tempTemplate, SYMM_HEMM_HELPER);
+    if ((kflags & KEXTRA_TRANS_A) == 0)
+    {
+        if (kflags & KEXTRA_TRANS_B)
+        {
+            #ifdef DEBUG_GEMM_2
+            printf("Using GEMM_NT_KERNEL\n");
+            #endif
+            strcat(tempTemplate, GEMM_HELPER);
+            strcat(tempTemplate, GEMM_NT_KERNEL);
+        } else {
+            #ifdef DEBUG_GEMM_2
+            printf("Using GEMM_NN_KERNEL\n");
+            #endif
+            strcat(tempTemplate, GEMM_HELPER);
+            strcat(tempTemplate, GEMM_NN_KERNEL);
+        }
+    } else {
+        // PENDING:
+        if (kflags & KEXTRA_TRANS_B)
+        {
+            tempTemplate[0] = 0;
+        } else {
+            #ifdef DEBUG_GEMM_2
+            printf("Using GEMM_TN_KERNEL\n");
+            #endif
+            strcat(tempTemplate, GEMM_HELPER);
+            strcat(tempTemplate, GEMM_TN_KERNEL);
+        }
+    }
+    kobj.spit(buf, tempTemplate);
+    #ifdef DEBUG_GEMM_KPRINTF
+    printf("Kernel = \n%s\n", buf);
+    #endif
+
+    // zero the rest of the buffer, never writing past what the caller gave us
+    const size_t fillEnd = (buflen < kernelBufSize) ? buflen : kernelBufSize;
+    size_t tail = strlen(buf) + 1;
+    while (tail < fillEnd)
+    {
+        buf[tail++] = 0;
+    }
+    return (ssize_t)kernelBufSize;
+}
+
+// Fill the kernel argument list of the cached GEMM kernel from the
+// CLBlasKargs passed as params. The third parameter is unused.
+//
+// Argument order: A, B, C, M, N, K, lda, ldb, ldc, offA, offBX, offCY,
+// alpha, beta. The offsets are always passed, regardless of their value.
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+
+    #ifdef DEBUG_GEMM_2
+    printf("SAlpha=%f, DAlpha=%f, CAlpha =<%f, %f>, DAlpha=<%f, %f>\n",
+            blasArgs->alpha.argFloat, blasArgs->alpha.argDouble, CREAL(blasArgs->alpha.argFloatComplex), CIMAG(blasArgs->alpha.argFloatComplex),
+            CREAL(blasArgs->alpha.argDoubleComplex) , CIMAG(blasArgs->alpha.argDoubleComplex));
+    printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n",
+            blasArgs->beta.argFloat, blasArgs->beta.argDouble, CREAL(blasArgs->beta.argFloatComplex), CIMAG(blasArgs->beta.argFloatComplex),
+            CREAL(blasArgs->beta.argDoubleComplex) , CIMAG(blasArgs->beta.argDoubleComplex));
+    #endif
+
+    INIT_KARG(&args[0], blasArgs->A);   // buffer of matrix A
+    INIT_KARG(&args[1], blasArgs->B);   // buffer of matrix B
+    INIT_KARG(&args[2], blasArgs->C);   // buffer of matrix C
+    initSizeKarg(&args[3], blasArgs->M);
+    initSizeKarg(&args[4], blasArgs->N);
+    initSizeKarg(&args[5], blasArgs->K);
+    initSizeKarg(&args[6], blasArgs->lda.matrix);
+    initSizeKarg(&args[7], blasArgs->ldb.matrix);
+    initSizeKarg(&args[8], blasArgs->ldc.matrix);
+    initSizeKarg(&args[9], blasArgs->offA);
+    initSizeKarg(&args[10], blasArgs->offBX);
+    initSizeKarg(&args[11], blasArgs->offCY);
+    // scalars are passed with the width of the problem data type
+    assignScalarKarg(&args[12], &(blasArgs->alpha), blasArgs->dtype);
+    assignScalarKarg(&args[13], &(blasArgs->beta), blasArgs->dtype);
+    return;
+}
+
+// Solver flags of the cached GEMM pattern: only SF_WSPACE_1D is reported.
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags flags = SF_WSPACE_1D;
+
+    return flags;
+}
+
+// Initialize the memory pattern descriptor of the cached GEMM (version 2)
+// and the per data type prefix table used by the generator.
+//
+// The file-level mpatExtra is set up for A and B kept in plain buffers at
+// the L1 memory level and attached to the pattern together with gemmSops.
+extern "C"
+void
+initGemmV2CachedPattern(MemoryPattern *mempat)
+{
+    // prefix characters handed to kprintf, indexed by DataType
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    // pattern specific extra information
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+
+    // generic pattern description
+    mempat->name = "Cached global memory based block gemm";
+    mempat->sops = &gemmSops;
+    mempat->extra = &mpatExtra;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+// Provide the default decomposition for the cached GEMM kernel.
+//
+// The block sizes come from bestBlockSizeForDevice() for the solution step
+// that owns pgran. The work group is one-dimensional with TY * TX items; up
+// to two decomposition levels are filled with the same values: a block of
+// (TY * ITEMY) x (TX * ITEMX) elements, ITEMY x ITEMX elements per item and
+// a bwidth of 4. pArgs is not used. Always returns 0.
+static int
+getDefaultDecomposition(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs)
+{
+    DUMMY_ARG_USAGE(pArgs);
+    //
+    // FIXME:  container_of() - Counts on the fact that "getDefaultDecomposition" is called
+    //          with step->pgran, step->subdims
+    //
+    SolutionStep *step = container_of( pgran , pgran, SolutionStep);
+
+    blockSizes best = bestBlockSizeForDevice( step );
+
+    pgran->wgDim = 1;
+    pgran->wgSize[0] = best.TY * best.TX;
+    pgran->wgSize[1] = 1;
+
+    // both levels, when present, describe the same block
+    for (unsigned int level = 0; (level < 2) && (level < subdimsNum); level++)
+    {
+        SubproblemDim *dim = &subdims[level];
+
+        dim->y = best.TY * best.ITEMY;
+        dim->x = best.TX * best.ITEMX;
+        dim->itemY = best.ITEMY;
+        dim->itemX = best.ITEMX;
+        dim->bwidth = 4;
+    }
+
+    return 0;
+}
+
diff --git a/src/library/blas/gens/gemm_tail_cached.cpp b/src/library/blas/gens/gemm_tail_cached.cpp
new file mode 100644
index 0000000..ea79249
--- /dev/null
+++ b/src/library/blas/gens/gemm_tail_cached.cpp
@@ -0,0 +1,461 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Cached global buffers based gemm generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <kprintf.hpp>
+#include <gemm.clT>
+#include <symm_helper.clT>
+#include <solution_seq.h>
+
+// Tail detection helpers with C linkage; they are only declared here.
+extern "C" int
+gemmHasNTail(size_t N, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB);
+
+extern "C" int
+gemmHasMTail(size_t M, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB);
+
+
+//#define DEBUG_GEMM_TAIL
+// Pattern specific extra data of the tail pattern.
+static CLBLASMpatExtra mpatExtra;
+
+// Per data type prefix character handed to kprintf in generator(), indexed
+// by DataType (presumably filled by the pattern init function of this file,
+// which is outside this excerpt -- TODO confirm).
+static char Prefix[4];
+
+// Produces the tail kernel source into buf; with buf == NULL only reports
+// the required buffer size.
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+// Fills the kernel argument array from the CLBlasKargs passed as params.
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+// Reports the solver flags of this pattern.
+static SolverFlags
+solverFlags(void);
+
+// Appends the -D build options required by the tail kernel to buildOptStr.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args);
+
+// Computes the global work size needed to cover the tail regions.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+// Solver operations table of the GEMM tail pattern.
+// The initializer is positional, so the order of the entries must follow the
+// member order of SolverOps (declared elsewhere). Entries left NULL are
+// operations this pattern does not provide; the ones provided are generator,
+// assignKargs, calcNrThreads, solverFlags and setBuildOpts.
+static SolverOps gemmSops = {
+    generator,
+    assignKargs,
+    NULL,
+    NULL,
+   	NULL,
+    calcNrThreads,
+    NULL,
+    solverFlags,
+    NULL,
+	NULL,
+	NULL,
+	setBuildOpts,
+	NULL
+};
+
+/*
+ * Append the preprocessor defines needed to build the GEMM tail kernel to
+ * buildOptStr.
+ *
+ * args points to the SolutionStep being built. The tail kernel is always
+ * built with TAIL_RUN, M_TAIL_PRESENT and N_TAIL_PRESENT; further options
+ * depend on the data type, the conjugation flags and the function the
+ * kernel is generated for. A message is printed for a function ID this
+ * generator does not know.
+ *
+ * buildOptStr must already hold a NUL-terminated string and be large enough
+ * for the appended options: strcat() does no bounds checking.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *step = (const SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+    KernelExtraFlags kflags = step->extraFlags;
+
+    // options every tail kernel needs
+    strcat(buildOptStr, " -DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT ");
+
+    // data type dependent options
+    if ((kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE))
+    {
+        strcat(buildOptStr, " -DDOUBLE_PRECISION ");
+        #ifdef DEBUG_GEMM_TAIL
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+        #endif
+    }
+    if (isComplexType(kargs->dtype))
+    {
+        strcat(buildOptStr, " -DCOMPLEX ");
+    }
+
+    // conjugation requests
+    if (kflags & KEXTRA_CONJUGATE_A)
+    {
+        strcat(buildOptStr, " -DCONJUGATE_A ");
+    }
+    if (kflags & KEXTRA_CONJUGATE_B)
+    {
+        strcat(buildOptStr, " -DCONJUGATE_B ");
+    }
+
+    // options specific to the function this kernel is built for
+    switch (kargs->pigFuncID)
+    {
+        case CLBLAS_HEMM:
+        case CLBLAS_SYMM:
+        case CLBLAS_SYMM_DIAGONAL:
+        case CLBLAS_HEMM_DIAGONAL:
+            #ifdef DEBUG_GEMM_2
+            printf("GEMM2: setBuildOpts: Setting options for SYMM\n");
+            #endif
+            if (kargs->side == clblasLeft)
+            {
+                strcat(buildOptStr, " -D__SYMM_LEFT__ ");
+            }
+            else if (kargs->side == clblasRight)
+            {
+                strcat(buildOptStr, " -D__SYMM_RIGHT__ ");
+            }
+
+            if (kargs->uplo == clblasLower)
+            {
+                strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+            }
+            else if (kargs->uplo == clblasUpper)
+            {
+                strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+            }
+
+            strcat(buildOptStr,
+                   (kargs->order == clblasColumnMajor) ? " -D__SYMM_COLMAJOR__ "
+                                                       : " -D__SYMM_ROWMAJOR__ ");
+
+            if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) ||
+                (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
+            {
+                strcat(buildOptStr, " -D__SYMM_DIAGONAL__ ");
+            }
+            if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)
+            {
+                strcat(buildOptStr, " -D__HEMM__ ");
+            }
+            break;
+
+        case CLBLAS_HERK:
+            strcat(buildOptStr, " -DHERK");
+            if (kargs->uplo == clblasLower)
+            {
+                strcat(buildOptStr, " -DHERK_LOWER_TRIANGLE");
+            }
+            else if (kargs->uplo == clblasUpper)
+            {
+                strcat(buildOptStr, " -DHERK_UPPER_TRIANGLE");
+            }
+            break;
+
+        case CLBLAS_GEMM2:
+        case CLBLAS_GEMM_TAIL:
+            // plain GEMM needs nothing beyond the common options
+            break;
+
+        default:
+            printf("GEMM TAIL: Unknown pigFuncID\n");
+            break;
+    }
+
+    #ifdef DEBUG_GEMM_TAIL
+    printf("GEMMTAIL: Build options = %s\n", buildOptStr);
+    #endif
+}
+
+// Number of blocks of `block` items needed to cover `len` items.
+// Returns 0 for an empty range instead of letting (len - 1) wrap around.
+static size_t
+tailGroupsToCover(size_t len, size_t block)
+{
+    return (len == 0) ? 0 : ((len - 1) / block + 1);
+}
+
+/*
+ * Compute the global work size of the GEMM tail kernel (1D work space).
+ *
+ * The tail consists of up to two panels of the M x N result:
+ *
+ *           ______________
+ *          |           |  |
+ *          |           |  |
+ *          |           |  | B Tail panel
+ *          |___________|  |
+ *          |___________|__|
+ *          <---  A   -->
+ *
+ *  - the A tail panel: rows tailStartM..M-1, columns 0..tailStartN-1
+ *    (present when tailStartM != M),
+ *  - the B tail panel: all M rows, columns tailStartN..N-1
+ *    (present when tailStartN != N).
+ *
+ * Each work group processes a Y x X block, where Y is subdims->y and X is
+ * the work group size divided by Y. threads[0] receives the total number of
+ * work groups times the work group size, threads[1] is set to 1.
+ *
+ * threads is left untouched (after printing a message) when row major order
+ * or a vector length other than 1 is requested.
+ *
+ * NOTE(review): X is 0 when subdims->y exceeds the work group size, and the
+ * divisions below would then fault; callers are expected to pass a matching
+ * decomposition -- confirm.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra)
+{
+    int BLOCKSIZE = pgran->wgSize[0]; // 1D Block
+    size_t tailM, tailN, M, N;
+    size_t Y, X;
+    // zero-initialised so that the debug trace never prints indeterminate values
+    size_t nWorkGroupsAY = 0, nWorkGroupsAX = 0, nWorkGroupsA = 0;
+    size_t nWorkGroupsBY = 0, nWorkGroupsBX = 0, nWorkGroupsB = 0;
+    size_t totalWorkGroups;
+    #ifdef DEBUG_GEMM_TAIL
+    printf("calcNrThreads called from gemm_tail.cpp\n");
+    #endif
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    KernelExtraFlags kflags = kextra->flags;
+
+    //
+    // RowMajor GEMM can be expressed in terms of Column Major GEMM
+    //
+    if ((kflags & KEXTRA_COLUMN_MAJOR) == 0)
+    {
+        printf("calcNrThreads: FIXME: RowMajor is NOT supported \n");
+        return;
+    }
+
+    if (kextra->vecLenA != 1)
+    {
+        printf("GEMM_TAIL: calcNrThreads(): Vector Length must be 1 for TAIL. Non-one Vector Length Requested\n");
+        return;
+    }
+
+    tailM = kargs->tailStartM;
+    tailN = kargs->tailStartN;
+    M = kargs->M;
+    N = kargs->N;
+
+    //
+    // A Tail Workgroup will process YxX panel
+    //
+    Y = subdims->y;
+    X = BLOCKSIZE/Y;
+
+    if (tailM != M)
+    {
+        #ifdef DEBUG_GEMM_TAIL
+        printf("GEMM_TAIL: M has TAIL\n");
+        #endif
+        nWorkGroupsAY = tailGroupsToCover(M - tailM, Y);
+        // tailN may be 0 (no full column blocks): the A panel is then empty
+        nWorkGroupsAX = tailGroupsToCover(tailN, X);
+        nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX;
+    }
+
+    if (tailN != N)
+    {
+        #ifdef DEBUG_GEMM_TAIL
+        printf("GEMM_TAIL: N has TAIL\n");
+        #endif
+        nWorkGroupsBY = tailGroupsToCover(M, Y);
+        nWorkGroupsBX = tailGroupsToCover(N - tailN, X);
+        nWorkGroupsB = nWorkGroupsBY * nWorkGroupsBX;
+    }
+
+    totalWorkGroups = nWorkGroupsA + nWorkGroupsB;
+
+    threads[0] = totalWorkGroups * BLOCKSIZE;
+    threads[1] = 1;
+    #ifdef DEBUG_GEMM_TAIL
+    printf("GEMM_TAIL: calcNrThreads(): vlen:%d, <tailM:%lu, M:%lu>, <tailN:%lu, N:%lu, nWorkGroupsA<%lu,%lu>, nWorkGroupsB<%lu,%lu>\n",
+            kextra->vecLenA, tailM, M, tailN, N, nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsBY, nWorkGroupsBX);
+    printf("GEMM_TAIL: calcNrThreads(): globalThreads0=%lu, globalThreads1=%lu\n", threads[0], threads[1]);
+    #endif
+    return;
+}
+
+// Emits the column-major GEMM tail kernel source into buf; with buf == NULL it only reports the buffer size needed.
+static ssize_t generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    DataType dtype = kextra->dtype;
+    char tempTemplate[32*1024];
+    char itemx[10], itemy[10], width[10], itemy_by_width[10], itemx_by_width[10];
+    size_t Y, X, BLOCKSIZE, ITEMX, ITEMY;
+
+    if (buf == NULL)
+    {
+        // Size query: the kernel is always reported (and padded below) as a fixed 32KB buffer.
+        return (ssize_t)(32*1024*sizeof(char));
+    }
+
+    //
+    // PENDING: Add Support for Row Major at the xAPI.c level
+    // Row major calcs can be expressed in terms of column major
+    //
+    if ((kflags & KEXTRA_COLUMN_MAJOR) == 0)
+    {
+        return 0;
+    }
+
+    kprintf kobj(Prefix[dtype], 1, false, false); // Only Scalar Access
+
+    BLOCKSIZE = pgran->wgSize[0];
+    #ifdef DEBUG_GEMM_TAIL
+    printf("GEMM- generator(): Blocksize passed = %lu, subdimy = %lu, subdimx = %lu, veclen = %d \n", BLOCKSIZE, subdims->y, subdims->x, kextra->vecLenA);
+    #endif
+
+    //
+    // Y (substituted below as %WIDTH) always follows subdims->y. The old
+    // code seeded Y with 8 and then overwrote it whenever subdims->y
+    // differed, which amounts to the plain assignment below.
+    //
+    Y = subdims->y;
+    X = BLOCKSIZE/Y;
+    ITEMY = (subdims->y) / Y;
+    ITEMX = (subdims->x) / X;
+    if (ITEMX == 0)
+    {
+        ITEMX = 1;
+    }
+
+    if ((BLOCKSIZE % Y) || ((subdims->y) % Y) || ((subdims->x)%X) || (ITEMY % kextra->vecLenA) || ((X*ITEMX) % kextra->vecLenA))
+    {
+        printf("WARNING: GEMM TAIL - generator: subdim and blocksize in-compatible. This code should never execute!\n");
+    }
+
+    sprintf(width, "%lu", Y);
+    sprintf(itemy, "%lu", ITEMY);
+    sprintf(itemx, "%lu", ITEMX);
+    sprintf(itemy_by_width, "%lu", (size_t) ITEMY/kextra->vecLenA);
+    sprintf(itemx_by_width, "%lu", (size_t) ITEMX/kextra->vecLenA);
+
+    kobj.put("%WIDTH", width);
+    kobj.put("%ITEMX", itemx);
+    kobj.put("%ITEMY", itemy);
+    kobj.put("%ITEMY_BY_V", itemy_by_width);
+    kobj.put("%ITEMX_BY_V", itemx_by_width);
+    kobj.put("%PANEL", "1");
+    kobj.put("%PANEL_BY_V", "1");
+    #ifdef DEBUG_GEMM_TAIL
+    printf("ColMajor GEMM - WIDTH = %s, ITEMX = %s, ITEMY = %s\n", width, itemx, itemy);
+    #endif
+
+    strcpy(tempTemplate, SYMM_HEMM_HELPER);
+    if ((kflags & KEXTRA_TRANS_A) == 0)
+    {
+        if (kflags & KEXTRA_TRANS_B)
+        {
+            #ifdef DEBUG_GEMM_TAIL
+            printf("GEMM_TAIL: Using GEMM_NT_KERNEL\n");
+            #endif
+            strcat(tempTemplate, GEMM_NT_KERNEL);
+        } else {
+            #ifdef DEBUG_GEMM_TAIL
+            printf("GEMM_TAIL: Using GEMM_NN_KERNEL\n");
+            #endif
+            strcat(tempTemplate, GEMM_NN_KERNEL);
+        }
+    } else {
+        //
+        // NOTE: A^T * B Never leaves any tails. This should NEVER be called.
+        // PENDING: A^T * B^T support is PENDING
+        tempTemplate[0] = 0;
+    }
+
+    kobj.spit(buf, tempTemplate);
+    // spit() is given no length, so the expansion itself is not bounded here.
+    // Zero-fill the rest of the 32KB this function reports, but never write
+    // past the caller-supplied buflen.
+    size_t tail = strlen(buf) + 1;
+    while ((tail < 32*1024) && (tail < buflen))
+    {
+        buf[tail++] = 0;
+    }
+    return 32*1024*sizeof(char);
+}
+
+// Fills args[0..15] for the GEMM tail kernel from the CLBlasKargs passed in params; the third argument is unused.
+static void assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+
+    #ifdef DEBUG_GEMM_TAIL
+    printf("SAlpha=%f, DAlpha=%f, CAlpha =<%f, %f>, DAlpha=<%f, %f>\n",
+            kargs->alpha.argFloat, kargs->alpha.argDouble, CREAL(kargs->alpha.argFloatComplex), CIMAG(kargs->alpha.argFloatComplex),
+            CREAL(kargs->alpha.argDoubleComplex) , CIMAG(kargs->alpha.argDoubleComplex));
+    printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n",
+            kargs->beta.argFloat, kargs->beta.argDouble, CREAL(kargs->beta.argFloatComplex), CIMAG(kargs->beta.argFloatComplex),
+            CREAL(kargs->beta.argDoubleComplex) , CIMAG(kargs->beta.argDoubleComplex));
+    printf("TailStartM = %lu, TailStartN = %lu\n", kargs->tailStartM, kargs->tailStartN);
+    #endif
+
+    INIT_KARG(&args[0], kargs->A);   // matrix A
+    INIT_KARG(&args[1], kargs->B);   // matrix B
+    INIT_KARG(&args[2], kargs->C);   // matrix C
+    // problem sizes, leading dimensions and buffer offsets
+    initSizeKarg(&args[3], kargs->M);
+    initSizeKarg(&args[4], kargs->N);
+    initSizeKarg(&args[5], kargs->K);
+    initSizeKarg(&args[6], kargs->lda.matrix);
+    initSizeKarg(&args[7], kargs->ldb.matrix);
+    initSizeKarg(&args[8], kargs->ldc.matrix);
+    initSizeKarg(&args[9], kargs->offA);
+    initSizeKarg(&args[10], kargs->offBX);
+    initSizeKarg(&args[11], kargs->offCY);
+    assignScalarKarg(&args[12], &(kargs->alpha), kargs->dtype);
+    assignScalarKarg(&args[13], &(kargs->beta), kargs->dtype);
+    initSizeKarg(&args[14], kargs->tailStartM);
+    initSizeKarg(&args[15], kargs->tailStartN);
+}
+
+// The only solver flag reported for this pattern is SF_WSPACE_1D.
+static SolverFlags solverFlags(void)
+{
+    return SF_WSPACE_1D;
+}
+
+// Fills mempat with the GEMM tail pattern descriptor and sets the per-type kernel prefixes.
+extern "C" void
+initGemmV2TailCachedPattern(MemoryPattern *mempat)
+{
+    // Pattern extras for A and B: buffer objects at memory level L1.
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+    mempat->name = "Cached global memory based gemm tail";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &gemmSops;
+    mempat->extra = &mpatExtra;
+
+    // Type letters; generator() passes Prefix[dtype] to kprintf.
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+}
+
diff --git a/src/library/blas/gens/gemv.c b/src/library/blas/gens/gemv.c
new file mode 100644
index 0000000..40293d8
--- /dev/null
+++ b/src/library/blas/gens/gemv.c
@@ -0,0 +1,685 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * gemv generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+
+#include "blas_kgen.h"
+#include "xxmv_common.h"
+
+typedef struct {
+    size_t staggered; /* per-group k offset factor read by generator(); fixupArgs() currently always stores 0 */
+} MAY_ALIAS extraData_t;
+
+static const char *gemvDecl = /* printf-style kernel prototype, filled in by declareGemvKernel() */
+    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n" /* pgran->wgSize[0], pgran->wgSize[1] */
+    "void __kernel\n"
+    "%cgemv(\n" /* BLAS type prefix */
+    "    uint %c,\n" /* the two size names, 'M' and 'N', in the order chosen by the transposition flag */
+    "    uint %c,\n"
+    "    const %s alpha,\n"
+    "    const __global %s *restrict A,\n"
+    "    const __global %s *restrict X,\n"
+    "%s" /* beta declaration, empty when KEXTRA_BETA_ZERO is set */
+    "    __global %s *Y,\n"
+    "    uint lda"
+    "%s"    // offset A, X and Y
+    "%s" /* incx declaration, empty when KEXTRA_INCX_ONE is set */
+    "%s)\n"; /* incy declaration, empty when KEXTRA_INCY_ONE is set */
+
+static CLBLASMpatExtra mpatExtra; /* filled in by initGemvPattern() and published through mempat->extra */
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra); /* writes the gemv kernel source into buf */
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra); /* fills the kernel argument list */
+
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra); /* folds offsetM/offsetN into offA/offCY */
+
+static SolverFlags
+solverFlags(void); /* returns SF_WSPACE_1D */
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs); /* compares a local memory estimate with ldsSize */
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra); /* computes the global work size */
+
+static bool
+subgCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check); /* validates a decomposition, optionally filling pgran */
+
+static int
+subgGetDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void * pArgs); /* stores the default decomposition */
+
+static SolverOps gemvSops = { /* positional initializer: slot order is that of SolverOps, declared elsewhere */
+    generator,
+    assignKargs,
+    isFitToLDS,
+    NULL, /* NULL entries: slots this pattern leaves unset */
+    NULL,
+    calcNrThreads,
+    NULL,
+    solverFlags,
+    fixupArgs,
+    subgGetDefaultDecomp,//getDefaultDecomposition
+    subgCheckCalcDecomp, //get Decomp. list
+    NULL,
+    NULL
+};
+
+/*
+ * Format gemvDecl for the given type, work group size and the optional
+ * beta/offset/increment arguments in kflags, then declare it on ctx.
+ */
+static void
+declareGemvKernel(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const PGranularity *pgran,
+    KernelExtraFlags kflags)
+{
+    char sizeNames[2] = {'M', 'N'};
+    const char *incxDecl = (kflags & KEXTRA_INCX_ONE) ? "" : ",\n    const int incx";
+    const char *incyDecl = (kflags & KEXTRA_INCY_ONE) ? "" : ",\n    const int incy";
+    char offDecl[128];
+    char betaDecl[128];
+    char tmp[512];
+    bool tra = ((kflags & KEXTRA_TRANS_A) != 0);
+    const char *typeName = dtypeBuiltinType(dtype);
+    char fpref = dtypeToBlasPrefix(dtype);
+
+    offDecl[0] = '\0';
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        strcpy(offDecl, ",\n    const uint offA");
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        strcat(offDecl, ",\n    const uint offX");
+    }
+    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
+        strcat(offDecl, ",\n    const uint offY");
+    }
+
+    if (kflags & KEXTRA_BETA_ZERO) {
+        betaDecl[0] = '\0';
+    }
+    else {
+        sprintf(betaDecl, "    const %s beta,\n", typeName);
+    }
+    /* gemvDecl prints the work group size with %lu, so cast to match it. */
+    sprintf(tmp, gemvDecl, (unsigned long)pgran->wgSize[0],
+            (unsigned long)pgran->wgSize[1], fpref,
+            sizeNames[tra], sizeNames[1 - tra],
+            typeName, typeName, typeName, betaDecl, typeName,
+            offDecl, incxDecl, incyDecl);
+
+    kgenDeclareFunction(ctx, tmp);
+}
+
+/*
+ * Install handler as the post-fetch callback of mulOpts and reset the
+ * single private record handed to it for GEMV use.
+ */
+static void
+setFetchHandler(
+    TileMulOpts *mulOpts,
+    const BlasGenSettings *gset,
+    int handler(struct KgenContext *ctx, MatrixRole mrole, void *priv),
+    TilePostFetchPrivate *priv)
+{
+    /* exactly one private record is initialized */
+    priv->fetchNumA = 0;
+    priv->wholeA = 1;
+    priv->funcID = CLBLAS_GEMV;
+    priv->gset = gset;
+    priv->regName = NULL;
+
+    mulOpts->postFetch = handler;
+    mulOpts->postFetchPriv = priv;
+}
+
+/*
+ * Global memory based gemv kernel generator: emits the kernel into buf and
+ * returns the source size plus one or a negative error; ctx is always freed.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
+    //yes, KEXTRA_TAILS_K because it is set if N % bw != 0
+    bool tailN = ((kflags & KEXTRA_TAILS_K) != 0);
+    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
+    char tmp[4096];
+    DataType dtype = kextra->dtype;
+    BlasGenSettings gset;
+    TileMulOpts mulOpts;
+    KernelVarNames *vnames = &gset.varNames;
+    ssize_t ret;
+    TilePostFetchPrivate pfPriv;
+    unsigned int vecLen = kextra->vecLen;
+    const char *outTypeName;
+    const char *gid = "get_group_id(0)";
+    const char *lid = "get_local_id(0)";
+    const char *typeName = dtypeBuiltinType(dtype);
+    size_t wgSize;
+    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8;
+    unsigned int cLocal;
+    unsigned int nPlans;
+
+    memset(&gset, 0, sizeof(gset));
+    memset(&mulOpts, 0, sizeof(mulOpts));
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    // at first, generate needed declarations
+    kgenDeclareUptrs(ctx, isDoubleBasedType(dtype));
+
+    // now, generate the kernel
+    declareGemvKernel(ctx, dtype, pgran, kflags);
+    ret = kgenBeginFuncBody(ctx);
+    kgenAddStmt(ctx, "// M always denotes length of Y "
+                     "and N denotes length of X in the kernel\n");
+    /* 1D work space. Matrix is divided among wi, each calculates it's own
+     * part of vector y */
+
+    wgSize = (subdims[0].y / subdims[1].y) *
+            (subdims[0].bwidth / subdims[1].bwidth);
+    assert(pgran->wgSize[0] == wgSize);
+    assert(subdims[0].x == 1);
+    assert(subdims[1].x == 1);
+    cLocal = wgSize/bStep;
+
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.subdims[0].itemX = gset.subdims[0].x = 1;
+    gset.subdims[1].itemX = gset.subdims[1].x = 1;
+    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
+
+    gset.pgran = pgran;
+    gset.kextra = kextra;
+    gset.flags = BGF_UPTRS;
+
+    initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
+    if (isComplexType(dtype)) {
+         gset.tileCY.vecLen = 1;
+    }
+    declareTileStorages(ctx, &gset);
+    genZeroTile(ctx, &gset.tileCY);
+    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
+    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;
+
+    sprintf(tmp, "__local %s localRes[%u][%u];\n",
+                outTypeName, pgran->wgSize[0], nPlans);
+    kgenAddStmt(ctx, tmp);
+    sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
+                 gid, bStep, lid, bStep, subdims[1].y);
+    kgenAddStmt(ctx, tmp);
+    sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n", lid, bStep, subdims[1].bwidth);
+    kgenAddStmt(ctx, tmp);
+
+    kgenAddBlankLine(ctx);
+
+    kgenBeginBranch(ctx,"if (coordA < M && k0 < N)");
+
+    genIncPointers(ctx, kflags);
+    sprintf(tmp,
+            "const GPtr Ag = {(__global %s*)A};\n"
+            "const GPtr Xg = {(__global %s*)X};\n",
+            typeName, typeName);
+    kgenAddStmt(ctx, tmp);
+
+    kgenAddBlankLine(ctx);
+
+    if (tailN) {
+        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
+        kgenAddStmt(ctx, tmp);
+        kgenAddStmt(ctx, "N -= Ntail;\n");
+        kgenAddBlankLine(ctx);
+    }
+
+    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
+    if (tailM) {
+        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
+    }
+
+    vnames->A = "Ag";
+    vnames->B = "Xg";
+    vnames->coordA = "coordA";
+    vnames->coordB = ""; //should not be used for vector
+    vnames->k = "k";
+    vnames->lda = "lda";
+    vnames->sizeK = "N";
+    vnames->sizeM = "M";
+
+    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
+    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
+        mulOpts.flags |= TILEMUL_CONJA;
+    }
+    if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
+        mulOpts.flags |= TILEMUL_TRA;
+    }
+    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
+        mulOpts.core = TILEMUL_MAD;
+    }
+    else {
+        mulOpts.core = TILEMUL_MULADD;
+    }
+    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
+    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
+
+    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
+        gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
+        mulOpts.flags |= TILEMUL_BW_STRIDE;
+    }
+
+    sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth);
+    kgenBeginBranch(ctx, tmp);
+
+    if (staggered) {
+        vnames->k = "k1";
+        sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
+            mulOpts.flags, kflags);
+
+    ret = tileMulGen(ctx, &gset, &mulOpts);
+    if (ret != 0) {
+        destroyKgenContext(ctx); /* do not leak the context on failure */
+        return ret;
+    }
+    vnames->k = "k";
+    kgenEndBranch(ctx, NULL); /* k loop */
+
+    if (tailN) {
+        /* Handle tail along vector X */
+        kgenAddStmt(ctx, "N += Ntail;\n");
+        kgenBeginBranch(ctx, "if (k < N)");
+
+        mulOpts.flags |= TILEMUL_SKEW_B;
+        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
+                  mulOpts.flags, kflags);
+        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL;
+        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
+        ret = tileMulGen(ctx, &gset, &mulOpts);
+        if (ret != 0) {
+            destroyKgenContext(ctx); /* do not leak the context on failure */
+            return ret;
+        }
+        kgenEndBranch(ctx, NULL);
+    }
+
+    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
+        gset.subdims[0].bwidth = subdims[1].bwidth;
+        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
+    }
+
+    kgenEndBranch(ctx,NULL);
+
+    genStoreLocalResult(ctx, &gset.tileCY, lid);
+
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenAddBlankLine(ctx);
+
+    sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
+    kgenBeginBranch(ctx, tmp);
+
+    genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);
+
+    /* write back the results */
+    /* y := alpha*A*x + beta*y */
+    setResultPos(ctx, kflags, vnames->coordA);
+
+    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);
+
+    kgenEndBranch(ctx, NULL);
+
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+/* Fill the gemv kernel arguments; beta, offsets and increments are passed only when kflags requires them. */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *ka = (const CLBlasKargs*)params;
+    const KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
+    cl_int stride;
+    int pos = 5;
+
+    initSizeKarg(&args[0], ka->M);
+    initSizeKarg(&args[1], ka->N);
+    assignScalarKarg(&args[2], &(ka->alpha), ka->dtype);
+    INIT_KARG(&args[3], ka->A);
+    INIT_KARG(&args[4], ka->B);
+    if ((kflags & KEXTRA_BETA_ZERO) == 0) {
+        assignScalarKarg(&args[pos++], &(ka->beta), ka->dtype);
+    }
+    INIT_KARG(&args[pos], ka->C);
+    pos++;
+    initSizeKarg(&args[pos++], ka->lda.matrix);
+    if ((kflags & KEXTRA_A_OFF_NOT_ZERO) != 0) {
+        initSizeKarg(&args[pos++], ka->offA);
+    }
+    if ((kflags & KEXTRA_BX_OFF_NOT_ZERO) != 0) {
+        initSizeKarg(&args[pos++], ka->offBX);
+    }
+    if ((kflags & KEXTRA_CY_OFF_NOT_ZERO) != 0) {
+        initSizeKarg(&args[pos++], ka->offCY);
+    }
+    if ((kflags & KEXTRA_INCX_ONE) == 0) {
+        stride = ka->ldb.vector;
+        INIT_KARG(&args[pos], stride);
+        pos++;
+    }
+    if ((kflags & KEXTRA_INCY_ONE) == 0) {
+        stride = ka->ldc.vector;
+        INIT_KARG(&args[pos], stride);
+        pos++;
+    }
+}
+
+/*
+ * Adjust the kernel arguments before launch: store the (currently always
+ * disabled) staggered setting and fold offsetM/offsetN into offA and offCY.
+ */
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)args;
+    KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags;
+    extraData_t *priv = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv;
+
+    const size_t nChans = 8; // !!!DEVICE DEPENDENT!!!
+    const size_t wideChans = 64; // !!!DEVICE DEPENDENT!!!
+    const size_t sizeType[] = {1,2,2,4};
+    const size_t sizeBlock = wideChans * nChans / sizeType[kargs->dtype];
+    size_t shift;
+
+    (void)subdims;
+
+    if ((kargs->K % sizeBlock == 0) &&
+        !isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
+        /*
+         * FIXME: staggered access is not enabled now since for some reason
+         *        it leads to slowdown at small sizes
+         */
+        priv->staggered = 0; // wideChans / sizeType[kargs->dtype];
+    }
+    else {
+        priv->staggered = 0;
+    }
+
+    shift = (kargs->offsetM) ? kargs->offsetM : kargs->offsetN;
+    if (shift != 0) {
+        kargs->offA += isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A) ?
+            shift : shift * kargs->lda.matrix;
+        if (kargs->ldc.vector < 0) {
+            // K stores the original height of the matrix A
+            kargs->offCY += (kargs->K - shift) * abs(kargs->ldc.vector);
+        }
+        else {
+            kargs->offCY += shift * kargs->ldc.vector;
+        }
+    }
+
+    kargs->offsetM = kargs->offsetN = 0;
+}
+
+/* Default decomposition: 64x1 work group, 4x4 subgroup tiles, group block 8 times larger in y and bwidth. */
+static int
+subgGetDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void * pArgs)
+{
+    (void)subdimsNum;
+    DUMMY_ARG_USAGE(pArgs);
+
+    subdims[1].x = subdims[1].itemX = 1;
+    subdims[1].y = subdims[1].itemY = 4;
+    subdims[1].bwidth = 4;
+
+    subdims[0].x = subdims[0].itemX = 1;
+    subdims[0].y = subdims[0].itemY = 8 * subdims[1].y;
+    subdims[0].bwidth = 8 * subdims[1].bwidth;
+
+    pgran->wgSize[0] = 64;
+    pgran->wgSize[1] = 1;
+    pgran->wgDim = 1;
+    return 0;
+}
+
+/*
+ * Check whether the local memory estimated for the decomposition fits
+ * into ldsSize.
+ *
+ * One needs y1 * wgSize elements of local memory, but y1 is not
+ * calculated yet. The estimate used here is reliably larger: it exceeds
+ * the real need dims[1].bwidth times.
+ * kernelArgs is not used.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const cl_ulong estimate =
+        dim[0].y * dim[0].bwidth * dtypeSize(dtype);
+
+    (void)kernelArgs;
+
+    return (estimate <= ldsSize);
+}
+
+/* Compute the 1D global work size covering the Y vector; threads[1] is set to 0. */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra)
+{
+    const CLBlasKargs *kargs = args;
+    const unsigned int itemsPerBlock =
+        pgran->wgSize[0] / (subdims[0].bwidth / subdims[1].bwidth);
+    /* Length of "Y" vector */
+    size_t yLen = (kargs->transA == clblasNoTrans) ? kargs->M : kargs->N;
+
+    (void)extra;
+
+    if (yLen == 0) {
+        //launch one group to avoid CL_INVALID_WORK_GROUP_SIZE error
+        yLen = 1;
+    }
+
+    //each work item handles y1 lines
+    threads[0] = divRoundUp(yLen, subdims[1].y) * itemsPerBlock;
+    threads[0] = roundUp(threads[0], pgran->wgSize[0]);
+    threads[1] = 0;
+}
+
+/* The only solver flag reported for the gemv pattern is SF_WSPACE_1D. */
+static SolverFlags solverFlags(void)
+{
+    return SF_WSPACE_1D;
+}
+
+/*
+ * Validate a two-level gemv decomposition.
+ *
+ * subdims[0] is the work group block and subdims[1] the subgroup block
+ * (naming follows the comments below). When check equals PGRAN_CHECK the
+ * passed pgran is verified as well; otherwise pgran is filled with the
+ * one-dimensional 64x1 work group.
+ *
+ * Returns true when every constraint holds; false otherwise, including
+ * when pgran or subdims is NULL or fewer than two levels are passed.
+ */
+static bool
+subgCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check)
+{
+    const unsigned int subLimitDiv = dtypeSize(dtype)/sizeof(cl_float);
+    const unsigned int grpLimitDiv = 2-!isComplexType(dtype);
+    const SubproblemDim *grp;
+    const SubproblemDim *sub;
+
+    //EINVAL
+    if ((subdimsNum < 2) || (NULL == pgran) || (NULL == subdims)) {
+        return false;
+    }
+    grp = &subdims[0];
+    sub = &subdims[1];
+
+    // every size must be non-zero
+    if (0 == grp->x ||
+        0 == grp->y ||
+        0 == grp->bwidth ||
+        0 == sub->x ||
+        0 == sub->y ||
+        0 == sub->bwidth) {
+        return false;
+    }
+
+    // the subgroup block and its per-item sizes must coincide
+    if (sub->x != sub->itemX ||
+        sub->y != sub->itemY) {
+        return false;
+    }
+
+    // the group block must consist of integer number of subgroup blocks
+    if (grp->x % sub->x ||
+        grp->y % sub->y ||
+        grp->bwidth % sub->bwidth) {
+        return false;
+    }
+
+    //check fitting of bw to common vector sizes
+    if (isComplexType(dtype) && (2*sub->bwidth > 32)) {
+        return false;
+    }
+
+    // check dimensions
+    if (sub->bwidth > 16 / subLimitDiv ||
+        sub->x > 1 ||
+        sub->y > 16 / subLimitDiv) {
+        return false;
+    }
+
+    if (grp->bwidth > 256 / grpLimitDiv ||
+        grp->x > 1 ||
+        grp->y > 256 / grpLimitDiv) {
+        return false;
+    }
+
+    // the group block must hold exactly 64 subgroup blocks
+    if (64 != (grp->y / sub->y) *
+        (grp->bwidth / sub->bwidth)) {
+        return false;
+    }
+
+    if (PGRAN_CHECK == check) {
+        // passed PGranularity should be checked
+        return (pgran->wgSize[0] * pgran->wgSize[1] == 64);
+    }
+
+    // PGranularity should be calculated
+    pgran->wgDim = 1;
+    pgran->wgSize[1] = 1;
+    pgran->wgSize[0] = 64;
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+
+/* Fill mempat with the descriptor of the cached global memory gemv pattern. */
+void
+initGemvPattern(MemoryPattern *mempat)
+{
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+    mempat->name = "Cached global memory based block gemv";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &gemvSops;
+    mempat->extra = &mpatExtra;
+}
diff --git a/src/library/blas/gens/gen_helper.c b/src/library/blas/gens/gen_helper.c
new file mode 100644
index 0000000..bec622c
--- /dev/null
+++ b/src/library/blas/gens/gen_helper.c
@@ -0,0 +1,551 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include "gen_helper.h"
+#include "clblas_stddef.h"
+
+#define IDX_INVAL ((unsigned int)-1)
+
+typedef struct CopyPattern { /* pattern handed to the kgen guard and to cpyGenCallback() */
+    SubproblemDim dim; /* block size; cpyGenCallback() passes NULL instead when generic is set */
+    const PGranularity *pgran; /* forwarded to copyDataBlockGen() */
+    DataType dtype; /* forwarded to copyDataBlockGen() */
+    DBlockCopyDirection dir; /* copy direction, set by the caller before initCopyPattern() */
+    DBlockCopyFlags flags; /* DBLOCK_COPY_* flags computed by initCopyPattern() */
+    bool generic; /* true when initCopyPattern() was given no block dimensions */
+    bool zeroing; /* NOTE(review): not referenced in the visible part of this file */
+} CopyPattern;
+
+/* Exchange the x and y sizes of dim. */
+static __inline void
+dimSwapXY(SubproblemDim *dim)
+{
+    const size_t oldY = dim->y;
+    dim->y = dim->x;
+    dim->x = oldY;
+}
+
+/*
+ * Initialize a dimension structure with the given x and y when
+ * checkedFlag is set in flags; otherwise mark both of its sizes
+ * as unused.
+ */
+static void
+checkInitSubdim(
+    SubproblemDim *dim,
+    unsigned int flags,
+    unsigned int checkedFlag,
+    size_t x,
+    size_t y)
+{
+    if (!(flags & checkedFlag)) {
+        dim->x = SUBDIM_UNUSED;
+        dim->y = SUBDIM_UNUSED;
+        return;
+    }
+
+    dim->x = x;
+    dim->y = y;
+}
+
+/*
+ * Look for an earlier entry (index below idx) with the same x and y as
+ * dim[idx]. Returns its index, or IDX_INVAL when there is none.
+ */
+static int
+lookupDim(
+    const SubproblemDim *dim,
+    unsigned int idx)
+{
+    const SubproblemDim *wanted = &dim[idx];
+    unsigned int pos = 0;
+
+    while (pos < idx) {
+        if (dim[pos].x == wanted->x && dim[pos].y == wanted->y) {
+            return pos;
+        }
+        pos++;
+    }
+    return IDX_INVAL;
+}
+
+/* Guard callback: generate the block copy function described by pattern. */
+static int
+cpyGenCallback(struct KgenContext *ctx, const void *pattern)
+{
+    const CopyPattern *cp = (CopyPattern*)pattern;
+
+    return copyDataBlockGen(ctx, cp->generic ? NULL : &cp->dim, cp->pgran,
+                            cp->dtype, cp->dir, cp->flags);
+}
+
+/*
+ * Prepare pattern for the copy function serving matrix mrole of funcID.
+ *
+ * pattern->dir must already be set by the caller; pattern->flags is
+ * rebuilt from scratch. A NULL blasDim requests the generic variant with
+ * zeroed dimensions; otherwise the block size is taken from blasDim
+ * according to the matrix role. flags are the kernel extra flags that
+ * control vectorization, transposition and conjugation.
+ */
+static void
+initCopyPattern(
+    CopyPattern *pattern,
+    const SubproblemDim *blasDim,
+    KernelExtraFlags flags,
+    MatrixRole mrole,
+    BlasFunctionID funcID)
+{
+    SubproblemDim *dim = &pattern->dim;
+    const bool fixedSize = (blasDim != NULL);
+    unsigned int vecFlag = 0;
+
+    pattern->flags = 0;
+    pattern->generic = !fixedSize;
+    if (!fixedSize) {
+        dim->x = 0;
+        dim->y = 0;
+    }
+
+    /* per role: block size (fixed variant only) and the "no vectorize" flag */
+    switch (mrole) {
+    case MATRIX_A:
+        if (fixedSize) {
+            dim->x = blasDim->bwidth;
+            dim->y = blasDim->y;
+        }
+        vecFlag = KEXTRA_NO_COPY_VEC_A;
+        break;
+    case MATRIX_B:
+        if (fixedSize) {
+            dim->x = blasDim->bwidth;
+            dim->y = blasDim->x;
+        }
+        vecFlag = KEXTRA_NO_COPY_VEC_B;
+        break;
+    case MATRIX_C:
+        if (fixedSize) {
+            dim->x = blasDim->x;
+            dim->y = blasDim->y;
+        }
+        /* for TRMM and TRSM the B flag governs matrix C as well */
+        vecFlag = ((funcID == CLBLAS_TRMM) || (funcID == CLBLAS_TRSM)) ?
+            KEXTRA_NO_COPY_VEC_B : KEXTRA_NO_COPY_VEC_C;
+        break;
+    default:
+        break;
+    }
+
+    if (flags & vecFlag) {
+        pattern->flags |= DBLOCK_COPY_NOT_VECTORIZE;
+    }
+
+    if (isMatrixAccessColMaj(funcID, flags, mrole)) {
+        /* only fixed-size global-to-local patterns get x and y swapped */
+        if (fixedSize && (pattern->dir == DBLOCK_GLOBAL_TO_LOCAL)) {
+            dimSwapXY(dim);
+        }
+        pattern->flags |= DBLOCK_COPY_TRANSPOSE;
+    }
+    if (isMatrixConj(flags, mrole)) {
+        pattern->flags |= DBLOCK_COPY_CONJUGATE;
+    }
+}
+
+/*
+ * Generate the copy functions selected by flags (fixed-size first, then the
+ * generic ones needed for tails) and record their names in funcNames.
+ * Returns 0, -ENOMEM, or the first findGenerateFunction() failure.
+ */
+int
+generateBufCopyFuncs(
+    CopyBufFuncs *funcNames,
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    BufCopyHelperFlags flags)
+{
+    CopyPattern pattern;
+    struct KgenGuard *guard;
+    int ret = 0;
+    MatrixRole mrole;
+    bool needed[MATRIX_ROLES_NUMBER];
+    KernelExtraFlags kgenFlags = gset->kextra->flags;
+    DataType dtype = gset->kextra->dtype;
+    const SubproblemDim *blasDim = gset->subdims;
+    const PGranularity *pgran = gset->pgran;
+    bool outputTails = ((kgenFlags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) != 0);
+
+    guard = createKgenGuard(ctx, cpyGenCallback, sizeof(CopyPattern));
+    if (guard == NULL) {
+        return -ENOMEM;
+    }
+
+    memset(&pattern, 0, sizeof(pattern));
+    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
+    pattern.dtype = dtype;
+    pattern.pgran = pgran;
+
+    needed[MATRIX_A] = ((flags & BCHF_MATRIX_A) != 0);
+    needed[MATRIX_B] = ((flags & BCHF_MATRIX_B) != 0);
+    needed[MATRIX_C] = ((flags & BCHF_READ_OUTPUT) != 0);
+
+    /* stop at the first reader that fails instead of ignoring its status */
+    for (mrole = MATRIX_A; (mrole <= MATRIX_C) && !ret; mrole++) {
+        if (!needed[mrole]) {
+            continue;
+        }
+        initCopyPattern(&pattern, blasDim, kgenFlags, mrole, funcID);
+        ret = findGenerateFunction(guard, &pattern, funcNames->read[mrole],
+                                   FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    if (!ret && (flags & BCHF_WRITE_OUTPUT)) {
+        if (flags & BCHF_IMAGE_WRITE) {
+            pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
+            initCopyPattern(&pattern, NULL, kgenFlags, MATRIX_A, funcID);
+            pattern.flags &= ~DBLOCK_COPY_TRANSPOSE;
+        }
+        else {
+            pattern.dir = DBLOCK_LOCAL_TO_GLOBAL;
+            initCopyPattern(&pattern, blasDim, kgenFlags, MATRIX_C, funcID);
+        }
+        ret = findGenerateFunction(guard, &pattern, funcNames->write,
+                                   FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    if (ret) {
+        destroyKgenGuard(guard);
+        return ret;
+    }
+
+    // reevaluate needed flags: generic readers are only required with tails
+    needed[MATRIX_A] = needed[MATRIX_A] &&
+        (kgenFlags & (KEXTRA_TAILS_M | KEXTRA_TAILS_K));
+    needed[MATRIX_B] = needed[MATRIX_B] &&
+        (kgenFlags & (KEXTRA_TAILS_N | KEXTRA_TAILS_K));
+    needed[MATRIX_C] = needed[MATRIX_C] && outputTails;
+
+    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
+    for (mrole = MATRIX_A; (mrole <= MATRIX_C) && !ret; mrole++) {
+        if (!needed[mrole]) {
+            continue;
+        }
+        initCopyPattern(&pattern, NULL, kgenFlags, mrole, funcID);
+        ret = findGenerateFunction(guard, &pattern,
+                                   funcNames->readGeneric[mrole], FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    if (!ret && (flags & BCHF_WRITE_OUTPUT) && outputTails) {
+        if (flags & BCHF_IMAGE_WRITE) {
+            pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
+            initCopyPattern(&pattern, NULL, kgenFlags, MATRIX_A, funcID);
+            pattern.flags &= ~DBLOCK_COPY_TRANSPOSE;
+        }
+        else {
+            pattern.dir = DBLOCK_LOCAL_TO_GLOBAL;
+            initCopyPattern(&pattern, NULL, kgenFlags, MATRIX_C, funcID);
+        }
+        ret = findGenerateFunction(guard, &pattern, funcNames->writeGeneric,
+                                   FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    destroyKgenGuard(guard);
+    return ret;
+}
+
+/*
+ * Generate the functions zeroing local buffers and record their names.
+ *
+ * One sub-dimension is prepared per matrix role through checkInitSubdim();
+ * roles whose dimension stays SUBDIM_UNUSED are skipped. When lookupDim()
+ * reports a matching earlier role, that role's function name is copied
+ * instead of generating a duplicate function.
+ *
+ * Returns 0 on success, otherwise the first non-zero result of
+ * f4zeroBlockGen(); generation stops at that point.
+ */
+int
+generateZeroingFuncs(
+    ZeroFuncs *funcNames,
+    struct KgenContext *ctx,
+    const SubproblemDim *blasDim,
+    const PGranularity *pgran,
+    DataType dtype,
+    ZeroGenHelperFlags flags)
+{
+    SubproblemDim dim[MATRIX_ROLES_NUMBER];
+    size_t elemSize, rowVecs;
+    unsigned int role;
+    int ret = 0;
+
+    elemSize = dtypeSize(dtype);
+
+    // A and B share the row width derived from the block width
+    rowVecs = fl4RowWidth(blasDim->bwidth, elemSize);
+    checkInitSubdim(&dim[MATRIX_A], flags, ZF_MATRIX_A, rowVecs * blasDim->y, 1);
+    checkInitSubdim(&dim[MATRIX_B], flags, ZF_MATRIX_B, rowVecs * blasDim->x, 1);
+
+    // C uses the row width derived from the 'x' dimension
+    rowVecs = fl4RowWidth(blasDim->x, elemSize);
+    checkInitSubdim(&dim[MATRIX_C], flags, ZF_MATRIX_C, rowVecs * blasDim->y, 1);
+
+    for (role = 0; role < MATRIX_ROLES_NUMBER; role++) {
+        unsigned int same;
+
+        if (dim[role].x == SUBDIM_UNUSED) {
+            continue;
+        }
+
+        // reuse a function that was already generated, if there is one
+        same = lookupDim(dim, role);
+        if (same != IDX_INVAL) {
+            strcpy(funcNames->names[role], funcNames->names[same]);
+            continue;
+        }
+
+        ret = f4zeroBlockGen(ctx, &dim[role], pgran, "__local");
+        if (!ret) {
+            kgenGetLastFuncName(funcNames->names[role], FUNC_NAME_MAXLEN,
+                                ctx);
+        }
+        kgenAddBlankLine(ctx);
+
+        if (ret) {
+            break;
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Derive result update flags from the kernel extra flags of 'funcID':
+ *  - UPRES_WITH_BETA when the function has beta and KEXTRA_BETA_ZERO is clear;
+ *  - UPRES_COLUMN_MAJOR when isMatrixAccessColMaj() holds for MATRIX_C;
+ *  - UPRES_NO_VECTORIZATION when KEXTRA_NO_COPY_VEC_C is set.
+ */
+UpdateResultFlags
+kextraToUpresFlags(BlasFunctionID funcID, KernelExtraFlags kflags)
+{
+    UpdateResultFlags upFlags = 0;
+    const int betaIsZero = ((kflags & KEXTRA_BETA_ZERO) != 0);
+    const int noVecC = ((kflags & KEXTRA_NO_COPY_VEC_C) != 0);
+
+    if (funcHasBeta(funcID) && !betaIsZero) {
+        upFlags |= UPRES_WITH_BETA;
+    }
+
+    if (isMatrixAccessColMaj(funcID, kflags, MATRIX_C)) {
+        upFlags |= UPRES_COLUMN_MAJOR;
+    }
+
+    if (noVecC) {
+        upFlags |= UPRES_NO_VECTORIZATION;
+    }
+
+    return upFlags;
+}
+
+/*
+ * Generate the result update using flags derived from the kernel extra
+ * flags stored in 'gset' (see kextraToUpresFlags()); no cached name is
+ * passed on. Returns the result of genResultUpdateWithFlags().
+ */
+int
+generateResultUpdate(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    const char *optFuncName,
+    const char *genericFuncName)
+{
+    return genResultUpdateWithFlags(
+        ctx, funcID, gset,
+        kextraToUpresFlags(funcID, gset->kextra->flags),
+        optFuncName, genericFuncName, NULL);
+}
+
+/*
+ * Generate the code updating the output matrix with a computed tile.
+ *
+ * @ctx: generator context
+ * @funcID: BLAS function ID; triangular functions update "B" (or "C" when
+ *          UPRES_TRIANG_WRITE_C is set) with leading dimension "ldb",
+ *          all others update "C" with "ldc"
+ * @gset: generator settings; kernel extra flags, variable names and the
+ *        second level subproblem dimensions are taken from it
+ * @flags: update flags
+ * @optFuncName: name of the optimized update function; if NULL the update
+ *               is generated inline
+ * @genericFuncName: name of the generic update function, invoked for
+ *                   incomplete tiles when lower level tails are present.
+ *                   NOTE(review): it is formatted with "%s" in that case,
+ *                   so it looks like it must not be NULL then - confirm
+ *                   against callers
+ * @cachedName: stored to the 'cachedName' field of the update variables
+ *
+ * Returns 0 on success and -EOVERFLOW if any generation step reported
+ * a failure. A failure of an earlier step is never masked by a later
+ * successful one.
+ */
+int
+genResultUpdateWithFlags(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    UpdateResultFlags flags,
+    const char *optFuncName,
+    const char *genericFuncName,
+    const char *cachedName)
+{
+    KernelExtraFlags kflags = gset->kextra->flags;
+    UpdateResultOp op;
+    char tmp[1024];
+    int ret = 0;
+    int rc;
+    const char *coordY, *coordX;
+    UpresVarNames uvars;
+    const KernelVarNames *kvarNames = &gset->varNames;
+    const SubproblemDim *dim = &gset->subdims[1];
+    // converted explicitly so that the arguments match the "%lu" conversions
+    const unsigned long tileY = (unsigned long)dim->y;
+    const unsigned long tileX = (unsigned long)dim->x;
+    bool areTails, useCondition;
+
+    memset(&uvars, 0, sizeof(uvars));
+
+    coordX = kvarNames->coordB;
+    coordY = kvarNames->coordA;
+
+    if (funcHasTriangMatrix(funcID)) {
+        if (flags & UPRES_TRIANG_WRITE_C) {
+            uvars.result = "C";
+        }
+        else {
+            uvars.result = "B";
+        }
+        uvars.ld = "ldb";
+    }
+    else {
+        uvars.result = "C";
+        uvars.ld = "ldc";
+    }
+
+    uvars.cachedName = cachedName;
+
+    /* For now, kernels that do not use UPRES_EXCEED_PROBLEM_CONDITION
+     * must return in case problem exceeds more precise lower level conditions
+     * (KEXTRA_TAILS_M_LOWER, KEXTRA_TAILS_N_LOWER) before updating result
+    */
+    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) != 0;
+    useCondition = areTails && ((flags & UPRES_EXCEED_PROBLEM_CONDITION) != 0);
+    if (useCondition) {
+        bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
+        bool tailN = (kflags & KEXTRA_TAILS_N) != 0;
+
+        if (tailM) {
+            if (tailN) {
+                sprintf(tmp, "if ((%s < %s) && (%s < %s))",
+                        coordY, kvarNames->sizeM, coordX, kvarNames->sizeN);
+            }
+            else {
+                sprintf(tmp, "if (%s < %s)", coordY, kvarNames->sizeM);
+            }
+        }
+        else {
+            // here tailN is true
+            sprintf(tmp, "if (%s < %s)", coordX, kvarNames->sizeN);
+        }
+        kgenBeginBranch(ctx, tmp);
+    }
+    else {
+        kgenAddBlankLine(ctx);
+    }
+
+    if (optFuncName) {
+        const char *betaStr;
+        betaStr = (flags & UPRES_WITH_BETA) ? ", beta" : "";
+
+        // update with functions invoking
+        if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) {
+            sprintf(tmp, "%s(%s, c, alpha, %s, %s, %s%s);\n",
+                    optFuncName, uvars.result, coordY, coordX,
+                    uvars.ld, betaStr);
+        }
+        else {
+            // full tiles go to the optimized function, the rest to the generic
+            sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
+                         "uint x = min(%luu, %s - (uint)%s);\n"
+
+                         "if ((y == %lu) && (x == %lu)) {\n"
+                         "    %s(%s, c, alpha, %s, %s, %s%s);\n"
+                         "}\n"
+                         "else {\n"
+                         "    %s(%s, c, alpha, %s, %s, %s%s, y, x);\n"
+                         "}\n",
+                     tileY, kvarNames->sizeM, coordY,
+                     tileX, kvarNames->sizeN, coordX,
+                     tileY, tileX,
+                     optFuncName, uvars.result, coordY, coordX, uvars.ld,
+                     betaStr,
+                     genericFuncName, uvars.result, coordY, coordX, uvars.ld,
+                     betaStr);
+        }
+
+        kgenAddStmt(ctx, tmp);
+    }
+    else {
+        // inline result update
+        flags |= UPRES_INLINE;
+
+        op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;
+
+        uvars.startRow = coordY;
+        uvars.startCol = coordX;
+        uvars.nrRows = "y";
+        uvars.nrCols = "x";
+
+        if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) {
+            ret = updateResultGen(ctx,
+                gset,
+                funcID,
+                op,
+                flags,
+                &uvars);
+        }
+        else {
+            sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
+                         "uint x = min(%luu, %s - (uint)%s);\n",
+                    tileY, kvarNames->sizeM, coordY,
+                    tileX, kvarNames->sizeN, coordX);
+            kgenAddStmt(ctx, tmp);
+
+            sprintf(tmp, "if ((y == %lu) && (x == %lu))",
+                    tileY, tileX);
+            kgenBeginBranch(ctx, tmp);
+
+            // optimized update
+            rc = updateResultGen(ctx,
+                gset,
+                funcID,
+                op,
+                flags,
+                &uvars);
+            if (rc) {
+                ret = rc;
+            }
+
+            rc = kgenEndBranch(ctx, NULL);
+            if (rc) {
+                ret = rc;
+            }
+
+            kgenBeginBranch(ctx, "else ");
+
+            // not optimized update
+            flags |= UPRES_GENERIC;
+            rc = updateResultGen(ctx,
+                gset,
+                funcID,
+                op,
+                flags,
+                &uvars);
+            if (rc) {
+                ret = rc;
+            }
+
+            rc = kgenEndBranch(ctx, NULL);
+            if (rc) {
+                ret = rc;
+            }
+        }
+    }
+
+    if (useCondition) {
+        // close the branch but keep any failure recorded above
+        rc = kgenEndBranch(ctx, NULL);
+        if (rc) {
+            ret = rc;
+        }
+    }
+
+    return (ret) ? -EOVERFLOW : 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Open a branch checking that 'coord' lies within the matrix, but only for
+ * the dimensions having tails (KEXTRA_TAILS_M and/or KEXTRA_TAILS_N).
+ * Nothing is generated when neither flag is set.
+ */
+void checkGenBeginHitMatrixBlock(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags)
+{
+    const char *cond = NULL;
+    bool tailsM = (kflags & KEXTRA_TAILS_M) != 0;
+    bool tailsN = (kflags & KEXTRA_TAILS_N) != 0;
+
+    if (tailsM && tailsN) {
+        cond = "if ((coord.x < N) && (coord.y < M))";
+    }
+    else if (tailsM) {
+        cond = "if (coord.y < M)";
+    }
+    else if (tailsN) {
+        cond = "if (coord.x < N)";
+    }
+
+    if (cond != NULL) {
+        kgenBeginBranch(ctx, cond);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Close the branch opened by checkGenBeginHitMatrixBlock(); a branch is
+ * ended only when at least one of KEXTRA_TAILS_M / KEXTRA_TAILS_N is set.
+ */
+void checkGenEndHitMatrixBlock(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags)
+{
+    const KernelExtraFlags tailMask = KEXTRA_TAILS_M | KEXTRA_TAILS_N;
+
+    if (!(kflags & tailMask)) {
+        return;
+    }
+
+    kgenEndBranch(ctx, NULL);
+}
\ No newline at end of file
diff --git a/src/library/blas/gens/gen_helper.h b/src/library/blas/gens/gen_helper.h
new file mode 100644
index 0000000..70a177d
--- /dev/null
+++ b/src/library/blas/gens/gen_helper.h
@@ -0,0 +1,138 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef GEN_HELPER_H_
+#define GEN_HELPER_H_
+
+#include <kerngen.h>
+#include <dblock_kgen.h>
+#include <matrix_props.h>
+
+#include "blas_kgen.h"
+
+/*
+ * Flags for generateBufCopyFuncs() selecting the block copy functions
+ * to be generated. The values are distinct bits and are tested as a mask.
+ */
+typedef enum BufCopyHelperFlags {
+    // buffer copy functions are needed for matrix A blocks
+    BCHF_MATRIX_A = 0x01,
+    // buffer copy functions are needed for matrix B blocks
+    BCHF_MATRIX_B = 0x02,
+    /*
+     * read block of output matrix
+     * (either B or C)
+     */
+    BCHF_READ_OUTPUT = 0x04,
+    // write block of output matrix
+    BCHF_WRITE_OUTPUT = 0x08,
+    // do not unroll loops in transposing versions of customized generators
+    BCHF_NOT_UNROLL_TRANSPOSE = 0x10,
+    // output to image
+    BCHF_IMAGE_WRITE = 0x20
+} BufCopyHelperFlags;
+
+/*
+ * Flags for generateZeroingFuncs(), one bit per matrix role.
+ * NOTE(review): presumably a set bit requests a zeroing function for the
+ * local buffer of that matrix - confirm against checkInitSubdim().
+ */
+typedef enum ZeroGenHelperFlags {
+    ZF_MATRIX_A = 0x01,
+    ZF_MATRIX_B = 0x02,
+    ZF_MATRIX_C = 0x04
+} ZeroGenHelperFlags;
+
+/*
+ * Names of the functions copying matrix blocks between the global
+ * and the local memory. Holds customized and generic variants,
+ * transposing or not depending on the generator flags, for reading
+ * and writing back, for all the matrices.
+ *
+ * A function name stored in a 'read*' field refers to a function
+ * copying data from the global memory to the local one, and a name
+ * stored in a 'write*' field refers to a function copying in the
+ * inverse direction.
+ */
+typedef struct CopyBufFuncs {
+    // per matrix role: reading function generated for the given block size
+    char read[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN];
+    // function writing back a block of the output matrix
+    char write[FUNC_NAME_MAXLEN];
+    // per matrix role: generic reading function, generated when tails exist
+    char readGeneric[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN];
+    // generic function writing back a block of the output matrix
+    char writeGeneric[FUNC_NAME_MAXLEN];
+} CopyBufFuncs;
+
+/*
+ * Generate all needed functions copying matrix
+ * blocks between the global and the local memory
+ *
+ * @funcNames: function names structure
+ * @ctx: generator context
+ * @funcID: function ID
+ * @gset: generator settings
+ * @flags: helper flags
+ *
+ * The 'flags' field of the 'gset' structure must store flags from
+ * the 'BufCopyHelperFlags' enumeration
+ *
+ * Names of functions dealing with blocks of the output matrix
+ * are always stored to 'MATRIX_C' name fields.
+ *
+ * On success returns 0. If generation fails due
+ * to buffer overflowing, returns -1.
+ */
+int
+generateBufCopyFuncs(
+    CopyBufFuncs *funcNames,
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    BufCopyHelperFlags flags);
+
+/*
+ * Has the same semantics as the previous helper,
+ * but generates functions for zeroing local buffers.
+ *
+ * @funcNames: receives the generated function name for each matrix role
+ * @ctx: generator context
+ * @blasDim: subproblem dimensions the buffer sizes are derived from
+ * @pgran: granularity passed on to the zeroing function generator
+ * @dtype: data type; its size is used to compute row widths
+ * @flags: flags from the 'ZeroGenHelperFlags' enumeration
+ *
+ * Returns 0 on success, a non-zero generator error otherwise.
+ */
+int
+generateZeroingFuncs(
+    ZeroFuncs *funcNames,
+    struct KgenContext *ctx,
+    const SubproblemDim *blasDim,
+    const PGranularity *pgran,
+    DataType dtype,
+    ZeroGenHelperFlags flags);
+
+/*
+ * Convert kernel extra flags to result update flags for the given function
+ * (beta usage, column major access to matrix C, vectorization of C).
+ */
+UpdateResultFlags
+kextraToUpresFlags(BlasFunctionID, KernelExtraFlags kflags);
+
+/*
+ * Generate the result update with flags derived from the kernel extra
+ * flags in 'gset' by kextraToUpresFlags(). Equivalent to calling
+ * genResultUpdateWithFlags() with those flags and a NULL cached name.
+ */
+int
+generateResultUpdate(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    const char *optFuncName,
+    const char *genericFuncName);
+
+/*
+ * Generate the result update with explicitly given update flags.
+ *
+ * If 'optFuncName' is not NULL, a call of that function is generated
+ * ('genericFuncName' is called instead for incomplete tiles when lower
+ * level tails are present); otherwise the update is generated inline.
+ *
+ * Returns 0 on success, -EOVERFLOW on failure.
+ */
+int
+genResultUpdateWithFlags(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    UpdateResultFlags flags,
+    const char *optFuncName,
+    const char *genericFuncName,
+    const char *cachedName);
+
+/*
+ * Begin a branch checking 'coord.x < N' and/or 'coord.y < M', depending on
+ * which of KEXTRA_TAILS_N / KEXTRA_TAILS_M are set in 'kflags'.
+ * Generates nothing if neither is set.
+ */
+void checkGenBeginHitMatrixBlock(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags);
+
+/*
+ * End the branch begun by checkGenBeginHitMatrixBlock(); does nothing if
+ * neither KEXTRA_TAILS_M nor KEXTRA_TAILS_N is set in 'kflags'.
+ */
+void checkGenEndHitMatrixBlock(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags);
+
+#endif /* GEN_HELPER_H_ */
diff --git a/src/library/blas/gens/gen_init.c b/src/library/blas/gens/gen_init.c
new file mode 100644
index 0000000..b7ac3f7
--- /dev/null
+++ b/src/library/blas/gens/gen_init.c
@@ -0,0 +1,593 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Generators initialization
+ */
+
+#include <blas_mempat.h>
+
+#include "clblas-internal.h"
+#include "init.h"
+
+/*
+ * Initialize the four GEMM memory patterns in slots 0..3 of 'mempats'
+ * (LDS, image, cached block, cached subgroup) and return their number.
+ */
+unsigned int
+initGemmMemPatterns(MemoryPattern *mempats)
+{
+    initGemmLdsPattern(mempats + 0);
+    initGemmImgPattern(mempats + 1);
+    InitGEMMCachedBlockPattern(mempats + 2);
+    InitGEMMCachedSubgroupPattern(mempats + 3);
+
+    return 4;
+}
+
+/*
+ * Map a GEMM implementation ID to its memory pattern index (0..3).
+ * Returns -1 for any other value.
+ */
+int
+getGemmMemPatternIndex(clblasImplementation impl)
+{
+    if (impl == clblasLdsBlockGemm) {
+        return 0;
+    }
+    if (impl == clblasImageBlockGemm) {
+        return 1;
+    }
+    if (impl == clblasBlockGemmWithCaching) {
+        return 2;
+    }
+    if (impl == clblasSubgroupGemmWithCaching) {
+        return 3;
+    }
+
+    return -1;
+}
+
+/*
+ * Translate the default pattern index of the GEMM solver into an
+ * implementation ID; indices other than 0..3 give clblasDefaultGemm.
+ */
+clblasImplementation
+getGemmPreferredPattern(void)
+{
+    if (clblasSolvers[CLBLAS_GEMM].defaultPattern == 0) {
+        return clblasLdsBlockGemm;
+    }
+    if (clblasSolvers[CLBLAS_GEMM].defaultPattern == 1) {
+        return clblasImageBlockGemm;
+    }
+    if (clblasSolvers[CLBLAS_GEMM].defaultPattern == 2) {
+        return clblasBlockGemmWithCaching;
+    }
+    if (clblasSolvers[CLBLAS_GEMM].defaultPattern == 3) {
+        return clblasSubgroupGemmWithCaching;
+    }
+
+    return clblasDefaultGemm;
+}
+
+/*
+ * Initialize the single GEMV memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initGemvMemPatterns(MemoryPattern *mempats)
+{
+    initGemvPattern(&mempats[0]);
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a GEMV pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getGemvMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single SYMV memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initSymvMemPatterns(MemoryPattern *mempats)
+{
+    initSymvPattern(&mempats[0]);
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a SYMV pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getSymvMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the four TRMM memory patterns in slots 0..3 of 'mempats'
+ * (LDS, image, cached block, cached subgroup) and return their number.
+ */
+unsigned int
+initTrmmMemPatterns(MemoryPattern *mempats)
+{
+    initTrmmLdsPattern(mempats + 0);
+    initTrmmImgPattern(mempats + 1);
+    initTrmmCachedBlockPattern(mempats + 2);
+    initTrmmCachedSubgroupPattern(mempats + 3);
+
+    return 4;
+}
+
+/*
+ * Map a TRMM implementation ID to its memory pattern index (0..3).
+ * Returns -1 for any other value.
+ */
+int
+getTrmmMemPatternIndex(clblasImplementation impl)
+{
+    if (impl == clblasLdsBlockTrmm) {
+        return 0;
+    }
+    if (impl == clblasImageBlockTrmm) {
+        return 1;
+    }
+    if (impl == clblasBlockTrmmWithCaching) {
+        return 2;
+    }
+    if (impl == clblasSubgroupTrmmWithCaching) {
+        return 3;
+    }
+
+    return -1;
+}
+
+/*
+ * Translate the default pattern index of the TRMM solver into an
+ * implementation ID; indices other than 0..3 give clblasDefaultTrmm.
+ */
+clblasImplementation
+getTrmmPreferredPattern(void)
+{
+    if (clblasSolvers[CLBLAS_TRMM].defaultPattern == 0) {
+        return clblasLdsBlockTrmm;
+    }
+    if (clblasSolvers[CLBLAS_TRMM].defaultPattern == 1) {
+        return clblasImageBlockTrmm;
+    }
+    if (clblasSolvers[CLBLAS_TRMM].defaultPattern == 2) {
+        return clblasBlockTrmmWithCaching;
+    }
+    if (clblasSolvers[CLBLAS_TRMM].defaultPattern == 3) {
+        return clblasSubgroupTrmmWithCaching;
+    }
+
+    return clblasDefaultTrmm;
+}
+
+/*
+ * Initialize the four TRSM memory patterns in slots 0..3 of 'mempats'
+ * (LDS, image, LDS-less cached, cached) and return their number.
+ */
+unsigned int
+initTrsmMemPatterns(MemoryPattern *mempats)
+{
+    initTrsmLdsPattern(mempats + 0);
+    initTrsmImgPattern(mempats + 1);
+    initTrsmLdsLessCachedPattern(mempats + 2);
+    initTrsmCachedPattern(mempats + 3);
+
+    return 4;
+}
+
+/*
+ * Map a TRSM implementation ID to its memory pattern index (0..3).
+ * Returns -1 for any other value.
+ */
+int
+getTrsmMemPatternIndex(clblasImplementation impl)
+{
+    if (impl == clblasLdsBlockTrsm) {
+        return 0;
+    }
+    if (impl == clblasImageBlockTrsm) {
+        return 1;
+    }
+    if (impl == clblasBlockTrsmWithoutLds) {
+        return 2;
+    }
+    if (impl == clblasBlockTrsmWithCaching) {
+        return 3;
+    }
+
+    return -1;
+}
+
+/*
+ * Translate the default pattern index of the TRSM solver into an
+ * implementation ID; indices other than 0..3 give clblasDefaultTrsm.
+ */
+clblasImplementation
+getTrsmPreferredPattern(void)
+{
+    if (clblasSolvers[CLBLAS_TRSM].defaultPattern == 0) {
+        return clblasLdsBlockTrsm;
+    }
+    if (clblasSolvers[CLBLAS_TRSM].defaultPattern == 1) {
+        return clblasImageBlockTrsm;
+    }
+    if (clblasSolvers[CLBLAS_TRSM].defaultPattern == 2) {
+        return clblasBlockTrsmWithoutLds;
+    }
+    if (clblasSolvers[CLBLAS_TRSM].defaultPattern == 3) {
+        return clblasBlockTrsmWithCaching;
+    }
+
+    return clblasDefaultTrsm;
+}
+
+/*
+ * Initialize the two SYRK memory patterns in slots 0..1 of 'mempats'
+ * (block, subgroup) and return their number.
+ */
+unsigned int
+initSyrkMemPatterns(MemoryPattern *mempats)
+{
+    initSyrkBlockPattern(mempats + 0);
+    initSyrkSubgPattern(mempats + 1);
+
+    return 2;
+}
+
+/*
+ * Translate the default pattern index of the SYRK solver into an
+ * implementation ID; indices other than 0..1 give clblasDefaultSyrk.
+ */
+clblasImplementation
+getSyrkPreferredPattern(void)
+{
+    if (clblasSolvers[CLBLAS_SYRK].defaultPattern == 0) {
+        return clblasBlockSyrk;
+    }
+    if (clblasSolvers[CLBLAS_SYRK].defaultPattern == 1) {
+        return clblasSubgSyrk;
+    }
+
+    return clblasDefaultSyrk;
+}
+
+/*
+ * Map a SYRK implementation ID to its memory pattern index (0..1).
+ * Returns -1 for any other value.
+ */
+int
+getSyrkMemPatternIndex(clblasImplementation impl)
+{
+    if (impl == clblasBlockSyrk) {
+        return 0;
+    }
+    if (impl == clblasSubgSyrk) {
+        return 1;
+    }
+
+    return -1;
+}
+
+/*
+ * Initialize the two SYR2K memory patterns in slots 0..1 of 'mempats'
+ * (block, subgroup) and return their number.
+ */
+unsigned int
+initSyr2kMemPatterns(MemoryPattern *mempats)
+{
+    initSyr2kBlockPattern(mempats + 0);
+    initSyr2kSubgPattern(mempats + 1);
+
+    return 2;
+}
+
+/*
+ * Translate the default pattern index of the SYR2K solver into an
+ * implementation ID; indices other than 0..1 give clblasDefaultSyr2k.
+ */
+clblasImplementation
+getSyr2kPreferredPattern(void)
+{
+    if (clblasSolvers[CLBLAS_SYR2K].defaultPattern == 0) {
+        return clblasBlockSyr2k;
+    }
+    if (clblasSolvers[CLBLAS_SYR2K].defaultPattern == 1) {
+        return clblasSubgSyr2k;
+    }
+
+    return clblasDefaultSyr2k;
+}
+
+/*
+ * Map a SYR2K implementation ID to its memory pattern index (0..1).
+ * Returns -1 for any other value.
+ */
+int
+getSyr2kMemPatternIndex(clblasImplementation impl)
+{
+    if (impl == clblasBlockSyr2k) {
+        return 0;
+    }
+    if (impl == clblasSubgSyr2k) {
+        return 1;
+    }
+
+    return -1;
+}
+
+/*
+ * Initialize the single TRMV memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initTrmvMemPatterns(MemoryPattern *mempats)
+{
+    initTrmvRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a TRMV pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getTrmvMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single TRSV memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initTrsvMemPatterns(MemoryPattern *mempats)
+{
+    initTrsvDefaultPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a TRSV pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getTrsvMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single SYR memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initSyrMemPatterns(MemoryPattern *mempats)
+{
+    initSyrDefaultPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a SYR pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getSyrMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single SYR2 memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initSyr2MemPatterns(MemoryPattern *mempats)
+{
+    initSyr2DefaultPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a SYR2 pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getSyr2MemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single TRSV-GEMV memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initTrsvGemvMemPatterns(MemoryPattern *mempats)
+{
+    initTrsvGemvDefaultPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a TRSV-GEMV pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getTrsvGemvMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single SYMM memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initSymmMemPatterns(MemoryPattern *mempats)
+{
+    initSymmDefaultPattern(mempats);
+
+    return 1;
+}
+
+
+/*
+ * No implementation ID is mapped to a SYMM pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getSymmMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single GEMM V2 memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initGemmV2MemPatterns(MemoryPattern *mempats)
+{
+    initGemmV2CachedPattern(&mempats[0]);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a GEMM V2 pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getGemmV2MemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single GEMM V2 tail memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initGemmV2TailMemPatterns(MemoryPattern *mempats)
+{
+    initGemmV2TailCachedPattern(&mempats[0]);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a GEMM V2 tail pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getGemmV2TailMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single GER memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initGerMemPatterns(MemoryPattern *mempats)
+{
+    initGerRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a GER pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getGerMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single HER memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initHerMemPatterns(MemoryPattern *mempats)
+{
+    initHerDefaultPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a HER pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getHerMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single HER2 memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initHer2MemPatterns(MemoryPattern *mempats)
+{
+    initHer2DefaultPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a HER2 pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getHer2MemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single GBMV memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initGbmvMemPatterns(MemoryPattern *mempats)
+{
+    initGbmvRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a GBMV pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getGbmvMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single SWAP memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initSwapMemPatterns(MemoryPattern *mempats)
+{
+    initSwapRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a SWAP pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getSwapMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single SCAL memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initScalMemPatterns(MemoryPattern *mempats)
+{
+    initScalRegisterPattern(mempats);
+
+    return 1;
+}
+
+
+/*
+ * No implementation ID is mapped to a SCAL pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getScalMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single COPY memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initCopyMemPatterns(MemoryPattern *mempats)
+{
+    initCopyRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a COPY pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getCopyMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single AXPY memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initAxpyMemPatterns(MemoryPattern *mempats)
+{
+    initAxpyRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to an AXPY pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getAxpyMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single DOT memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initDotMemPatterns(MemoryPattern *mempats)
+{
+    initDotRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a DOT pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getDotMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single reduction memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initReductionMemPatterns(MemoryPattern *mempats)
+{
+    initReductionRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a reduction pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getReductionMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single ROTG memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initRotgMemPatterns(MemoryPattern *mempats)
+{
+    initRotgRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a ROTG pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getRotgMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single ROTMG memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initRotmgMemPatterns(MemoryPattern *mempats)
+{
+    initRotmgRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a ROTMG pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getRotmgMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single ROTM memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initRotmMemPatterns(MemoryPattern *mempats)
+{
+    initRotmRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a ROTM pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getRotmMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single iAMAX memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initiAmaxMemPatterns(MemoryPattern *mempats)
+{
+    initiAmaxRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to an iAMAX pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getiAmaxMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single NRM2 memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initNrm2MemPatterns(MemoryPattern *mempats)
+{
+    initNrm2RegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to a NRM2 pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getNrm2MemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
+
+/*
+ * Initialize the single ASUM memory pattern in the first slot of
+ * 'mempats' and return the number of patterns (1).
+ */
+unsigned int
+initAsumMemPatterns(MemoryPattern *mempats)
+{
+    initAsumRegisterPattern(mempats);
+
+    return 1;
+}
+
+/*
+ * No implementation ID is mapped to an ASUM pattern index here;
+ * -1 is returned for every 'impl'.
+ */
+int
+getAsumMemPatternIndex(clblasImplementation impl)
+{
+    (void)impl;
+
+    return -1;
+}
diff --git a/src/library/blas/gens/ger_lds.cpp b/src/library/blas/gens/ger_lds.cpp
new file mode 100644
index 0000000..b74945e
--- /dev/null
+++ b/src/library/blas/gens/ger_lds.cpp
@@ -0,0 +1,414 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * ger generator
+ */
+//#define DEBUG_GER
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+
+#include <kprintf.hpp>
+#include <ger.clT>
+#include <solution_seq.h>
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+static int
+getDefaultDecomposition(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs);
+
+// Solver flags of the GER generator: SF_WSPACE_1D only.
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_GER
+    printf("solverFlags callen......\n");
+#endif
+
+    const SolverFlags gerFlags = SF_WSPACE_1D;
+
+    return gerFlags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* );
+
+extern "C"
+void initGerRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+// Solver operations table of the GER generator. Entries are positional;
+// NULL marks an operation this generator does not provide.
+static SolverOps gerOps = {
+    generator,                  // kernel source generator
+    assignKargs,                // kernel arguments setup
+    isFitToLDS,                 // local memory fit check
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,              // global work size calculation
+    NULL,	// Related to images
+    solverFlags,
+	NULL,
+    getDefaultDecomposition,
+	NULL,
+	setBuildOpts,               // extra build options
+	selectVectorization         // vectorization flags selection
+};
+
+// Decide whether matrix A can be accessed with vectors of 'vlen' elements:
+// returns KEXTRA_NO_COPY_VEC_A when the leading dimension of A is not
+// a multiple of 'vlen', KEXTRA_NO_FLAGS otherwise.
+static  KernelExtraFlags
+selectVectorization(
+	void *args,
+	unsigned int vlen )
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs *)args;
+    const bool ldaIsMultiple = ((blasArgs->lda.matrix % vlen) == 0);
+
+    if (ldaIsMultiple) {
+        return KEXTRA_NO_FLAGS;
+    }
+
+    return KEXTRA_NO_COPY_VEC_A;
+}
+
+// Append " -DDOUBLE_PRECISION " to the build options when the data type of
+// the solution step ('args' points to a SolutionStep) is double or
+// double complex. Nothing is appended for the other types.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+    const bool needsDouble = (blasArgs->dtype == TYPE_DOUBLE) ||
+                             (blasArgs->dtype == TYPE_COMPLEX_DOUBLE);
+
+    if (!needsDouble) {
+        return;
+    }
+
+    strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+
+#ifdef DEBUG_GER
+    printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+}
+
+static CLBLASMpatExtra mpatExtra;
+
+// Fill in the GER memory pattern descriptor: two levels, the GER solver
+// operations and the file-level 'mpatExtra' settings. Also fills the
+// data type -> BLAS prefix letter table used by the generator.
+extern "C"
+void initGerRegisterPattern(MemoryPattern *mempat)
+{
+    // prefix letters, indexed by data type
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    //CHECK THIS
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; // For "x" vector
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+
+    mempat->name = "Register accumulation based ger";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &gerOps;
+    mempat->extra = &mpatExtra;
+
+#ifdef DEBUG_GER
+    printf("initGerRegPattern called with mempat = 0x%p\n", mempat);
+    fflush(stdout);
+#endif
+}
+
+// Compute the global work size for the GER kernel.
+//
+// The M x N problem is split into blocks of subdims->y by subdims->x
+// elements; the dimension along which the matrix is contiguous (rows of a
+// block for column major, columns for row major) is additionally scaled by
+// the vector length. One work group (wgSize[0] * wgSize[1] work items) is
+// launched per block, in a 1D space: threads[0] gets the total, threads[1]
+// is set to 1.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra;
+    const bool colMajor = (extra->flags & KEXTRA_COLUMN_MAJOR) != 0;
+    unsigned int VEC_LEN = extra->vecLenA;
+    size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    size_t BH = subdims->y;
+    size_t BW = subdims->x;
+
+    // rows / columns of the matrix covered by a single block
+    size_t rowsPerBlock = colMajor ? (BH*VEC_LEN) : BH;
+    size_t colsPerBlock = colMajor ? BW : (BW*VEC_LEN);
+
+    // round up so that partial blocks are counted too
+    size_t nBlocksY = ( kargs->M + rowsPerBlock - 1 ) / rowsPerBlock;
+    size_t nBlocksX = ( kargs->N + colsPerBlock - 1 ) / colsPerBlock;
+    size_t blocks = nBlocksX * nBlocksY;
+
+    threads[0] = blocks * BLOCKSIZE;
+    threads[1] = 1;
+
+#ifdef DEBUG_GER
+    printf("calcNrThreads called from GER_Reg.cpp.. wgSize[0]: %u\twgSize[1]: %u\n", pgran->wgSize[0], pgran->wgSize[1]);
+    printf("subdim->y :%u\t subdim->x : %u\n", subdims->y, subdims->x);
+    printf("kargs-> M : %d,  kargs-> N: %d,  BH: %d,  BW: %d\n", kargs->M, kargs->N, BH, BW);
+    printf("blocks : %d\tglobalthreads[0]  : %u\t VecLen :%d\n", blocks, threads[0], VEC_LEN);
+#endif
+
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+// Generate the GER kernel source.
+//
+// @buf: destination buffer; if NULL only the required buffer size is
+//       returned
+// @buflen: size of 'buf'. NOTE(review): it is not checked against the
+//          size of the generated source - callers are expected to pass
+//          a buffer of the size reported for a NULL 'buf'
+// @subdims: subproblem dimensions; y and x of the first level are
+//           substituted for %BH_DEF and %BW_DEF in the kernel template
+// @pgran: unused
+// @extra: CLBLASKernExtra with the flags, data type and vector length
+//
+// The column major or row major template is selected by
+// KEXTRA_COLUMN_MAJOR. Always returns the fixed buffer size (64 KB).
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+	size_t BH, BW;//BLOCKSIZE  = pgran->wgSize[0]; // Because we are using 1D block
+    unsigned int VEC_LEN;
+	char tempTemplate[32*1024];
+	// large enough for any unsigned int value plus the terminating NUL
+	char bhStr[16], bwStr[16];
+
+
+	(void)pgran; // unused
+	if ( buf == NULL) // return buffer size
+    {
+          buflen = (64 * 1024 * sizeof(char));
+          return (ssize_t)buflen;
+    }
+	CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+	clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
+	VEC_LEN = extraFlags->vecLenA;
+
+	#ifdef DEBUG_GER
+	printf("GER GENERATOR called.... with %s order,  DataType %c  & Vector-Length: %d\n",
+				((order == clblasColumnMajor)? "ColumnMajor": "RowMajor"), Prefix[extraFlags->dtype], VEC_LEN );
+	#endif
+
+	if( order == clblasColumnMajor )
+	{
+		strcpy( tempTemplate, (char*)ger_C_kernel );
+	}
+	else
+	{
+		strcpy( tempTemplate, (char*)ger_R_kernel );
+	}
+
+	// FIXME: VECTORSIZE HARD CODED
+	// FIXME: SetKernelArgs.. sends offa, offx, and lda should be received as uint
+
+	// without an aligned data pointer the kernel has to use vload/vstore
+	bool doVLOAD = false;
+	if( extraFlags->flags &  KEXTRA_NO_COPY_VEC_A )
+	{
+		doVLOAD = true;
+		#ifdef DEBUG_GER
+			printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+		#endif
+	}
+	else
+	{
+		#ifdef DEBUG_GER
+			printf("Using Aligned Data Pointer .........................\n");
+		#endif
+	}
+	kprintf kobj( Prefix[extraFlags->dtype], VEC_LEN, doVLOAD, doVLOAD);
+
+	BH = subdims->y;
+	BW = subdims->x;
+	// size_t must not be passed for an int conversion; convert explicitly
+	sprintf( bhStr, "%u", (unsigned int)BH );
+	sprintf( bwStr, "%u", (unsigned int)BW );
+
+	#ifdef DEBUG_GER
+    printf("BH = %s\n", bhStr);
+    printf("BW = %s\n", bwStr);
+	#endif
+
+    kobj.put("%BH_DEF", (const char *)bhStr);
+    kobj.put("%BW_DEF", (const char *)bwStr);
+    kobj.spit((char*)buf, tempTemplate);
+
+
+	return (64 * 1024 * sizeof(char));
+    // return 0;//(ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+		( __global const %TYPE* X, __global const %TYPE* Y, __global %TYPE* A,
+				uint M, uint N, uint offx, int incx, uint offy, int incy, uint offa, uint lda,
+				%TYPE alpha, int doConj )
+*/
+
+// Set up the 13 kernel arguments in the order of the kernel signature:
+// X, Y, A, M, N, offx, incx, offy, incy, offa, lda, alpha, doConj.
+// In the generic BLAS arguments X is held in 'B', Y in 'C', and 'K'
+// carries the doConj value.
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    cl_int incx = blasArgs->ldb.vector;
+    cl_int incy = blasArgs->ldc.vector;
+    cl_int doConj = (cl_int)(blasArgs->K);   // K was used as doConj
+
+    // memory objects
+    INIT_KARG(&args[0], blasArgs->B);   //  B - our X vector
+    INIT_KARG(&args[1], blasArgs->C);   //  C - our Y vector
+    INIT_KARG(&args[2], blasArgs->A);   //  A - matrix A
+
+    // problem sizes
+    initSizeKarg(&args[3], blasArgs->M);
+    initSizeKarg(&args[4], blasArgs->N);
+
+    // offsets and increments of the vectors
+    initSizeKarg(&args[5], blasArgs->offBX);
+    INIT_KARG(&args[6], incx);
+    initSizeKarg(&args[7], blasArgs->offCY);
+    INIT_KARG(&args[8], incy);
+
+    // offset and leading dimension of the matrix
+    initSizeKarg(&args[9], blasArgs->offa);
+    initSizeKarg(&args[10], blasArgs->lda.matrix);
+
+    assignScalarKarg(&args[11], &(blasArgs->alpha), blasArgs->dtype);
+    INIT_KARG(&args[12], doConj);
+
+#ifdef DEBUG_GER
+    printf("doConj = %d\n", doConj );
+#endif
+}
+
+// Check whether the data needed by one top level block fits in local
+// memory.
+//
+// @dim: subproblem dimensions; only dim[0] is used
+// @dtype: data type of the elements
+// @ldsSize: size of the local memory, in bytes
+// @kernelArgs: CLBlasKargs embedded in a SolutionStep; the vector length
+//              is taken from the extra data of the step's computing kernel
+//
+// The block needs x + y elements, with the dimension selected by the
+// matrix order scaled by the vector length. The element count is
+// multiplied by the size of the data type (dtypeSize()), not by the size
+// of the DataType enumeration itself.
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
+    SolutionStep *step = container_of(kargs, args, SolutionStep);
+    unsigned int vecLen;
+    vecLen = ((CLBLASKernExtra*)(step->kernels[CLBLAS_COMPUTING_KERNEL]->extra))->vecLenA;
+
+    const cl_ulong elemSize = dtypeSize(dtype);
+    cl_ulong maxSize;
+
+    if( kargs->order == clblasColumnMajor ) {
+        maxSize = ( dim[0].x + (dim[0].y * vecLen) ) * elemSize;
+    } else {
+        maxSize = ( (dim[0].x * vecLen) + dim[0].y ) * elemSize;
+    }
+    return ( maxSize <= ldsSize );
+}
+
+// Choose the default decomposition for GER.
+//
+// @pgran: granularity embedded in a SolutionStep; receives a 1D work group
+//         of wgX * wgY work items
+// @subdims: subproblem dimensions to fill; subdims[0].x and subdims[0].y
+//           are always written
+// @subdimsNum: number of entries of 'subdims' whose remaining fields are
+//              filled
+// @pArgs: unused
+//
+// The block dimension along which the matrix is contiguous is fixed to 16,
+// the other one is the rest of the device's maximum work group size,
+// limited to 16. If the device limit cannot be queried or is below 16,
+// the second dimension falls back to 1 so that a block dimension is never
+// zero (later code divides by the block dimensions).
+//
+// Always returns 0.
+static int
+getDefaultDecomposition(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs)
+{
+    SolutionStep *step = container_of( pgran , pgran, SolutionStep);
+    size_t maxWorkGroupSize = 0;
+    cl_device_id devID = step->device.id;
+    size_t wgX, wgY;
+    cl_int err;
+
+    (void)pArgs;
+
+    err = clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                          sizeof(size_t), &maxWorkGroupSize, NULL);
+    if (err != CL_SUCCESS) {
+        // do not use a value the query did not provide
+        maxWorkGroupSize = 0;
+    }
+
+    if( step->args.order == clblasColumnMajor )
+	{
+		wgY = 16;						// BH preferably 16(quarter wave-front)
+		subdims[0].y = wgY;
+		wgX = maxWorkGroupSize / wgY;	// BW is left upto maxWorkGroupSize of the device
+		wgX = szmin( wgX, 16 );
+		if (wgX == 0) {
+			wgX = 1;
+		}
+		subdims[0].x = wgX;
+	}
+	else {
+		wgX = 16;
+		subdims[0].x = wgX;
+		wgY = maxWorkGroupSize / wgX;
+		wgY = szmin( wgY, 16 );
+		if (wgY == 0) {
+			wgY = 1;
+		}
+		subdims[0].y = wgY;
+	}
+
+    pgran->wgDim = 1; //1D blocking
+    pgran->wgSize[0] = (unsigned int)(wgX * wgY);
+    pgran->wgSize[1] = 1;
+
+    if(subdimsNum > 0)
+    {
+        subdims[0].itemX = subdims[0].x;
+        subdims[0].itemY = subdims[0].y;
+        subdims[0].bwidth = 1;
+    }
+    if(subdimsNum > 1)
+    {
+        subdims[1].itemY = 1;
+        subdims[1].itemX = 1;
+        subdims[1].y = subdims[1].itemY;
+        subdims[1].x = subdims[1].itemX;
+        subdims[1].bwidth = 1;
+    }
+
+    return 0;
+}
diff --git a/src/library/blas/gens/her2_lds.cpp b/src/library/blas/gens/her2_lds.cpp
new file mode 100644
index 0000000..a409c1a
--- /dev/null
+++ b/src/library/blas/gens/her2_lds.cpp
@@ -0,0 +1,365 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * HER2 Generator
+ */
+//#define DEBUG_HER2
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+
+#include <kprintf.hpp>
+#include <syr2_her2.clT>
+#include <solution_seq.h>
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+/*
+ * Report the solver flags of the HER2 pattern:
+ * SF_WSPACE_1D combined with SF_TOP_INPUT_SQUARE_BLOCKS.
+ */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags her2Flags =
+        (SolverFlags)(SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
+
+#ifdef DEBUG_HER2
+    printf("solverFlags called......\n");
+#endif
+
+    return her2Flags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void*);
+
+extern "C"
+void initHer2DefaultPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+/*
+ * Solver callback table of the LDS based HER2 pattern. It is installed into
+ * the pattern by initHer2DefaultPattern() (mempat->sops = &her2Ops).
+ * The initializer is positional, so the order of the entries must follow the
+ * member order of SolverOps; a NULL entry leaves that slot without a
+ * callback for this pattern.
+ */
+static SolverOps her2Ops = {
+    generator,          // kernel source generator
+    assignKargs,        // kernel argument setup
+    isFitToLDS,         // local memory fit check
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,      // global work size calculation
+    NULL,
+    solverFlags,        // solver flag query
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,       // build option string setup
+	selectVectorization // vectorization flag selection
+};
+
+/*
+ * Decide whether the HER2 kernel may use aligned vector access to A.
+ *
+ * args points to the CLBlasKargs of the call and vlen is the candidate
+ * vector length. KEXTRA_NO_COPY_VEC_A is returned when
+ *   - the upper triangle is addressed and N is not a multiple of vlen, or
+ *   - the call is the packed variant (CLBLAS_HPR2), which never does
+ *     aligned access.
+ * Otherwise KEXTRA_NO_FLAGS is returned.
+ */
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    KernelExtraFlags kflags = KEXTRA_NO_FLAGS;
+
+    // N % vlen is evaluated for the upper triangle only.
+    const bool unalignedUpper =
+        (kargs->uplo == clblasUpper) && (((kargs->N) % vlen) != 0);
+    const bool packedCase = (kargs->pigFuncID == CLBLAS_HPR2);
+
+    if( unalignedUpper || packedCase )
+    {
+        kflags = KEXTRA_NO_COPY_VEC_A;
+    }
+
+    return kflags;
+}
+
+
+/*
+ * Append the compiler defines of the HER2 kernel to buildOptStr.
+ *
+ * args points to the SolutionStep being built; its kernel arguments select
+ * the optional defines:
+ *   -DDOUBLE_PRECISION  when dtype is TYPE_COMPLEX_DOUBLE
+ *   -DHER2_ROWMAJOR     when order is clblasRowMajor
+ *   -DPACKED            when pigFuncID is CLBLAS_HPR2
+ * -DHER2_ONLY is always appended last. The options are added with strcat,
+ * so buildOptStr must already hold a NUL-terminated string.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+
+    const bool wantDouble   = ( blasArgs->dtype == TYPE_COMPLEX_DOUBLE );
+    const bool wantRowMajor = ( blasArgs->order == clblasRowMajor );
+    const bool wantPacked   = ( blasArgs->pigFuncID == CLBLAS_HPR2 );
+
+    if( wantDouble )
+    {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_HER2
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if( wantRowMajor )
+    {
+        strcat( buildOptStr, " -DHER2_ROWMAJOR ");
+#ifdef DEBUG_HER2
+        printf("Setting build options ... HERMITIAN2_ROWMAJOR... for row-major support\n");
+#endif
+    }
+
+    if( wantPacked )
+    {
+        strcat( buildOptStr, " -DPACKED ");
+    }
+
+    // Restricts syr2_her2.clT to the HER2 related code.
+    strcat( buildOptStr, " -DHER2_ONLY ");
+}
+
+// Memory-level description handed to the pattern through mempat->extra.
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Initialize mempat as the "LDS based her2" memory pattern.
+ *
+ * Sets the pattern name, two decomposition levels (cuLevel 0, thLevel 1)
+ * and the her2Ops callback table, fills the file-level mpatExtra (A: level
+ * 0, B: CLMEM_LEVEL_LDS for the "x" vector, both objects in global memory)
+ * and attaches it as mempat->extra. It also fills the Prefix table entries
+ * for the two complex data types ('C' and 'Z').
+ */
+extern "C"
+void initHer2DefaultPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_HER2
+    // The trace used to name initHerDefaultPattern (copied from her_lds.cpp).
+    printf("initHer2DefaultPattern called with mempat = 0x%p\n", (void *)mempat);
+    fflush(stdout);
+#endif
+
+    mempat->name = "LDS based her2";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &her2Ops;
+
+    mpatExtra.aMset = 0;
+    mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector
+    //mpatExtra.cMset = CLMEM_LEVEL_LDS; // For "y" vector
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    //mpatExtra.mobjC = CLMEM_GLOBAL_MEMORY;
+
+    mempat->extra = &mpatExtra;
+
+    // Kernel name prefixes, indexed by data type; HER2 is complex only.
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+}
+
+/*
+ * Compute the global work size of the HER2 kernel.
+ *
+ * With TARGETROWS = subdims->y and blocks = ceil(N / TARGETROWS), the kernel
+ * is launched with blocks * (blocks + 1) / 2 work-groups of pgran->wgSize[0]
+ * work-items each, i.e. the number of tiles in one triangle (diagonal
+ * included) of a blocks x blocks tiling.
+ *
+ * threads[0] receives the total work-item count, threads[1] is set to 1.
+ * _extra is not used.
+ * NOTE(review): the block count is computed as (N - 1) / TARGETROWS + 1 and
+ * therefore expects N >= 1 and TARGETROWS != 0 - confirm callers ensure it.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const int BLOCKSIZE = pgran->wgSize[0]; // 1D Block
+    const size_t TARGETROWS = subdims->y;
+    const size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1;
+
+    (void)_extra;
+
+    threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE;
+    threads[1] = 1;
+
+#ifdef DEBUG_HER2
+    // size_t values are narrowed explicitly because the format uses %d.
+    printf("calcNrThreads called from her2_lds.cpp\n");
+    printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x);
+    printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, (int)TARGETROWS);
+    printf("blocks : %d\n", (int)blocks);
+    printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]);
+#endif
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Generate the OpenCL source of the HER2 kernel into buf.
+ *
+ * When buf is NULL the call is a size query: the fixed buffer size (64 KiB)
+ * is returned and no other argument is dereferenced.
+ *
+ * Otherwise the syr2_her2_CU_kernel template is used when
+ * KEXTRA_UPPER_TRIANG is set in the CLBLASKernExtra flags passed through
+ * extra, and syr2_her2_CL_kernel when it is not. %TARGET_ROWS is bound to
+ * subdims->y, %BLOCKSIZE to pgran->wgSize[0], and kprintf expands the
+ * template into buf. buflen is not consulted.
+ *
+ * Returns the fixed buffer size on success (not the generated length, see
+ * the FIXME above). Returns 0 after printing a warning when subdims->y is
+ * not a multiple of vecLenA or wgSize[0] is not a multiple of subdims->y;
+ * a zero divisor in either check is rejected the same way.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    const size_t kernelBufSize = 64 * 1024 * sizeof(char);
+    char tempTemplate[64*1024];
+    // Large enough for any unsigned int in decimal plus the terminator.
+    char targetRows[16], blockSize[16];
+
+    (void)buflen;
+
+    if ( buf == NULL) // return buffer size
+    {
+        return (ssize_t)kernelBufSize;
+    }
+
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+    const size_t BLOCKSIZE = pgran->wgSize[0];
+    const size_t TARGETROWS = subdims->y;
+    // FIXME: VECTORSIZE HARD CODED
+    // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
+    const unsigned int vecLenA = extraFlags->vecLenA;
+
+#ifdef DEBUG_HER2
+    printf("HER2 GENERATOR called....\n");
+#endif
+
+    if ((vecLenA == 0) || ((TARGETROWS % vecLenA) != 0))
+    {
+        printf("WARNING: HER2: generator: TARGETROWS must be divisible by Vector Length\n");
+        return 0;
+    }
+
+    if ((TARGETROWS == 0) || ((BLOCKSIZE % TARGETROWS) != 0))
+    {
+        printf("WARNING: HER2: generator: Invalid Block Size\n");
+        return 0;
+    }
+
+    const clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
+    strcpy(tempTemplate,
+           ( uplo == clblasLower ) ? (char*)syr2_her2_CL_kernel
+                                   : (char*)syr2_her2_CU_kernel);
+
+#ifdef DEBUG_HER2
+    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+    printf("Vector length used : %d\n\n", vecLenA);
+#endif
+
+    // Without an aligned data pointer the kernel has to use vload/vstore.
+    const bool doVLOAD = (extraFlags->flags & KEXTRA_NO_COPY_VEC_A) != 0;
+#ifdef DEBUG_HER2
+    if( doVLOAD )
+    {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else
+    {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+#endif
+
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
+
+    // The values are size_t; convert explicitly to match the conversion.
+    sprintf( targetRows, "%u", (unsigned int)TARGETROWS );
+    sprintf( blockSize, "%u", (unsigned int)BLOCKSIZE );
+
+#ifdef DEBUG_HER2
+    printf("TARGET ROWS = %s\n", targetRows);
+    printf("BLOCK SIZE = %s\n", blockSize);
+#endif
+
+    kobj.put("%TARGET_ROWS", (const char *)targetRows);
+    kobj.put("%BLOCKSIZE", (const char *) blockSize);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return (ssize_t)kernelBufSize;
+    // return 0;//(ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+ * Kernel prototype the arguments are prepared for:
+ * ( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N,
+ *				int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+ *
+ * Fills args[0..10] from the CLBlasKargs passed in params. The third
+ * parameter is unused.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    cl_int incx = blasArgs->ldb.vector;
+    cl_int incy = blasArgs->ldc.vector;
+
+    // Buffers: matrix A (input/output), vector x, vector y.
+    INIT_KARG(&args[0], blasArgs->A);
+    INIT_KARG(&args[1], blasArgs->B);
+    INIT_KARG(&args[2], blasArgs->C);
+
+    initSizeKarg(&args[3], blasArgs->N);
+
+    // Offset and increment of x.
+    initSizeKarg(&args[4], blasArgs->offBX);
+    INIT_KARG(&args[5], incx);
+
+    // Offset and increment of y.
+    initSizeKarg(&args[6], blasArgs->offCY);
+    INIT_KARG(&args[7], incy);
+
+    // Offset and leading dimension of A.
+    initSizeKarg(&args[8], blasArgs->offa);
+    initSizeKarg(&args[9], blasArgs->lda.matrix);
+
+    // alpha is passed with the data type of the call.
+    assignScalarKarg(&args[10], &(blasArgs->alpha), blasArgs->dtype);
+}
+
+/*
+ * Check whether the local memory needed by the HER2 kernel fits into
+ * ldsSize bytes.
+ *
+ * The estimate covers 4 buffers of dim->y elements (xShared, yShared,
+ * xSharedTrans and ySharedTrans) plus 2 integers (iShared and jShared).
+ * The element size comes from dtypeSize(dtype); taking sizeof of the
+ * DataType variable would measure the enum itself, not the element.
+ * kernelArgs is not used.
+ *
+ * Returns true when the estimate does not exceed ldsSize.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    (void)kernelArgs;
+
+    const cl_ulong elemSize = dtypeSize(dtype);
+    const cl_ulong maxSize = ((cl_ulong)dim->y * 4 * elemSize) + (2 * sizeof(int));
+
+    return (maxSize <= ldsSize);
+}
+//#undef DEBUG_HER2
+
diff --git a/src/library/blas/gens/her_lds.cpp b/src/library/blas/gens/her_lds.cpp
new file mode 100644
index 0000000..e174de2
--- /dev/null
+++ b/src/library/blas/gens/her_lds.cpp
@@ -0,0 +1,360 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * HER Generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+
+#include <kprintf.hpp>
+#include <syr_her.clT>
+#include <solution_seq.h>
+//#define DEBUG_HER
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+/*
+ * Report the solver flags of the HER pattern:
+ * SF_WSPACE_1D combined with SF_TOP_INPUT_SQUARE_BLOCKS.
+ */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags herFlags =
+        (SolverFlags)(SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
+
+#ifdef DEBUG_HER
+    printf("solverFlags called......\n");
+#endif
+
+    return herFlags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void*);
+
+extern "C"
+void initHerDefaultPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+/*
+ * Solver callback table of the LDS based HER pattern. It is installed into
+ * the pattern by initHerDefaultPattern() (mempat->sops = &herOps).
+ * The initializer is positional, so the order of the entries must follow the
+ * member order of SolverOps; a NULL entry leaves that slot without a
+ * callback for this pattern.
+ */
+static SolverOps herOps = {
+    generator,          // kernel source generator
+    assignKargs,        // kernel argument setup
+    isFitToLDS,         // local memory fit check
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,      // global work size calculation
+    NULL,
+    solverFlags,        // solver flag query
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,       // build option string setup
+	selectVectorization // vectorization flag selection
+};
+
+/*
+ * Decide whether the HER kernel may use aligned vector access to A.
+ *
+ * args points to the CLBlasKargs of the call and vlen is the candidate
+ * vector length. KEXTRA_NO_COPY_VEC_A is returned when
+ *   - the upper triangle is addressed and N is not a multiple of vlen, or
+ *   - the call is the packed variant (CLBLAS_HPR), which never does
+ *     aligned access.
+ * Otherwise KEXTRA_NO_FLAGS is returned.
+ */
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    KernelExtraFlags kflags = KEXTRA_NO_FLAGS;
+
+    // N % vlen is evaluated for the upper triangle only.
+    const bool unalignedUpper =
+        (kargs->uplo == clblasUpper) && (((kargs->N) % vlen) != 0);
+    const bool packedCase = (kargs->pigFuncID == CLBLAS_HPR);
+
+    if( unalignedUpper || packedCase )
+    {
+        kflags = KEXTRA_NO_COPY_VEC_A;
+    }
+
+    return kflags;
+}
+
+
+/*
+ * Append the compiler defines of the HER kernel to buildOptStr.
+ *
+ * args points to the SolutionStep being built; its kernel arguments select
+ * the optional defines:
+ *   -DDOUBLE_PRECISION   when dtype is TYPE_COMPLEX_DOUBLE
+ *   -DHERMITIAN_ROWMAJOR when order is clblasRowMajor
+ *   -DPACKED             when pigFuncID is CLBLAS_HPR
+ * -DHER_ONLY is always appended last. The options are added with strcat,
+ * so buildOptStr must already hold a NUL-terminated string.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+
+    const bool wantDouble   = ( blasArgs->dtype == TYPE_COMPLEX_DOUBLE );
+    const bool wantRowMajor = ( blasArgs->order == clblasRowMajor );
+    const bool wantPacked   = ( blasArgs->pigFuncID == CLBLAS_HPR );
+
+    if( wantDouble )
+    {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_HER
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if( wantRowMajor )
+    {
+        strcat( buildOptStr, " -DHERMITIAN_ROWMAJOR ");
+#ifdef DEBUG_HER
+        printf("Setting build options ... HERMITIAN_ROWMAJOR... for row-major support\n");
+#endif
+    }
+
+    if( wantPacked )
+    {
+        strcat( buildOptStr, " -DPACKED ");
+    }
+
+    // Restricts syr_her.clT to the HER related code.
+    strcat( buildOptStr, " -DHER_ONLY ");
+}
+
+// Memory-level description handed to the pattern through mempat->extra.
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Initialize mempat as the "LDS based HER" memory pattern.
+ *
+ * Sets the pattern name, two decomposition levels (cuLevel 0, thLevel 1)
+ * and the herOps callback table, fills the file-level mpatExtra (A: level
+ * 0, B: CLMEM_LEVEL_LDS for the "x" vector, both objects in global memory)
+ * and attaches it as mempat->extra. It also fills the Prefix table entries
+ * for the two complex data types ('C' and 'Z').
+ */
+extern "C"
+void initHerDefaultPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_HER
+    printf("initHerDefaultPattern called with mempat = 0x%p\n", (void *)mempat);
+    fflush(stdout);
+#endif
+
+    // Memory levels and objects used for A and for the "x" vector (B).
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.aMset = 0;
+    mpatExtra.bMset = CLMEM_LEVEL_LDS;
+
+    // Pattern description.
+    mempat->name = "LDS based HER";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &herOps;
+    mempat->extra = &mpatExtra;
+
+    // Kernel name prefixes, indexed by data type; HER is complex only.
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+}
+
+/*
+ * Compute the global work size of the HER kernel.
+ *
+ * With TARGETROWS = subdims->y and blocks = ceil(N / TARGETROWS), the kernel
+ * is launched with blocks * (blocks + 1) / 2 work-groups of pgran->wgSize[0]
+ * work-items each, i.e. the number of tiles in one triangle (diagonal
+ * included) of a blocks x blocks tiling.
+ *
+ * threads[0] receives the total work-item count, threads[1] is set to 1.
+ * _extra is not used.
+ * NOTE(review): the block count is computed as (N - 1) / TARGETROWS + 1 and
+ * therefore expects N >= 1 and TARGETROWS != 0 - confirm callers ensure it.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const int BLOCKSIZE = pgran->wgSize[0]; // 1D Block
+    const size_t TARGETROWS = subdims->y;
+    const size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1;
+
+    (void)_extra;
+
+    threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE;
+    threads[1] = 1;
+
+#ifdef DEBUG_HER
+    // size_t values are narrowed explicitly because the format uses %d.
+    printf("calcNrThreads called from her_lds.cpp\n");
+    printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x);
+    printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, (int)TARGETROWS);
+    printf("blocks : %d\n", (int)blocks);
+    printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]);
+#endif
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Generate the OpenCL source of the HER kernel into buf.
+ *
+ * When buf is NULL the call is a size query: the fixed buffer size (64 KiB)
+ * is returned and no other argument is dereferenced.
+ *
+ * Otherwise the syr_her_CU_kernel template is used when KEXTRA_UPPER_TRIANG
+ * is set in the CLBLASKernExtra flags passed through extra, and
+ * syr_her_CL_kernel when it is not. %TARGET_ROWS is bound to subdims->y,
+ * %BLOCKSIZE to pgran->wgSize[0], and kprintf expands the template into
+ * buf. buflen is not consulted.
+ *
+ * Returns the fixed buffer size on success (not the generated length, see
+ * the FIXME above). Returns 0 after printing a warning when subdims->y is
+ * not a multiple of vecLenA or wgSize[0] is not a multiple of subdims->y;
+ * a zero divisor in either check is rejected the same way.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    const size_t kernelBufSize = 64 * 1024 * sizeof(char);
+    // Sized like the buffer reported to the caller, so that a template which
+    // fits the output buffer also fits this scratch copy.
+    char tempTemplate[64*1024];
+    // Large enough for any unsigned int in decimal plus the terminator.
+    char targetRows[16], blockSize[16];
+
+    (void)buflen;
+
+    if ( buf == NULL) // return buffer size
+    {
+        return (ssize_t)kernelBufSize;
+    }
+
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+    const size_t BLOCKSIZE = pgran->wgSize[0];
+    const size_t TARGETROWS = subdims->y;
+    // FIXME: VECTORSIZE HARD CODED
+    // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
+    const unsigned int vecLenA = extraFlags->vecLenA;
+
+#ifdef DEBUG_HER
+    printf("HER GENERATOR called....\n");
+#endif
+
+    if ((vecLenA == 0) || ((TARGETROWS % vecLenA) != 0))
+    {
+        printf("WARNING: HER: generator: TARGETROWS must be divisible by Vector Length\n");
+        return 0;
+    }
+
+    if ((TARGETROWS == 0) || ((BLOCKSIZE % TARGETROWS) != 0))
+    {
+        printf("WARNING: HER: generator: Invalid Block Size\n");
+        return 0;
+    }
+
+    const clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
+    strcpy(tempTemplate,
+           ( uplo == clblasLower ) ? (char*)syr_her_CL_kernel
+                                   : (char*)syr_her_CU_kernel);
+
+#ifdef DEBUG_HER
+    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+    printf("Vector length used : %d\n\n", vecLenA);
+#endif
+
+    // Without an aligned data pointer the kernel has to use vload/vstore.
+    const bool doVLOAD = (extraFlags->flags & KEXTRA_NO_COPY_VEC_A) != 0;
+#ifdef DEBUG_HER
+    if( doVLOAD )
+    {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else
+    {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+#endif
+
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
+
+    // The values are size_t; convert explicitly to match the conversion.
+    sprintf( targetRows, "%u", (unsigned int)TARGETROWS );
+    sprintf( blockSize, "%u", (unsigned int)BLOCKSIZE );
+
+#ifdef DEBUG_HER
+    printf("TARGET ROWS = %s\n", targetRows);
+    printf("BLOCK SIZE = %s\n", blockSize);
+#endif
+
+    kobj.put("%TARGET_ROWS", (const char *)targetRows);
+    kobj.put("%BLOCKSIZE", (const char *) blockSize);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return (ssize_t)kernelBufSize;
+    // return 0;//(ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+ * Kernel prototype the arguments are prepared for:
+ *	 ( __global %TYPE* _A, __global const %TYPE* _X, int N,
+ *										int offx, int incx, int offa, int lda, %PTYPE alpha )
+ *
+ * Fills args[0..7] from the CLBlasKargs passed in params. The third
+ * parameter is unused.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    cl_int incx = blasArgs->ldb.vector;
+
+    // alpha is passed as a real scalar of the precision matching dtype.
+    DataType alphaType = TYPE_DOUBLE;
+    if (blasArgs->dtype == TYPE_COMPLEX_FLOAT)
+    {
+        alphaType = TYPE_FLOAT;
+    }
+
+    // Buffers: matrix A (input/output) and vector x.
+    INIT_KARG(&args[0], blasArgs->A);
+    INIT_KARG(&args[1], blasArgs->B);
+
+    initSizeKarg(&args[2], blasArgs->N);
+
+    // Offset and increment of x.
+    initSizeKarg(&args[3], blasArgs->offBX);
+    INIT_KARG(&args[4], incx);
+
+    // Offset and leading dimension of A.
+    initSizeKarg(&args[5], blasArgs->offa);
+    initSizeKarg(&args[6], blasArgs->lda.matrix);
+
+    assignScalarKarg(&args[7], &(blasArgs->alpha), alphaType);
+}
+
+/*
+ * Check whether the local memory needed by the HER kernel fits into
+ * ldsSize bytes.
+ *
+ * The estimate covers 2 buffers of dim->y elements (xShared and yShared)
+ * plus 2 integers (iShared and jShared). The element size comes from
+ * dtypeSize(dtype); taking sizeof of the DataType variable would measure
+ * the enum itself, not the element. kernelArgs is not used.
+ *
+ * Returns true when the estimate does not exceed ldsSize.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    (void)kernelArgs;
+
+    const cl_ulong elemSize = dtypeSize(dtype);
+    const cl_ulong maxSize = ((cl_ulong)dim->y * 2 * elemSize) + (2 * sizeof(int));
+
+    return (maxSize <= ldsSize);
+}
+//#undef DEBUG_HER
diff --git a/src/library/blas/gens/iamax.cpp b/src/library/blas/gens/iamax.cpp
new file mode 100644
index 0000000..bf20afd
--- /dev/null
+++ b/src/library/blas/gens/iamax.cpp
@@ -0,0 +1,303 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * amax generator
+ */
+//#define DEBUG_AMAX
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <iamax.clT>
+#include <solution_seq.h>
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+/*
+ * Report the solver flags of the AMAX pattern: SF_WSPACE_1D only.
+ */
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_AMAX
+    printf("solverFlags called...\n");
+#endif
+
+    const SolverFlags amaxFlags = SF_WSPACE_1D;
+    return amaxFlags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+    fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+// Forward declaration of the pattern initializer defined further down in
+// this file; the name has to match that definition (initiAmaxRegisterPattern).
+extern "C"
+void initiAmaxRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+/*
+ * Solver callback table of the register based AMAX pattern. It is installed
+ * into the pattern by initiAmaxRegisterPattern() (mempat->sops = &amaxOps).
+ * The initializer is positional, so the order of the entries must follow the
+ * member order of SolverOps; a NULL entry leaves that slot without a
+ * callback for this pattern.
+ */
+static SolverOps amaxOps = {
+    generator,          // kernel source generator
+    assignKargs,        // kernel argument setup
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,      // global work size calculation
+    NULL,
+    solverFlags,        // solver flag query
+	fixupArgs,          // stores the work-group size in subdims->bwidth
+	NULL,
+	NULL,
+	setBuildOpts,       // build option string setup
+	selectVectorization // vectorization flag selection
+};
+
+/*
+ * Decide whether the AMAX kernel may use aligned vector access.
+ *
+ * args points to the CLBlasKargs of the call and vlen is the candidate
+ * vector length. KEXTRA_NO_COPY_VEC_A is returned when kargs->offa is not a
+ * multiple of vlen, KEXTRA_NO_FLAGS otherwise.
+ * NOTE(review): the test reads offa while assignKargs passes offb to the
+ * kernel - confirm both refer to the same offset.
+ */
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    const bool offsetAligned = (((kargs->offa) % vlen) == 0);
+
+    if( offsetAligned )
+    {
+        return KEXTRA_NO_FLAGS;
+    }
+    return KEXTRA_NO_COPY_VEC_A;
+}
+
+/*
+ * Append the compiler defines of the AMAX kernel to buildOptStr.
+ *
+ * args points to the SolutionStep being built; its kernel arguments select
+ * the defines:
+ *   -DDOUBLE_PRECISION               dtype is TYPE_DOUBLE or TYPE_COMPLEX_DOUBLE
+ *   -DINCX_NONUNITY                  ldb.vector (incx) differs from 1
+ *   -DRETURN_ON_INVALID              ldb.vector (incx) is smaller than 1
+ *   -DREDUCE_MAX_WITH_INDEX_ATOMICS  redctnType is REDUCE_MAX_WITH_INDEX_ATOMICS
+ * The options are added with strcat, so buildOptStr must already hold a
+ * NUL-terminated string.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+
+    const bool wantDouble =
+        ( blasArgs->dtype == TYPE_DOUBLE || blasArgs->dtype == TYPE_COMPLEX_DOUBLE );
+
+    if( wantDouble )
+    {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_AMAX
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    // Increment related options.
+    if( (blasArgs->ldb.vector) != 1 )
+    {
+        strcat( buildOptStr, " -DINCX_NONUNITY ");
+    }
+    if( (blasArgs->ldb.vector) < 1 )
+    {
+        strcat( buildOptStr, " -DRETURN_ON_INVALID ");
+    }
+
+    // Reduction flavour.
+    if( blasArgs->redctnType == REDUCE_MAX_WITH_INDEX_ATOMICS )
+    {
+        strcat( buildOptStr, " -DREDUCE_MAX_WITH_INDEX_ATOMICS ");
+    }
+}
+
+
+// Memory-level description handed to the pattern through mempat->extra.
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Initialize mempat as the "Register AMAX" memory pattern.
+ *
+ * Sets the pattern name, two decomposition levels (cuLevel 0, thLevel 1)
+ * and the amaxOps callback table, fills the file-level mpatExtra (A and B
+ * at CLMEM_LEVEL_L2, both objects in global memory) and attaches it as
+ * mempat->extra. It also fills the Prefix table for all four data types.
+ *
+ * stdout is flushed only in DEBUG_AMAX builds, together with the trace.
+ */
+extern "C"
+void initiAmaxRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_AMAX
+    printf("initRegPattern called with mempat = 0x%p\n", (void *)mempat);
+    fflush(stdout);
+#endif
+
+    mempat->name = "Register AMAX";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &amaxOps;
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    mempat->extra = &mpatExtra;
+
+    // Kernel name prefixes, indexed by data type.
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+}
+
+/*
+ * Compute the global work size of the AMAX kernel.
+ *
+ * A work-group holds wgSize[0] * wgSize[1] work-items and, with the vector
+ * length taken from the CLBLASKernExtra passed through _extra, covers
+ * BLOCKSIZE * VEC_LEN elements. The kernel is launched with
+ * ceil(N / (BLOCKSIZE * VEC_LEN)) work-groups.
+ *
+ * threads[0] receives the total work-item count, threads[1] is set to 1.
+ * subdims is not used.
+ * NOTE(review): the block count is computed as (N - 1) / (...) + 1 and
+ * therefore expects N >= 1 and a non-zero divisor - confirm callers ensure it.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const CLBLASKernExtra *extra = (const CLBLASKernExtra *)_extra;
+    const int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    const unsigned int VEC_LEN = extra->vecLenA;
+    const size_t blocks = ((kargs->N - 1)/ (BLOCKSIZE*VEC_LEN)) + 1;
+
+    threads[0] = blocks * BLOCKSIZE;
+    threads[1] = 1;
+
+#ifdef DEBUG_AMAX
+    // Values are narrowed explicitly because the format uses %d.
+    printf("calcNrThreads called from amax.cpp\n");
+    printf("blocks : %d\n", (int)blocks);
+    printf("pgran-wgSize[0] : %d, globalthreads[0]  : %d\n", (int)pgran->wgSize[0], (int)threads[0]);
+#endif
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Generate the OpenCL source of the AMAX kernel into buf.
+ *
+ * When buf is NULL the call is a size query and the fixed buffer size
+ * (32 KiB) is returned. Otherwise the iamax_kernel template is expanded by
+ * kprintf into buf, using the data type prefix, the vector length and the
+ * KEXTRA_NO_COPY_VEC_A flag taken from the CLBLASKernExtra passed through
+ * extra, and pgran->wgSize[0] as the block size.
+ *
+ * Returns the fixed buffer size in both cases (not the generated length,
+ * see the FIXME above). subdims is not used and buflen is not consulted.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    size_t BLOCKSIZE  = pgran->wgSize[0];
+    char tempTemplate[32*1024];
+
+    if ( buf == NULL)
+    {
+        // Size query only.
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    unsigned int vecLenA = kextra->vecLenA;
+
+    // Without an aligned data pointer the kernel has to use vload/vstore.
+    bool doVLOAD = ( kextra->flags &  KEXTRA_NO_COPY_VEC_A ) ? true : false;
+
+#ifdef DEBUG_AMAX
+    printf("AMAX GENERATOR called....\n");
+    printf("dataType : %c\n", Prefix[kextra->dtype]);
+    printf("Vector length used : %d\n\n", vecLenA);
+    if( doVLOAD )
+    {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else
+    {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+#endif
+
+    strcpy( tempTemplate, (char*)iamax_kernel );
+    kprintf kobj( Prefix[kextra->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+ * Kernel arguments set below, in order:
+ *   0: blasArgs->B           (buffer)
+ *   1: blasArgs->D           (buffer)
+ *   2: blasArgs->N           (size)
+ *   3: blasArgs->offb        (size)
+ *   4: blasArgs->ldb.vector  (increment, passed as cl_int)
+ * The third parameter is unused.
+ *
+ * NOTE(review): the kernel prototype previously quoted at this place,
+ *   __kernel void %PREFIXiamax_kernel( __global %TYPE *_X, __global %TYPE _scratchBuf, __global %TYPE *_iMax,
+ *                                      uint N, uint offx, int incx, uint offiMax )
+ * lists seven parameters while only five are assigned here - confirm
+ * against iamax.clT.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    cl_int incx = blasArgs->ldb.vector;
+
+    // Buffers.
+    INIT_KARG(&args[0], blasArgs->B);
+    INIT_KARG(&args[1], blasArgs->D);
+
+    // Sizes and offsets.
+    initSizeKarg(&args[2], blasArgs->N);
+    initSizeKarg(&args[3], blasArgs->offb);
+
+    // Increment.
+    INIT_KARG(&args[4], incx);
+}
+
+/** Adds a work-group size indicator to the kernelKey, so that a different
+    kernel is generated whenever the work-group size changes. This is needed
+    because the reduction loop is unrolled in kprintf based on the work-group
+    size.
+
+    The bwidth member of SubproblemDim is used to carry the work-group size
+    of the current kernel. SubproblemDim is a member of kernelKey, so the
+    value becomes part of the key and the kernel cache is managed accordingly.
+
+    args must point to the CLBlasKargs embedded in a SolutionStep (its
+    "args" member); extra is not used.
+**/
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)args;
+    SolutionStep *owner = container_of(blasArgs, args, SolutionStep);
+
+    DUMMY_ARG_USAGE(extra);
+
+    // Total number of work-items in one work-group.
+    subdims->bwidth = (owner->pgran.wgSize[0]) * (owner->pgran.wgSize[1]);
+}
+
diff --git a/src/library/blas/gens/init.h b/src/library/blas/gens/init.h
new file mode 100644
index 0000000..8f5ac66
--- /dev/null
+++ b/src/library/blas/gens/init.h
@@ -0,0 +1,159 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Declarations of the generator initialization functions
+ */
+
+#ifndef INIT_H_
+#define INIT_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void
+initGemvPattern(MemoryPattern *mempat);
+
+void
+InitGEMMCachedBlockPattern(MemoryPattern *mempat);
+
+void
+InitGEMMCachedSubgroupPattern(MemoryPattern *mempat);
+
+void
+initGemmLdsPattern(MemoryPattern *mempat);
+
+void
+initGemmImgPattern(MemoryPattern *mempat);
+
+void
+initTrmmCachedBlockPattern(MemoryPattern *mempat);
+
+void
+initTrmmCachedSubgroupPattern(MemoryPattern *mempat);
+
+void
+initTrmmLdsPattern(MemoryPattern *mempat);
+
+void
+initTrmmImgPattern(MemoryPattern *mempat);
+
+void
+initTrsmLdsPattern(MemoryPattern *mempat);
+
+void
+initTrsmImgPattern(MemoryPattern *mempat);
+
+void
+initTrsmCachedPattern(MemoryPattern *mempat);
+
+void
+initTrsmLdsLessCachedPattern(MemoryPattern *mempat);
+
+void
+initSyr2kBlockPattern(MemoryPattern *mempat);
+
+void
+initSyr2kSubgPattern(MemoryPattern *mempat);
+
+void
+initSyrkBlockPattern(MemoryPattern *mempat);
+
+void
+initSyrkSubgPattern(MemoryPattern *mempat);
+
+void
+initSymvPattern(MemoryPattern *mempat);
+
+void
+initTrmvRegisterPattern(MemoryPattern *mempat);
+
+void
+initTrsvDefaultPattern(MemoryPattern *mempat);
+
+void
+initTrsvGemvDefaultPattern(MemoryPattern *mempat);
+
+void
+initSymmDefaultPattern(MemoryPattern *mempat);
+
+void
+initGerRegisterPattern(MemoryPattern *mempat);
+
+void
+initSyrDefaultPattern(MemoryPattern *mempat);
+
+void
+initSyr2DefaultPattern(MemoryPattern *mempat);
+
+void
+initHerDefaultPattern(MemoryPattern *mempat);
+
+void
+initHer2DefaultPattern(MemoryPattern *mempat);
+
+void
+initGemmV2CachedPattern(MemoryPattern *mempat);
+
+void
+initGemmV2TailCachedPattern(MemoryPattern *mempat);
+
+void
+initGbmvRegisterPattern(MemoryPattern *mempat);
+
+void
+initSwapRegisterPattern(MemoryPattern *mempat);
+
+void
+initScalRegisterPattern(MemoryPattern *mempat);
+
+void
+initCopyRegisterPattern(MemoryPattern *mempat);
+
+void
+initAxpyRegisterPattern(MemoryPattern *mempat);
+
+void
+initDotRegisterPattern(MemoryPattern *mempat);
+
+void
+initReductionRegisterPattern(MemoryPattern *mempat);
+
+void
+initRotgRegisterPattern(MemoryPattern *mempat);
+
+void
+initRotmgRegisterPattern(MemoryPattern *mempat);
+
+void
+initRotmRegisterPattern(MemoryPattern *mempat);
+
+void
+initiAmaxRegisterPattern(MemoryPattern *mempat);
+
+void
+initNrm2RegisterPattern(MemoryPattern *mempat);
+
+void
+initAsumRegisterPattern(MemoryPattern *mempat);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* INIT_H_ */
diff --git a/src/library/blas/gens/kprintf.cpp b/src/library/blas/gens/kprintf.cpp
new file mode 100644
index 0000000..54772fa
--- /dev/null
+++ b/src/library/blas/gens/kprintf.cpp
@@ -0,0 +1,2435 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <kprintf.hpp>
+
+static const char *types[] = {  // float/double type names; scanned by findType()
+"float", "float2", "float3", "float4", "float8", "float16",
+      "double", "double2", "double3", "double4", "double8", "double16"
+};
+
+static const char*vloadTypes[] = {  // names accepted by findTypeVLOAD(); "vload" is the suffix-less entry
+    "vload", "vload2", "vload3", "vload4", "vload8", "vload16"
+};
+
+static const char*vstoreTypes[] = {  // names accepted by findTypeVSTORE(); "vstore" is the suffix-less entry
+    "vstore", "vstore2", "vstore3", "vstore4", "vstore8", "vstore16"
+};
+
+static const char *vecIndices[] = {  // component selectors (no dot), indexed by component number 0..15
+    "S0", "S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9",
+    "SA", "SB", "SC", "SD", "SE", "SF"
+};
+
+static const char *vecIndicesWithDot[] = {  // same selectors as vecIndices, with a leading '.'
+    ".S0", ".S1", ".S2", ".S3", ".S4", ".S5", ".S6", ".S7", ".S8", ".S9",
+    ".SA", ".SB", ".SC", ".SD", ".SE", ".SF"
+};
+
+static const char *vecComplexIndicesWithDot[] = {  // two-component selectors; presumably one (re,im) pair per entry - TODO confirm
+    ".s01", ".s23", ".s45", ".s67", ".s89", ".sAB", ".sCD", ".sEF"
+};
+
+static const char *vectorWidthTypes[] = {  // width strings accepted by findVectorWidthType(); note "6" is listed
+    "1", "2", "3", "4", "6", "8", "16"
+};
+
+static const char *numbers[] = {  // decimal strings "0".."16", indexed by value
+  "0", "1", "2", "3", "4" , "5", "6" ,"7", "8", "9", "10", "11", "12", "13", "14", "15", "16"
+};
+
+//#define MUL_SCALAR_UNROLL
+//#define DIV_SCALAR_UNROLL
+
+
+kprintf::fmt_t kprintf::get(const char *key)
+{
+    //
+    // Longest-prefix lookup: among the registered entries whose key is a
+    // prefix of "key", return the one with the longest key (the first such
+    // entry wins on ties). When nothing matches, the returned entry has a
+    // NULL key and a NULL value.
+    //
+    struct fmt best;
+    best.key = NULL;
+    best.value = NULL;
+
+    const int wantedLen = (int)strlen(key);
+    int bestLen = -1;
+
+    for (std::vector<struct fmt>::iterator it = v.begin(); it != v.end(); ++it)
+    {
+        const int candidateLen = (int)strlen(it->key);
+        const bool fits = (candidateLen <= wantedLen);
+
+        if (fits && (candidateLen > bestLen) && (strncmp(key, it->key, candidateLen) == 0))
+        {
+            best = *it;
+            bestLen = candidateLen;
+        }
+    }
+    return best;
+}
+
+
+const char * kprintf::findType(char *type)
+{
+    // Returns the entry of the "types" table that equals "type", or NULL
+    // when the name is not one of the listed float/double type names.
+    const size_t count = sizeof(types) / sizeof(types[0]);
+
+    for (size_t idx = 0; idx != count; ++idx)
+    {
+        const char *candidate = types[idx];
+        if (!strcmp(type, candidate))
+        {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+const char * kprintf::findVectorWidthType(char *type)
+{
+    // Returns the entry of "vectorWidthTypes" that equals "type", or NULL
+    // when the string is not one of the listed widths.
+    const size_t count = sizeof(vectorWidthTypes) / sizeof(vectorWidthTypes[0]);
+
+    for (size_t idx = 0; idx != count; ++idx)
+    {
+        const char *candidate = vectorWidthTypes[idx];
+        if (!strcmp(type, candidate))
+        {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+const char *kprintf::findTypeVLOAD(char *type)
+{
+    // Returns the entry of "vloadTypes" that equals "type", or NULL when
+    // the name is not in that table.
+    const size_t count = sizeof(vloadTypes) / sizeof(vloadTypes[0]);
+
+    for (size_t idx = 0; idx != count; ++idx)
+    {
+        const char *candidate = vloadTypes[idx];
+        if (!strcmp(type, candidate))
+        {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+const char *kprintf::findTypeVSTORE(char *type)
+{
+    // Returns the entry of "vstoreTypes" that equals "type", or NULL when
+    // the name is not in that table.
+    const size_t count = sizeof(vstoreTypes) / sizeof(vstoreTypes[0]);
+
+    for (size_t idx = 0; idx != count; ++idx)
+    {
+        const char *candidate = vstoreTypes[idx];
+        if (!strcmp(type, candidate))
+        {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+void kprintf::generateVecSuffix(char *p, int n)
+{
+    //
+    // Writes n as a one- or two-digit decimal string into p and terminates
+    // it, so p must have room for 3 chars. n == 1 yields "1", not an empty
+    // string. Values outside 0..99 are not validated.
+    //
+    char *out = p;
+
+    if (n >= 10)
+    {
+        *out++ = (char)('0' + (n/10));
+        *out++ = (char)('0' + (n%10));
+    }
+    else
+    {
+        *out++ = (char)('0' + n);
+    }
+    *out = 0;
+}
+
+void kprintf::registerType(const char *baseType, int vecWidth, int internalVecWidth)
+{
+    //
+    // Registers the type-related substitution keys for one kernel flavour:
+    //   %BASEWIDTH          "1" when internalVecWidth == 1, otherwise "2"
+    //   %TYPE               baseType as given
+    //   %TYPE%V, %PTYPE%V   vector types built from baseType (a trailing '2'
+    //                       is stripped first) and the vector widths
+    //   reduced / super types, %V and the %MAK?V place-holders
+    // Throws -1 when vecWidth > 1 and a derived name is not in the type table.
+    //
+    char widthSuffix[3], pWidthSuffix[3];
+    char vecTypeName[9], pTypeName[9];
+    const char *found;
+
+    vectorWidth = vecWidth;
+    if (internalVecWidth != 1)
+    {
+        s_or_v = VECTOR;
+        effectiveVectorWidthOnBaseType = vecWidth*internalVecWidth;
+        put("%BASEWIDTH", "2");
+    }
+    else
+    {
+        s_or_v = SCALAR;
+        effectiveVectorWidthOnBaseType = vecWidth;
+        put("%BASEWIDTH", "1");
+    }
+
+    widthSuffix[0] = 0;
+    widthSuffix[1] = 0;
+    pWidthSuffix[0] = 0;
+    pWidthSuffix[1] = 0;
+
+    put("%TYPE", baseType);
+    BASE = baseType;
+
+    // Work on a copy of the base name with a trailing '2' removed.
+    strcpy(vecTypeName, baseType);
+    const size_t lastIdx = strlen(vecTypeName) - 1;
+    if (vecTypeName[lastIdx] == '2')
+    {
+        vecTypeName[lastIdx] = '\0';
+    }
+    strcpy(pTypeName, vecTypeName);
+
+    if (vecWidth <= 1)
+    {
+        put("%TYPE%V", baseType);
+        found = findType(pTypeName);
+        put("%PTYPE%V", found);
+        DERIVED = baseType;
+    }
+    else
+    {
+        generateVecSuffix(widthSuffix, effectiveVectorWidthOnBaseType);
+        generateVecSuffix(pWidthSuffix, vecWidth);
+        strcat(vecTypeName, widthSuffix);
+        strcat(pTypeName, pWidthSuffix);
+
+        found = findType(vecTypeName);
+        if (found == NULL)
+        {
+            std::cout << "kprint() constructor: Invalid vector width specified" << std::endl;
+            throw -1;
+        }
+        put("%TYPE%V", found );
+        DERIVED = found;
+
+        found = findType(pTypeName);
+        if (found == NULL)
+        {
+            std::cout << "kprint() constructor: Invalid vector width specified" << std::endl;
+            throw -1;
+        }
+        put("%PTYPE%V", found );
+    }
+
+    //
+    // Register HALF (%HV), QUARTER(%QV), HALF_QUARTER(%OV) types, then the
+    // corresponding super (x2, x4, x8) types.
+    //
+    static const int factors[] = { 2, 4, 8 };
+    const size_t numFactors = sizeof(factors) / sizeof(factors[0]);
+    struct fmt vecEntry = get("%TYPE%V");
+
+    for (size_t k = 0; k < numFactors; k++)
+    {
+        registerReducedTypes(vecEntry.value, factors[k]);
+    }
+    for (size_t k = 0; k < numFactors; k++)
+    {
+        registerSuperTypes(vecEntry.value, factors[k]);
+    }
+
+    HALFWORD = get("%TYPE%HV").value;
+    QUARTERWORD = get("%TYPE%QV").value;
+    HALFQUARTERWORD  = get("%TYPE%OV").value;
+
+    registerVectorWidth();
+
+    // Register MakeVector : V, HV, QV, OV
+    put("%MAKEV", NULL);
+    put("%MAKHV", NULL);
+    put("%MAKQV", NULL);
+    put("%MAKOV", NULL);
+}
+
+void kprintf::registerReducedTypes( const char* in, int div)
+{
+    //
+    // Registers the type whose width is the effective vector width divided
+    // by div, together with its width key:
+    //   div == 2 -> %TYPE%HV / %HV, div == 4 -> %TYPE%QV / %QV,
+    //   anything else -> %TYPE%OV / %OV.
+    // "in" only selects float vs double, via its fifth character.
+    // When the division yields 0, the type key is registered as the string
+    // "NULL" and the width key is not registered.
+    //
+    const char *typeKey;
+    const char *widthKey;
+    switch (div)
+    {
+        case 2:
+            typeKey = "%TYPE%HV";
+            widthKey = "%HV";
+            break;
+        case 4:
+            typeKey = "%TYPE%QV";
+            widthKey = "%QV";
+            break;
+        default:
+            typeKey = "%TYPE%OV";
+            widthKey = "%OV";
+            break;
+    }
+
+    const int reducedWidth = effectiveVectorWidthOnBaseType / div;
+    if (reducedWidth == 0)
+    {
+        put(typeKey, "NULL");
+        return;
+    }
+
+    // A width of 1 maps to the plain scalar name, i.e. no suffix at all.
+    char suffix[3] = {0};
+    const bool hasSuffix = (reducedWidth > 1);
+    if (hasSuffix)
+    {
+        generateVecSuffix( suffix, reducedWidth);
+    }
+
+    char typeName[9] = {0};
+    strcpy( typeName, (in[4] == 't') ? "float" : "double");
+    strcat( typeName, suffix);
+
+    put( typeKey, findType(typeName));
+    put( widthKey, hasSuffix ? findVectorWidthType(suffix) : "1");
+}
+
+void kprintf::registerSuperTypes( const char* in, int mul)
+{
+    //
+    // Registers the type whose width is the effective vector width times
+    // mul, together with its width key:
+    //   mul == 2 -> %TYPE%DV / %DV, mul == 4 -> %TYPE%QUADV / %QUADV,
+    //   anything else -> %TYPE%OCTAV / %OCTAV.
+    // "in" only selects float vs double, via its fifth character.
+    // Widths above 16 register the type key as the string "NULL" and leave
+    // the width key unregistered. Either lookup may yield NULL, which is
+    // passed to put() unchanged.
+    //
+    const char *typeKey;
+    const char *widthKey;
+    switch (mul)
+    {
+        case 2:
+            typeKey = "%TYPE%DV";
+            widthKey = "%DV";
+            break;
+        case 4:
+            typeKey = "%TYPE%QUADV";
+            widthKey = "%QUADV";
+            break;
+        default:
+            typeKey = "%TYPE%OCTAV";
+            widthKey = "%OCTAV";
+            break;
+    }
+
+    const int superWidth = effectiveVectorWidthOnBaseType * mul;
+    if (superWidth > 16)
+    {
+        put(typeKey, "NULL");
+        return;
+    }
+
+    // The suffix stays empty unless the widened type really is a vector.
+    char suffix[3] = {0};
+    if (superWidth > 1)
+    {
+        generateVecSuffix( suffix, superWidth);
+    }
+
+    char typeName[9] = {0};
+    strcpy( typeName, (in[4] == 't') ? "float" : "double");
+    strcat( typeName, suffix);
+
+    put( typeKey, findType(typeName));
+    put( widthKey, findVectorWidthType(suffix));
+}
+
+char* kprintf::mystrtok( char* in, const char* tok)
+{
+    //
+    // Parenthesis-aware tokenizer used to pick apart "%NAME( a, b, c)".
+    //
+    // in != NULL : start a new scan. The text up to the first '(' is
+    //              terminated in place and returned; scanning resumes just
+    //              after that '(' with the nesting counter set to 1.
+    // in == NULL : return the next token, which ends at the first character
+    //              found in "tok". A '(' only raises the nesting counter. A
+    //              ')' lowers it and ends a token only when it reaches 0.
+    //              Any other delimiter ends the token at every nesting
+    //              depth.
+    //
+    // The input buffer is modified (delimiters are overwritten with NUL) and
+    // the scan position lives in the members strtokPtr / strtokCount.
+    //
+    char* last;
+    if ( in ) // in is not NULL
+    {
+        last = in;
+        // Initialize strtokPtr
+        strtokPtr =  in;
+
+        // Look for '(' but never walk past the terminating NUL: input
+        // without an opening parenthesis must not run the scan off the buffer.
+        while( *strtokPtr != '\0' && *strtokPtr != '(')
+        {
+            strtokPtr++;
+        }
+
+        if ( *strtokPtr == '(')
+        {
+            *strtokPtr = '\0';
+            strtokPtr++;
+        }
+        strtokCount = 1;
+    }
+    else
+    {
+        last = strtokPtr;
+        // Hoisted out of the loops; "i < numDelims" is also safe for an
+        // empty delimiter set, where "strlen(tok) - 1" would wrap around.
+        const size_t numDelims = strlen(tok);
+
+        // Look for tokens other than '('
+        while(strtokPtr[0])
+        {
+            bool tokenFound = false;
+            for( size_t i=0 ; i < numDelims; i++)
+            {
+                if (*strtokPtr == tok[i])
+                {
+                    if ( tok[i] == '(')
+                    {
+                        strtokCount++;
+                        continue;
+                    }
+                    else if ( tok[i] == ')')
+                    {
+                        strtokCount--;
+                        if ( strtokCount != 0)
+                        {
+                            continue;
+                        }
+                    }
+
+                    // Token matched
+                    *strtokPtr = '\0';
+                    tokenFound = true;
+                    break;
+                }
+            }
+
+            if ( tokenFound)
+            {
+                strtokPtr++;
+                break;
+            }
+
+            strtokPtr++;
+        }
+    }
+    return last;
+}
+//
+// VLOAD %TYPE%V from (%PTYPE*) kind of memory locations
+// The Kernel writers should use "%TYPE" and "%TYPE%V" for kernel arguments, local variables etc..
+// However, while loading using %VLOAD, they should cast the pointers as "%PTYPE *" because
+// VLOADn imposes certain restrictions.
+// Having the pointers as %TYPE and %TYPE%V relieves us from address calculations for primitives
+// which are vectors (like float2, double2 etc..)
+//
+void kprintf::registerVLOAD()
+{
+    //
+    // Registers %VLOAD as "vload<N>", N being the effective vector width on
+    // the base type. Names missing from the vload table are reported on
+    // stderr and nothing is registered.
+    // NOTE(review): unlike registerVSTORE, the suffix is appended for width 1
+    // as well, giving "vload1", which is not in the table - confirm intent.
+    //
+    char suffix[3] = {0};
+    char loadName[9] = {0};
+
+    generateVecSuffix( suffix, effectiveVectorWidthOnBaseType);
+    strcpy( loadName, "vload");
+    strcat( loadName, suffix);
+
+    const char *match = findTypeVLOAD(loadName);
+    if (match == NULL)
+    {
+        std::cerr << "registerVLOAD: " << loadName << " not a valid VLOAD type" << std::endl;
+        return;
+    }
+    put( "%VLOAD", match);
+}
+
+void kprintf::registerVSTORE(void)
+{
+    //
+    // Registers %VSTORE_VALUE as "vstore<N>", N being the effective vector
+    // width on the base type. For a width of 1 the bare name "vstore" is
+    // used. Names missing from the vstore table are reported on stderr and
+    // nothing is registered.
+    //
+    char suffix[3] = {0};
+    char storeName[9] = {0};
+
+    generateVecSuffix( suffix, effectiveVectorWidthOnBaseType);
+    strcpy( storeName, "vstore");
+    if (effectiveVectorWidthOnBaseType > 1)
+    {
+        strcat( storeName, suffix);
+    }
+
+    const char *match = findTypeVSTORE(storeName);
+    if (match == NULL)
+    {
+        std::cerr << "registerVSTORE: " << storeName << " not a valid VSTORE type" << std::endl;
+        return;
+    }
+    put( "%VSTORE_VALUE", match);
+}
+
+void kprintf::registerVectorWidth()
+{
+    //
+    // Registers %V as the decimal string of vectorWidth. Widths that are
+    // not listed in vectorWidthTypes are reported on stderr and %V is left
+    // unregistered.
+    //
+    const char *string;
+    char vecSuffix[3] = {0};
+    generateVecSuffix( vecSuffix, vectorWidth);
+    string = findVectorWidthType(vecSuffix);
+    if (string != NULL)
+    {
+         put( "%V", string);
+
+    } else {
+        // "string" is NULL on this path and streaming a null char pointer is
+        // undefined behaviour, so report the width text that failed lookup.
+        std::cerr << "registerVectorWidth: " << vecSuffix << " not a valid Vector Width size" << std::endl;
+    }
+}
+
+void kprintf::handleMakeVector(char **_src, char **_dst, int div)
+{
+    //
+    // Expands a "%MAK?V(id)" style directive found at *_src into a vector
+    // literal "(type)( id, id, ...)" written at *_dst.
+    //   div == 0      : literal of the base type. One "id" for real types
+    //                   and two for complex ones (s_or_v == VECTOR). An 'f'
+    //                   is appended to id when the base type is float/float2.
+    //   div == 1/2/4  : literal of DERIVED / HALFWORD / QUARTERWORD, any
+    //                   other value selects HALFQUARTERWORD. "id" is
+    //                   repeated vectorWidth/div times (at least once).
+    // On return *_src points past the directive and *_dst past the output.
+    //
+    char id[256];
+    char *out = *_dst;
+
+    mystrtok( *_src, "()");              // cut off the directive name
+    char *arg = mystrtok( NULL, "()");   // Get ID
+    strcpy( id, arg);
+    *_src = arg + strlen(arg) + 1;
+
+    if ( div == 0 ) // Scalar Case
+    {
+        const bool singlePrecision = (strcmp( BASE,"float") == 0) || (strcmp( BASE,"float2") == 0);
+
+        out += sprintf(out,"(%s)(", BASE);
+
+        if ( s_or_v == VECTOR)
+        {
+            if ( singlePrecision)
+            {
+                out += sprintf(out," %s%c,", id, 'f');
+            }
+            else
+            {
+                out += sprintf(out," %s,", id);
+            }
+        }
+
+        if ( singlePrecision)
+        {
+            out += sprintf(out," %s%c)", id,'f');
+        }
+        else
+        {
+            out += sprintf(out," %s)", id);
+        }
+        *_dst = out;
+        return;
+    }
+
+    const char *vecType;
+    switch (div)
+    {
+        case 1:
+            vecType = DERIVED;
+            break;
+        case 2:
+            vecType = HALFWORD;
+            break;
+        case 4:
+            vecType = QUARTERWORD;
+            break;
+        default:
+            vecType = HALFQUARTERWORD;
+            break;
+    }
+    out += sprintf(out,"(%s)(", vecType);
+
+    // All but the last element are followed by a comma.
+    for( int remaining = vectorWidth / div; remaining > 1; --remaining)
+    {
+        out += sprintf(out," %s,", id);
+    }
+    out += sprintf(out," %s)", id);
+    *_dst = out;
+}
+
+void kprintf::handleMUL(char **_src, char **_dst, bool vmul)
+{
+    //
+    // Expands "%MUL( C, A, B)" / "%VMUL( C, A, B)" found at *_src into code
+    // for C = A * B written at *_dst. Real types get a plain product with
+    // no trailing ';'. Complex types get a complex product expressed on
+    // the .even/.odd halves (or component-wise with MUL_SCALAR_UNROLL).
+    // The three operand names must be pairwise different, otherwise -1 is
+    // thrown. On return *_src points past the directive and *_dst past the
+    // output.
+    //
+    char prod[256], lhs[256], rhs[256];
+    char *arg;
+    char *out = *_dst;
+    int vwidth = 1;
+
+    mystrtok( *_src, "(,)");             // cut off the directive name
+    arg = mystrtok( NULL, "(,)");        // Get first ID
+    strcpy( prod, arg);
+    arg = mystrtok( NULL, "(,)");        // Get second ID
+    strcpy( lhs, arg);
+    arg = mystrtok( NULL, "(,)");        // Get third ID
+    strcpy( rhs, arg);
+    *_src = arg + strlen(arg) + 1;
+
+    if ( !strcmp(prod, lhs) || !strcmp(prod, rhs) || !strcmp(lhs, rhs) )
+    {
+        if (vmul)
+        {
+            std::cout << "%VMUL( C, A, B) : C , A and B have to be UNIQUE" << std::endl;
+        } else {
+            std::cout << "%MUL( C, A, B) : C , A and B have to be UNIQUE" << std::endl;
+        }
+        throw -1;
+    }
+
+    switch(s_or_v)
+    {
+        case SCALAR:
+            out += sprintf(out, "%s = %s * %s", prod, lhs, rhs);
+            break;
+
+        case VECTOR:
+            // Only consumed by the component-wise variant below.
+            vwidth = vmul ? vectorWidth : 1;
+#ifdef MUL_SCALAR_UNROLL
+            for(int c = 0; c < vwidth; c++)
+            {
+                const char *re = vecIndices[c*2];
+                const char *im = vecIndices[c*2 + 1];
+
+                out += sprintf(out, "%s.%s = (((%s.%s)*(%s.%s)) -( (%s.%s)*(%s.%s)));\n",
+                               prod, re, lhs, re, rhs, re, lhs, im, rhs, im);
+                out += sprintf(out, "%s.%s = (((%s.%s)*(%s.%s)) + ( (%s.%s)*(%s.%s)));\n",
+                               prod, im, lhs, re, rhs, im, lhs, im, rhs, re);
+            }
+#else
+            //
+            // Vector form: real parts live in .even, imaginary parts in .odd
+            //
+            out += sprintf( out, "%s.even = ((%s.even) * (%s.even)) - ((%s.odd) * (%s.odd));\n",
+                            prod, lhs, rhs, lhs, rhs);
+            out += sprintf( out, "%s.odd = ((%s.even) * (%s.odd)) + ((%s.odd) * (%s.even));\n",
+                            prod, lhs, rhs, lhs, rhs);
+#endif
+            break;
+
+        default:
+            std::cout << "handleMUL: s_or_v is neither scalar nor a vector" << std::endl;
+            throw -1;
+    }
+    *_dst = out;
+}
+
+void kprintf::handleMAD(char **_src, char **_dst, bool vmul)
+{
+    //
+    // Expands "%MAD( C, A, B)" / "%VMAD( C, A, B)" found at *_src into code
+    // for C += A * B written at *_dst. Real types emit a mad() call (or
+    // "+=" under ACCURACY_OVER_SPEED) with no trailing ';'. Complex types
+    // emit one of several equivalent forms chosen by the preprocessor
+    // switches below. The three operand names must be pairwise different,
+    // otherwise -1 is thrown. On return *_src points past the directive and
+    // *_dst past the output.
+    //
+    char acc[256], lhs[256], rhs[256];
+    char *arg;
+    char *out = *_dst;
+    int vwidth = 1;
+
+    mystrtok( *_src, "(,)");             // cut off the directive name
+    arg = mystrtok( NULL, "(,)");        // Get first ID
+    strcpy( acc, arg);
+    arg = mystrtok( NULL, "(,)");        // Get second ID
+    strcpy( lhs, arg);
+    arg = mystrtok( NULL, "(,)");        // Get third ID
+    strcpy( rhs, arg);
+    *_src = arg + strlen(arg) + 1;
+
+    if ( !strcmp(acc, lhs) || !strcmp(acc, rhs) || !strcmp(lhs, rhs) )
+    {
+        if (vmul)
+        {
+            std::cout << "%VMAD( C, A, B) : C , A and B have to be UNIQUE" << std::endl;
+        } else {
+            std::cout << "%MAD( C, A, B) : C , A and B have to be UNIQUE" << std::endl;
+        }
+        throw -1;
+    }
+
+    switch(s_or_v)
+    {
+        case SCALAR:
+            #ifdef ACCURACY_OVER_SPEED
+            out += sprintf(out, "%s += %s * %s", acc, lhs, rhs);
+            #else
+            //
+            // mad() based form - No much difference seen for SGEMM.
+            // Need to check for DGEMM
+            //
+            out += sprintf(out, "%s = mad(%s,%s,%s)", acc, lhs, rhs, acc);
+            #endif
+            break;
+
+        case VECTOR:
+            // Only consumed by the component-wise variant below.
+            vwidth = vmul ? vectorWidth : 1;
+#ifdef MUL_SCALAR_UNROLL
+            for(int c = 0; c < vwidth; c++)
+            {
+                const char *re = vecIndices[c*2];
+                const char *im = vecIndices[c*2 + 1];
+
+                out += sprintf(out, "%s.%s = %s.%s + (((%s.%s)*(%s.%s)) -( (%s.%s)*(%s.%s)));\n",
+                               acc, re, acc, re, lhs, re, rhs, re, lhs, im, rhs, im);
+                out += sprintf(out, "%s.%s = %s.%s + (((%s.%s)*(%s.%s)) + ( (%s.%s)*(%s.%s)));\n",
+                               acc, im, acc, im, lhs, re, rhs, im, lhs, im, rhs, re);
+            }
+#else
+            //
+            // Vector form: real parts live in .even, imaginary parts in .odd
+            //
+            #define COMPLEX_MUL_ADD
+            #ifdef COMPLEX_MUL_ADD
+                out += sprintf( out, "%s.even = %s.even + ((%s.even) * (%s.even)) - ((%s.odd) * (%s.odd));\n",
+                                acc, acc, lhs, rhs, lhs, rhs);
+                out += sprintf( out, "%s.odd = %s.odd + ((%s.even) * (%s.odd)) + ((%s.odd) * (%s.even));\n",
+                                acc, acc, lhs, rhs, lhs, rhs);
+            #else
+            #define COMPLEX_MAD_USING_LOCAL_VARIABLES
+            #ifdef COMPLEX_MAD_USING_LOCAL_VARIABLES
+                out += sprintf(out, "\n{ %s id2even = %s.even, id2odd = %s.odd, id3even = %s.even, id3odd = %s.odd;\n\t",
+                               HALFWORD, lhs, lhs, rhs, rhs);
+                out += sprintf( out, "%s.even = mad(id2even, id3even, %s.even);\n\t", acc, acc);
+                out += sprintf( out, "%s.odd  = mad(id2even, id3odd, %s.odd);\n\t", acc, acc);
+                out += sprintf( out, "%s.even = mad(id2odd, -id3odd, %s.even);\n\t", acc, acc);
+                out += sprintf( out, "%s.odd  = mad(id2odd, id3even, %s.odd);\n", acc, acc);
+                out += sprintf(out, "}\n");
+            #else
+                out += sprintf( out, "%s.even = mad(%s.even, %s.even, %s.even);\n\t", acc, lhs, rhs, acc);
+                out += sprintf( out, "%s.even = mad(%s.odd, -%s.odd, %s.even);\n\t", acc, lhs, rhs, acc);
+                out += sprintf( out, "%s.odd  = mad(%s.even, %s.odd, %s.odd);\n\t", acc, lhs, rhs, acc);
+                out += sprintf( out, "%s.odd  = mad(%s.odd, %s.even, %s.odd);\n", acc, lhs, rhs, acc);
+            #endif
+#endif
+#endif
+            break;
+
+        default:
+            std::cout << "handleMAD: s_or_v is neither scalar nor a vector" << std::endl;
+            throw -1;
+    }
+    *_dst = out;
+}
+
+void kprintf::handleVMAD_AND_REDUCE(char **_src, char **_dst)
+{
+    //
+    // Expands "%VMAD_AND_REDUCE( C, A, B)" found at *_src into a chain of
+    // mad() statements written at *_dst: every component (real types) or
+    // every (re,im) pair (complex types) of A and B is multiplied and
+    // accumulated into the single element C. The three operand names must
+    // be pairwise different, otherwise -1 is thrown. On return *_src points
+    // past the directive and *_dst past the output.
+    //
+    char acc[256], lhs[256], rhs[256];
+    char *arg;
+    char *out = *_dst;
+
+    mystrtok( *_src, "(,)");             // cut off the directive name
+    arg = mystrtok( NULL, "(,)");        // Get first ID
+    strcpy( acc, arg);
+    arg = mystrtok( NULL, "(,)");        // Get second ID
+    strcpy( lhs, arg);
+    arg = mystrtok( NULL, "(,)");        // Get third ID
+    strcpy( rhs, arg);
+    *_src = arg + strlen(arg) + 1;
+
+    if ( !strcmp(acc, lhs) || !strcmp(acc, rhs) || !strcmp(lhs, rhs) )
+    {
+        std::cout << "%VMAD_AND_REDUCE( C, A, B) : C , A and B have to be UNIQUE" << std::endl;
+        throw -1;
+    }
+
+    const bool singleElement = (vectorWidth == 1);
+
+    switch(s_or_v)
+    {
+        case SCALAR:
+            if (singleElement)
+            {
+                out += sprintf(out, "%s = mad(%s,%s,%s);\n\t", acc, lhs, rhs, acc);
+                break;
+            }
+            for(int c = 0; c < vectorWidth; c++)
+            {
+                out += sprintf(out, "%s = mad((%s).%s,(%s).%s,(%s));\n\t",
+                               acc, lhs, vecIndices[c], rhs, vecIndices[c], acc);
+            }
+            break;
+
+        case VECTOR:
+            if (singleElement)
+            {
+                // real part: re*re - im*im
+                out += sprintf(out, "%s.S0 = mad((%s).S0,(%s).S0,%s.S0);\n\t", acc, lhs, rhs, acc);
+                out += sprintf(out, "%s.S0 = mad((%s).S1,-(%s.S1),%s.S0);\n\t", acc, lhs, rhs, acc);
+                // imaginary part: re*im + im*re
+                out += sprintf(out, "%s.S1 = mad((%s).S0,(%s).S1,%s.S1);\n\t", acc, lhs, rhs, acc);
+                out += sprintf(out, "%s.S1 = mad((%s).S1,(%s.S0),%s.S1);\n\t", acc, lhs, rhs, acc);
+                break;
+            }
+            for(int c = 0; c < vectorWidth; c++)
+            {
+                const char *re = vecIndices[2*c];
+                const char *im = vecIndices[2*c + 1];
+
+                out += sprintf(out, "(%s).S0 = mad((%s).%s,(%s).%s,(%s).S0);\n\t", acc, lhs, re, rhs, re, acc);
+                out += sprintf(out, "(%s).S0 = mad((%s).%s,-(%s).%s,(%s).S0);\n\t", acc, lhs, im, rhs, im, acc);
+                out += sprintf(out, "(%s).S1 = mad((%s).%s,(%s).%s,(%s).S1);\n\t", acc, lhs, re, rhs, im, acc);
+                out += sprintf(out, "(%s).S1 = mad((%s).%s,(%s).%s,(%s).S1);\n\t", acc, lhs, im, rhs, re, acc);
+            }
+            break;
+
+        default:
+            std::cout << "handleVMAD_AND_REDUCE: s_or_v is neither scalar nor a vector" << std::endl;
+            throw -1;
+    }
+    *_dst = out;
+}
+
+void kprintf::handleMAD_AND_REDUCE(char **_src, char **_dst)
+{
+    //
+    // Expands "%MAD_AND_REDUCE( C, A, B)" found at *_src into mad()
+    // statements written at *_dst that accumulate the product of the first
+    // element of A and B into C (first component for real types, first
+    // (re,im) pair for complex types). The three operand names must be
+    // pairwise different, otherwise -1 is thrown. On return *_src points
+    // past the directive and *_dst past the output.
+    //
+    char acc[256], lhs[256], rhs[256];
+    char *arg;
+    char *out = *_dst;
+
+    mystrtok( *_src, "(,)");             // cut off the directive name
+    arg = mystrtok( NULL, "(,)");        // Get first ID
+    strcpy( acc, arg);
+    arg = mystrtok( NULL, "(,)");        // Get second ID
+    strcpy( lhs, arg);
+    arg = mystrtok( NULL, "(,)");        // Get third ID
+    strcpy( rhs, arg);
+    *_src = arg + strlen(arg) + 1;
+
+    if ( !strcmp(acc, lhs) || !strcmp(acc, rhs) || !strcmp(lhs, rhs) )
+    {
+        std::cout << "%MAD_AND_REDUCE( C, A, B) : C , A and B have to be UNIQUE" << std::endl;
+        throw -1;
+    }
+
+    switch(s_or_v)
+    {
+        case SCALAR:
+            //
+            // e.g. float += float4*float4
+            //      Only the first vector component takes part
+            //
+            if (vectorWidth != 1)
+            {
+                out += sprintf(out, "%s = mad(%s.%s,%s.%s,%s);\n\t",
+                               acc, lhs, vecIndices[0], rhs, vecIndices[0], acc);
+            } else {
+                out += sprintf(out, "%s = mad(%s,%s,%s);\n\t", acc, lhs, rhs, acc);
+            }
+            break;
+
+        case VECTOR:
+            // real part: re*re - im*im
+            out += sprintf(out, "%s.S0 = mad((%s).S0,(%s).S0,%s.S0);\n\t", acc, lhs, rhs, acc);
+            out += sprintf(out, "%s.S0 = mad((%s).S1,-(%s.S1),%s.S0);\n\t", acc, lhs, rhs, acc);
+            // imaginary part: re*im + im*re
+            out += sprintf(out, "%s.S1 = mad((%s).S0,(%s).S1,%s.S1);\n\t", acc, lhs, rhs, acc);
+            out += sprintf(out, "%s.S1 = mad((%s).S1,(%s.S0),%s.S1);\n\t", acc, lhs, rhs, acc);
+            break;
+
+        default:
+            std::cout << "handleMAD_AND_REDUCE: s_or_v is neither scalar nor a vector" << std::endl;
+            throw -1;
+    }
+    *_dst = out;
+}
+
+void kprintf::handleComplexJoin(char **_src, char **_dst)
+{
+    //
+    // Expands a three-operand "( C, RE, IM)" directive found at *_src into
+    // assignments written at *_dst that interleave RE and IM into C: even
+    // components of C come from RE, odd ones from IM. When the effective
+    // width is 2, RE and IM are used as plain scalars. Wider types take
+    // component i/2 of RE / IM. Real types emit nothing. On return *_src
+    // points past the directive and *_dst past the output.
+    //
+    char joined[256], realPart[256], imagPart[256];
+    char *arg;
+    char *out = *_dst;
+
+    mystrtok( *_src, "(,)");             // cut off the directive name
+    arg = mystrtok( NULL, "(,)");        // Get first ID
+    strcpy( joined, arg);
+    arg = mystrtok( NULL, "(,)");        // Get second ID
+    strcpy( realPart, arg);
+    arg = mystrtok( NULL, "(,)");        // Get third ID
+    strcpy( imagPart, arg);
+    *_src = arg + strlen(arg) + 1;
+
+    switch(s_or_v)
+    {
+        case SCALAR:
+            //
+            // Nothing to emit...ComplexJoin not applicable for Real numbers
+            //
+            break;
+
+        case VECTOR:
+            for(int c = 0; c < effectiveVectorWidthOnBaseType; c++)
+            {
+                const char *source = ((c % 2) == 0) ? realPart : imagPart;
+
+                if (effectiveVectorWidthOnBaseType > 2)
+                {
+                    out += sprintf(out, "%s.%s = %s.%s;\n",
+                                   joined, vecIndices[c], source, vecIndices[c/2]);
+                } else {
+                    out += sprintf(out, "%s.%s = %s;\n",
+                                   joined, vecIndices[c], source);
+                }
+            }
+            break;
+
+        default:
+            std::cout << "handleComplexJoin: s_or_v is neither scalar nor a vector" << std::endl;
+            throw -1;
+    }
+    *_dst = out;
+}
+
+//
+// %DIV( C, A, B) / %VDIV( C, A, B)
+//
+// Reads the three operands from *_src, writes the generated division at *_dst and
+// advances both cursors.
+//   SCALAR : a single "C = A / B" expression.
+//   VECTOR : the division is spelled out per component. The numerator is formed as
+//            (A.even*B.even + A.odd*B.odd) and (-(A.even*B.odd) + A.odd*B.even), and
+//            both parts are then divided by (B.even*B.even + B.odd*B.odd).
+// Throws -1 when the three operands are not pairwise distinct, or when s_or_v is
+// neither SCALAR nor VECTOR.
+//
+void kprintf::handleDIV(char **_src, char **_dst, bool vdiv)
+{
+    char quotient[256], dividend[256], divisor[256];
+    char *out = *_dst;
+    char *tok;
+    int vwidth = 1;
+
+    // Token 0 is the keyword itself; tokens 1..3 are the operands C, A and B.
+    (void) mystrtok( *_src, "(,)");
+    tok = mystrtok( NULL, "(,)");
+    strcpy( quotient, tok);
+    tok = mystrtok( NULL, "(,)");
+    strcpy( dividend, tok);
+    tok = mystrtok( NULL, "(,)");
+    strcpy( divisor, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    const bool allDistinct = (strcmp(quotient, dividend) != 0) &&
+                             (strcmp(quotient, divisor) != 0) &&
+                             (strcmp(dividend, divisor) != 0);
+    if (!allDistinct)
+    {
+        std::cout << (vdiv ? "%VDIV( C, A, B) : C , A and B have to be UNIQUE"
+                           : "%DIV( C, A, B) : C , A and B have to be UNIQUE") << std::endl;
+        throw -1;
+    }
+
+    if (s_or_v == SCALAR)
+    {
+        out += sprintf(out, "%s = %s / %s", quotient, dividend, divisor);
+    }
+    else if (s_or_v == VECTOR)
+    {
+        // Only the scalar-unrolled expansion below depends on the width.
+        vwidth = vdiv ? vectorWidth : 1;
+#ifdef DIV_SCALAR_UNROLL
+        for(int lane = 0; lane < vwidth; lane++)
+        {
+            const int lo = lane*2;        // first component of this element
+            const int hi = lane*2 + 1;    // second component of this element
+
+            out += sprintf(out, "%s.%s = (((%s.%s)*(%s.%s)) + ( (%s.%s)*(%s.%s)));\n",
+                                quotient, vecIndices[lo],
+                                dividend, vecIndices[lo], divisor, vecIndices[lo],
+                                dividend, vecIndices[hi], divisor, vecIndices[hi]);
+
+            out += sprintf(out, "%s.%s = (-((%s.%s)*(%s.%s)) + ( (%s.%s)*(%s.%s)));\n",
+                                quotient, vecIndices[hi],
+                                dividend, vecIndices[lo], divisor, vecIndices[hi],
+                                dividend, vecIndices[hi], divisor, vecIndices[lo]);
+
+            out += sprintf(out, "%s.%s /= ((%s.%s * %s.%s) + (%s.%s * %s.%s));\n",
+                                quotient, vecIndices[lo],
+                                divisor, vecIndices[lo], divisor, vecIndices[lo],
+                                divisor, vecIndices[hi], divisor, vecIndices[hi]);
+
+            out += sprintf(out, "%s.%s /= ((%s.%s * %s.%s) + (%s.%s * %s.%s));\n",
+                                quotient, vecIndices[hi],
+                                divisor, vecIndices[lo], divisor, vecIndices[lo],
+                                divisor, vecIndices[hi], divisor, vecIndices[hi]);
+        }
+#else
+        //
+        // Vector Unroll - work on the .even and .odd halves as whole vectors
+        //
+        out += sprintf( out, "%s.even = ((%s.even) * (%s.even)) + ((%s.odd) * (%s.odd));\n",
+                        quotient, dividend, divisor, dividend, divisor);
+        out += sprintf( out, "%s.odd = -((%s.even) * (%s.odd)) + ((%s.odd) * (%s.even));\n",
+                        quotient, dividend, divisor, dividend, divisor);
+        out += sprintf( out, "%s.even /= (%s.even*%s.even) + (%s.odd*%s.odd) ;\n",
+                        quotient, divisor, divisor, divisor, divisor);
+        out += sprintf( out, "%s.odd /= (%s.even*%s.even) + (%s.odd*%s.odd) ;\n",
+                        quotient, divisor, divisor, divisor, divisor);
+#endif
+    }
+    else
+    {
+        std::cout << "handleDIV: s_or_v is neither scalar nor a vector" << std::endl;
+        throw -1;
+    }
+
+    *_dst = out;
+}
+
+//
+// Expands an aligned data access written as <keyword>( offset, address).
+//
+// When vector loads are disabled (doVLOAD is false) or the effective vector width on the
+// base type is 1, a plain dereference "*((__global <DERIVED>*)(address))" is generated and
+// the offset is dropped. Otherwise "vload<N>( offset, (__global <PTYPE> *)address)" is
+// generated, and the matching VLOAD type is registered under "%VLOAD" when one is found.
+// *_src is advanced past the parsed argument text and *_dst past the generated text.
+// Throws -1 if the argument text does not contain a ',' separator.
+//
+void kprintf::handleAlignedDataAccess(char **_src, char **_dst)
+{
+    int numCharsWritten = 0;
+    char id1[256];      // full argument text: "offset,address"
+    char id2[256];      // copy of id1, truncated to "offset," in the vload case
+    char *ptr, *comma;
+    char *src = *_src;
+    char *dst = *_dst;
+
+    ptr = mystrtok( src, "()");     // Get rid of the keyword
+    ptr = mystrtok( NULL, "()");    // Argument text up to the next parenthesis
+    strcpy( id1, ptr);
+    *_src = ptr + strlen(ptr) + 1;
+    strcpy( id2, id1);
+
+    // Locate the separator with a bounded search: an open-ended scan would walk past the
+    // end of the buffer when the ',' is missing.
+    comma = strchr( id1, ',');
+    if (comma == NULL)
+    {
+        std::cerr << "handleAlignedDataAccess: expected \"offset, address\" but got \"" << id1 << "\"" << std::endl;
+        throw -1;
+    }
+    ptr = comma + 1;                // To skip offset in id1
+
+    if (( ! this->doVLOAD) || (effectiveVectorWidthOnBaseType == 1))
+    {
+        numCharsWritten = sprintf(dst, "*((__global %s*)(%s))", DERIVED, ptr);
+        dst += numCharsWritten;
+    }
+    else
+    {
+        // id2 is a copy of id1, so the comma sits at the same index; keep "offset,"
+        // including the comma.
+        id2[(comma - id1) + 1] = '\0';
+
+        const char *string;
+        char vecSuffix[3] = {0};
+        char tempStr[9] = {0};
+
+        generateVecSuffix( vecSuffix, effectiveVectorWidthOnBaseType);  // VLOAD %TYPE%V from %PTYPE kind of memory locations
+        strcpy( tempStr, "vload");
+        strcat( tempStr, vecSuffix);
+        string = findTypeVLOAD(tempStr);
+        if (string != NULL)
+        {
+             put( "%VLOAD", string);
+        } else {
+            std::cerr << "handleAlignedDataAccess: " << tempStr << " not a valid VLOAD type" << std::endl;
+        }
+
+        struct fmt f;
+        f = get("%PTYPE");
+
+        numCharsWritten = sprintf(dst, "%s( %s (__global %s *)%s)", tempStr, id2, f.value, ptr);
+        dst += numCharsWritten;
+    }
+
+    *_dst = dst;
+}
+
+//
+// %VSTORE(data, 0, address)
+//
+// When vector stores are disabled (doVSTORE is false) or the effective vector width on
+// the base type is 1, a plain pointer store "*((__global <DERIVED>*)(address) + 0) = data"
+// is generated. Otherwise "<%VSTORE_VALUE>( data, 0, (__global <%PTYPE> *)address)" is
+// generated. If either %VSTORE_VALUE or %PTYPE has no value, the marker text
+// "--ERROR in VSTORE--" is written to the output instead.
+// *_src is advanced past the parsed arguments and *_dst past whatever was written.
+//
+void kprintf::handleAlignedVSTORE(char **_src, char **_dst)
+{
+    int numCharsWritten = 0;
+    char *id1, *id2, *id3;
+    char *src = *_src;
+    char *dst = *_dst;
+
+    (void) mystrtok( src, "()");    // Get rid of %VSTORE keyword
+    id1 = mystrtok( NULL, ",");     // "data"
+    id2 = mystrtok( NULL, ",");     // "0"
+    id3 = mystrtok( NULL, "()");    // "address" which is wrapped around in ()
+    *_src = id3 + strlen(id3) + 1;
+
+    if (( ! this->doVSTORE) || (effectiveVectorWidthOnBaseType == 1))
+    {
+        numCharsWritten = sprintf(dst, "*((__global %s*)(%s) + %s) = %s", DERIVED, id3, id2, id1); // NOTE:Assuming "__global"
+        dst += numCharsWritten;
+    }
+    else
+    {
+        struct fmt vstore, ptype;
+        vstore = get("%VSTORE_VALUE");
+        ptype  = get("%PTYPE");
+        if ((vstore.value == NULL) || (ptype.value == NULL))
+        {
+            // The write cursor is published below, so the marker stays in the output
+            // instead of being overwritten by whatever is emitted next.
+            numCharsWritten = sprintf(dst, "--ERROR in VSTORE--");
+            dst += numCharsWritten;
+        }
+        else
+        {
+            numCharsWritten = sprintf(dst, "%s( %s, %s, (__global %s *)%s)", vstore.value, id1, id2, ptype.value, id3);
+            dst += numCharsWritten;
+        }
+    }
+    *_dst = dst;
+    return;
+}
+
+//
+// %IF(predicate) <rest of line>
+//
+// Looks the predicate up with get(). If its value converts (atoi) to an integer >= 1,
+// only the "%IF(predicate)" prefix is consumed and nothing is written, so the rest of
+// the line is left for normal processing. Otherwise the source cursor is moved to the
+// next '\n' (or the end of the string) and a single '\n' is written to the output.
+// A predicate with no value is reported on std::cerr and treated as false.
+//
+void kprintf::handlePredicate(char **_src, char **_dst)
+{
+    char *id1;
+    char *src = *_src;
+    char *dst = *_dst;
+
+    (void) mystrtok( src, "()");    // Get rid of %IF keyword
+    id1 = mystrtok( NULL, ")");     // Name of the predicate
+    *_src = id1 + strlen(id1) + 1;
+    src = *_src;
+
+    struct fmt predicate = get(id1);
+    int condition = 0;
+    if (predicate.value != NULL)
+    {
+        condition = atoi(predicate.value);
+    } else {
+        // atoi(NULL) is undefined behaviour; an unknown predicate counts as false.
+        std::cerr << "handlePredicate: predicate " << id1 << " has no value; treating it as false" << std::endl;
+    }
+
+    if (condition >= 1) // PENDING: (condition > 1) worked fine before.
+    {
+        // Predicate is TRUE: *_src already points past "%IF(...)"; output is untouched.
+        return;
+    } else {
+        // Predicate is FALSE: drop the remainder of the current line.
+        while((*src != '\0') && (*src != '\n'))
+        {
+            src++;
+        }
+        *dst = '\n';
+        dst++;
+    }
+
+    *_dst = dst;
+    *_src = src;
+    return;
+}
+
+//
+// <keyword>( C, A, B) with a binary operator character 'op'
+//
+// Writes "C = A <op> B" at *_dst (no trailing ';' or newline) and advances *_src past
+// the third operand and *_dst past the generated text.
+//
+void kprintf::handleADD_SUB(char **_src, char **_dst, const char op)
+{
+    char result[256], lhs[256], rhs[256];
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");       // C
+    strcpy( result, tok);
+    tok = mystrtok( NULL, "(,)");       // A
+    strcpy( lhs, tok);
+    tok = mystrtok( NULL, "(,)");       // B
+    strcpy( rhs, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    *_dst += sprintf(*_dst, "%s = %s %c %s", result, lhs, op, rhs);
+}
+
+//
+// <keyword>( target, array, incx)
+//
+// Writes a vector literal built from strided element reads:
+//     [target = ](<DERIVED>)( array[0 + (incx * 0)], ..., array[0 + (incx * (vectorWidth-1))])
+// The "target = " prefix is left out when ignoreFirst is true.
+// *_src is advanced past the third argument and *_dst past the generated text.
+//
+void kprintf::handleVLoadWithIncx(char **_src, char **_dst, bool ignoreFirst)
+{
+    char target[256], array[256], stride[256];
+    char *out = *_dst;
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");
+    strcpy( target, tok);
+    tok = mystrtok( NULL, "(,)");
+    strcpy( array, tok);
+    tok = mystrtok( NULL, "(,)");
+    strcpy( stride, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    if (!ignoreFirst)
+    {
+        out += sprintf(out,"%s = ", target);
+    }
+
+    out += sprintf(out,"(%s)(", DERIVED);
+
+    // Every element but the last is followed by a comma; the last one closes the literal.
+    const int lastLane = vectorWidth - 1;
+    int lane = 0;
+    while (lane < lastLane)
+    {
+        out += sprintf(out," %s[0 + (%s * %d)],", array, stride, lane);
+        lane++;
+    }
+    out += sprintf(out," %s[0 + (%s * %d)])", array, stride, vectorWidth - 1);
+
+    *_dst = out;
+}
+
+
+//
+// <keyword>( array, value, incx)
+//
+// Writes one strided store statement per i in [0, vectorWidth):
+//   s_or_v == SCALAR, vectorWidth != 1 : " array[0 + (incx * i)] = value.<vecIndices[i]>;\n"
+//   s_or_v == SCALAR, vectorWidth == 1 : " array[0 + (incx * i)] = value;\n"
+//   otherwise                          : " array[0 + (incx * i)] = value.s<2i><2i+1>;\n"
+// *_src is advanced past the third argument and *_dst past the generated text.
+//
+void kprintf::handleVStoreWithIncx(char **_src, char **_dst)
+{
+    char array[256], value[256], stride[256];
+    char *out = *_dst;
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");
+    strcpy( array, tok);
+    tok = mystrtok( NULL, "(,)");
+    strcpy( value, tok);
+    tok = mystrtok( NULL, "(,)");
+    strcpy( stride, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    const bool scalarMode = (s_or_v == SCALAR);
+
+    for (int lane = 0; lane < vectorWidth; lane++)
+    {
+        if (!scalarMode)
+        {
+            // Two consecutive components are stored together per element.
+            out += sprintf(out," %s[0 + (%s * %d)] = %s.s%d%d;\n", array, stride, lane, value, (lane*2), (lane*2 + 1));
+        }
+        else if (vectorWidth != 1)
+        {
+            out += sprintf(out," %s[0 + (%s * %d)] = %s.%s;\n", array, stride, lane, value, vecIndices[lane]);
+        }
+        else
+        {
+            out += sprintf(out," %s[0 + (%s * %d)] = %s;\n", array, stride, lane, value);
+        }
+    }
+
+    *_dst = out;
+}
+
+//
+// <keyword>( value)
+//
+// Writes an expression that adds up the parts of "value", terminated by ";\n":
+//   vectorWidth <= 1   : "(value);\n"
+//   s_or_v == SCALAR   : "value.<i0> + value.<i1> + ... ;\n" using vecIndices[]
+//   otherwise          : "value.s01 + value.s23 + ... ;\n" (pairs of components)
+// *_src is advanced past the argument and *_dst past the generated text.
+//
+void kprintf::handleReduceSum(char **_src, char **_dst)
+{
+    char operand[256];
+    char *out = *_dst;
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");       // the value to reduce
+    strcpy( operand, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    const int last = vectorWidth - 1;
+
+    if (vectorWidth <= 1)
+    {
+        out += sprintf(out,"(%s);\n", operand);
+    }
+    else if (s_or_v == SCALAR)
+    {
+        for (int lane = 0; lane < last; lane++)
+        {
+            out += sprintf(out,"%s.%s + ", operand, vecIndices[lane]);
+        }
+        out += sprintf(out,"%s.%s;\n", operand, vecIndices[last]);
+    }
+    else
+    {
+        for (int lane = 0; lane < last; lane++)
+        {
+            out += sprintf(out,"%s.s%d%d + ", operand, (lane*2), (lane*2 + 1));
+        }
+        out += sprintf(out,"%s.s%d%d;\n", operand, (last*2), (last*2 + 1));
+    }
+
+    *_dst = out;
+}
+
+//
+// <keyword>( val );  or  <keyword>( val, maxVal, index [, impl] );
+//
+// With only "val", writes a nested fmax() expression over the parts of val (or "(val);\n"
+// when vectorWidth <= 1). When a third argument (index) is present, writes statements
+// that leave the winning position in "index" and the winning value in "maxVal"; ties
+// keep the lower position unless the fourth argument is "0", in which case the
+// comparison is ">=" and the higher position wins.
+// *_src is advanced past the terminating ';' token and *_dst past the generated text.
+//
+void kprintf::handleReduceMax(char **_src, char **_dst)
+{
+    // val, maxVal, index, impl
+    char val[256], maxVal[256], index[256], impl[256];
+    char rest[512];
+    char *out = *_dst;
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");       // first parameter
+    strcpy( val, tok);
+
+    // Take everything up to ';' and re-tokenise it separately, so that optional
+    // (overloaded) parameters can be detected without consuming more of the source.
+    tok = mystrtok( NULL, ";");
+    *_src = tok + strlen(tok) + 1;      // source parsing is finished at this point
+
+    rest[0] = '(';
+    rest[1] = 0;
+    strcat(rest, tok);
+    (void) mystrtok( rest, "(,)");
+    tok = mystrtok( NULL, "(,)");       // 2nd parameter; empty when it was not passed
+    strcpy( maxVal, tok);
+    tok = mystrtok( NULL, "(,)");
+    strcpy( index, tok);
+    tok = mystrtok( NULL, "(,)");
+    strcpy( impl, tok);
+
+    const bool reduceMaxWithIndex = (strcmp(index, "") != 0);
+    const bool followLowIndex = (strcmp(impl, "0") != 0);
+
+    #ifdef DEBUG_AMAX
+    std::cerr << "Handling AMAX CASE: reduceMaxWithIndex:" << reduceMaxWithIndex
+              << " and followLowIndex: " << followLowIndex
+              << " id1:" << val <<  " id2:" << maxVal << " id3:" << index << " id4:" << impl << std::endl;
+    #endif
+
+    const int last = vectorWidth - 1;
+
+    if (vectorWidth <= 1)
+    {
+        if (reduceMaxWithIndex)
+        {
+            out += sprintf(out, "%s = 0;\n", index);
+            out += sprintf(out, "%s = %s;\n", maxVal, val);
+        }
+        else
+        {
+            out += sprintf(out,"(%s);\n", val);
+        }
+    }
+    else if (reduceMaxWithIndex)
+    {
+        // Component 0 of val is used as the running maximum.
+        out += sprintf(out,"%s = 0;", index);
+        for (int lane = 1; lane < vectorWidth; lane++)
+        {
+            if (followLowIndex)
+            {
+                out += sprintf(out,"\n\t(%s.%s > %s.S0)? (%s = %d, %s.S0 = %s.%s):1;",
+                               val, vecIndices[lane], val, index, lane, val, val, vecIndices[lane]);
+            }
+            else // Follow High Index
+            {
+                out += sprintf(out,"\n\t(%s.%s >= %s.S0)? (%s = %d, %s.S0 = %s.%s):1;",
+                               val, vecIndices[lane], val, index, lane, val, val, vecIndices[lane]);
+            }
+        }
+        out += sprintf(out,"\n\t%s = %s.s0;", maxVal, val);
+    }
+    else
+    {
+        // Nested call: fmax( a, fmax( b, ... z )));
+        if (s_or_v == SCALAR)
+        {
+            for (int lane = 0; lane < last; lane++)
+            {
+                out += sprintf(out,"fmax( %s.%s, ", val, vecIndices[lane]);
+            }
+            out += sprintf(out," %s.%s ", val, vecIndices[last]);
+        }
+        else
+        {
+            for (int lane = 0; lane < last; lane++)
+            {
+                out += sprintf(out,"fmax( %s.s%d%d, ", val, (lane*2), (lane*2 + 1));
+            }
+            out += sprintf(out," %s.s%d%d ", val, (last*2), (last*2 + 1));
+        }
+
+        // One closing parenthesis per opened call.
+        for (int open = 0; open < last; open++)
+        {
+            out += sprintf(out,")");
+        }
+        out += sprintf(out,";\n");
+    }
+
+    *_dst = out;
+}
+
+
+//
+// <keyword>( value)
+//
+// Writes a nested fmin() expression over the parts of "value", terminated by ";\n":
+//   vectorWidth <= 1   : "(value);\n"
+//   s_or_v == SCALAR   : "fmin( value.<i0>, fmin( value.<i1>, ...  value.<iN> ));\n"
+//   otherwise          : the same shape over value.s01, value.s23, ...
+// *_src is advanced past the argument and *_dst past the generated text.
+//
+void kprintf::handleReduceMin(char **_src, char **_dst)
+{
+    char operand[256];
+    char *out = *_dst;
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");       // the value to reduce
+    strcpy( operand, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    if (vectorWidth <= 1)
+    {
+        out += sprintf(out,"(%s);\n", operand);
+        *_dst = out;
+        return;
+    }
+
+    const int last = vectorWidth - 1;
+
+    if (s_or_v == SCALAR)
+    {
+        for (int lane = 0; lane < last; lane++)
+        {
+            out += sprintf(out,"fmin( %s.%s, ", operand, vecIndices[lane]);
+        }
+        out += sprintf(out," %s.%s ", operand, vecIndices[last]);
+    }
+    else
+    {
+        for (int lane = 0; lane < last; lane++)
+        {
+            out += sprintf(out,"fmin( %s.s%d%d, ", operand, (lane*2), (lane*2 + 1));
+        }
+        out += sprintf(out," %s.s%d%d ", operand, (last*2), (last*2 + 1));
+    }
+
+    // One closing parenthesis per opened call, then the statement terminator.
+    for (int open = 0; open < last; open++)
+    {
+        out += sprintf(out,")");
+    }
+    out += sprintf(out,";\n");
+
+    *_dst = out;
+}
+
+//
+// <keyword>( value)
+//
+// Writes a nested hypot() expression over the parts of "value", terminated by ";\n":
+//   vectorWidth <= 1   : "(value);\n"
+//   s_or_v == SCALAR   : "hypot( value.<i0>, hypot( value.<i1>, ...  value.<iN> ));\n"
+//   otherwise          : the same shape over value.s01, value.s23, ...
+// *_src is advanced past the argument and *_dst past the generated text.
+//
+void kprintf::handleReduceHypot(char **_src, char **_dst)
+{
+    char operand[256];
+    char *out = *_dst;
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");       // the value to reduce
+    strcpy( operand, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    if (vectorWidth <= 1)
+    {
+        out += sprintf(out,"(%s);\n", operand);
+        *_dst = out;
+        return;
+    }
+
+    const int last = vectorWidth - 1;
+
+    if (s_or_v == SCALAR)
+    {
+        for (int lane = 0; lane < last; lane++)
+        {
+            out += sprintf(out,"hypot( %s.%s, ", operand, vecIndices[lane]);
+        }
+        out += sprintf(out," %s.%s ", operand, vecIndices[last]);
+    }
+    else
+    {
+        for (int lane = 0; lane < last; lane++)
+        {
+            out += sprintf(out,"hypot( %s.s%d%d, ", operand, (lane*2), (lane*2 + 1));
+        }
+        out += sprintf(out," %s.s%d%d ", operand, (last*2), (last*2 + 1));
+    }
+
+    // One closing parenthesis per opened call, then the statement terminator.
+    for (int open = 0; open < last; open++)
+    {
+        out += sprintf(out,")");
+    }
+    out += sprintf(out,";\n");
+
+    *_dst = out;
+}
+
+
+//
+// scalar = %REDUCE_SUM_REAL_HV(half-vector), %REDUCE_SUM_REAL_V(vector)
+//
+// Writes the sum of the first "vlength" components of the argument:
+//   vlength == 0 : nothing is written (the argument is still consumed from the source)
+//   vlength == 1 : "(arg);\n "
+//   otherwise    : "(arg).<i0> + (arg).<i1> + ... + (arg).<iN>;\n" using vecIndices[]
+//
+void kprintf::handleReduceSumReal(char **_src, char **_dst, int vlength)
+{
+    char operand[256];
+    char *out = *_dst;
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");       // the value to reduce
+    strcpy( operand, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    if (vlength == 0)
+    {
+        // Can happen for SCALAR cases where the source contains this keyword inside a
+        // COMPLEX define. The source cursor has already been advanced, so generate
+        // nothing and leave the output cursor where it was.
+        return;
+    }
+
+    if (vlength == 1)
+    {
+        out += sprintf(out,"(%s);\n ", operand);
+    }
+    else
+    {
+        const int last = vlength - 1;
+        for (int lane = 0; lane < last; lane++)
+        {
+            out += sprintf(out,"(%s).%s + ", operand, vecIndices[lane]);
+        }
+        out += sprintf(out,"(%s).%s;\n", operand, vecIndices[last]);
+    }
+
+    *_dst = out;
+}
+
+//
+// %CONJUGATE( doConj, loadedA );
+//
+// For VECTOR data this generates
+//     loadedA = ((doConj == 1)? ( loadedA.odd = -loadedA.odd, loadedA) : loadedA)
+// For any other s_or_v nothing is written; the arguments are still consumed.
+//
+void kprintf::handleCONJUGATE(char **_src, char **_dst)
+{
+    char flag[256], value[256];
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");       // doConj
+    strcpy( flag, tok);
+    tok = mystrtok( NULL, "(,)");       // loadedA
+    strcpy( value, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    if (s_or_v != VECTOR)
+    {
+        return;     // output cursor stays where it was
+    }
+
+    *_dst += sprintf(*_dst,"%s = ((%s == 1)? ( %s.odd = -%s.odd, %s) : %s)",
+                     value, flag, value, value, value, value);
+}
+
+//
+// %CLEAR_IMAGINARY( varName );
+//
+// For VECTOR data this generates "varName.odd = 0.0f" (no trailing ';').
+// For any other s_or_v nothing is written; the argument is still consumed.
+//
+void kprintf::handleClearImaginary(char **_src, char **_dst)
+{
+    char variable[256];
+    char *tok;
+
+    (void) mystrtok( *_src, "(,)");     // skip the keyword
+    tok = mystrtok( NULL, "(,)");       // varName
+    strcpy( variable, tok);
+    *_src = tok + strlen(tok) + 1;
+
+    if (s_or_v != VECTOR)
+    {
+        return;     // output cursor stays where it was
+    }
+
+    *_dst += sprintf(*_dst,"%s.odd = 0.0f", variable);
+}
+
+//
+// Returns the string stored for n in the numbers[] table.
+// Returns NULL when n is outside the supported range 0..16; negative values are rejected
+// as well so that the table is never indexed out of bounds.
+//
+static const char * itoa(int n)
+{
+    if ((n < 0) || (n > 16))
+        return (const char*) NULL;
+    return numbers[n];
+}
+
+//
+// %VFOR { body }  /  %VFOR_REAL { body }
+//
+// Finds the brace-delimited body that follows the keyword (nested braces are matched)
+// and expands it once per v in [0, vectorWidth). Each expansion is produced by a child
+// kprintf that has %VFORINDEX bound to itoa(v) and %VFORSUFFIX bound to the component
+// suffix for v: taken from vecIndicesWithDot[] when isReal is true or the data type is
+// 'S' or 'D', from vecComplexIndicesWithDot[] otherwise, and empty when vectorWidth is 1.
+// The expansions (braces included) are written one after another at *dst.
+// On success *src is moved just past the closing '}' and *dst past the generated text.
+// On a syntax error (no '{', or unbalanced braces) a message is printed and both cursors
+// are left untouched. Throws -1 if the working buffers cannot be allocated.
+//
+// PENDING: COMPLEX DATA TYPE HANDLING may need special attention
+//
+void kprintf::handleVFOR(char **src, char **dst, bool isReal)
+{
+    char *start, *end;
+    char *vforBody, *vforBodyTemp, *vforGeneratedBody;
+    int bracecount = 0;
+    int vforBodyLength;
+
+    if (isReal == false)
+    {
+        start = (*src) + strlen("%VFOR");
+    } else {
+        start = (*src) + strlen("%VFOR_REAL");
+    }
+
+    while ( (*start != '{') && (*start != 0))
+    {
+        //PENDING: if (notwhitespace(*start)) { signal exception bad syntax }
+        start++;
+    }
+    if (*start == 0)
+    {
+        // PENDING: Raise an EXCEPTION!
+        printf("KPRINTF: handleVFOR: Bad Syntax...\n");
+        return;
+    }
+
+    // Walk to the matching '}'; "end" finishes one past it.
+    bracecount = 1;
+    end = start+1;
+    while(bracecount && (*end != 0))
+    {
+        if (*end == '{')
+        {
+            bracecount++;
+        } else if (*end == '}') {
+            bracecount--;
+        }
+        end++;
+    }
+
+    // Only unbalanced braces are an error. Testing *end here instead would also reject
+    // a well-formed body whose closing '}' is the very last character of the input.
+    if (bracecount != 0)
+    {
+        // PENDING: Raise an EXCEPTION!
+        printf("KPRINTF: handleVFOR: Bad Syntax...\n");
+        return;
+    }
+
+    vforBodyLength = end - start;
+    vforBody = (char*)malloc((vforBodyLength + 1)*sizeof(char));
+    vforBodyTemp = (char*)malloc((vforBodyLength + 1)*sizeof(char));
+    vforGeneratedBody = (char*)malloc(((vforBodyLength + 1)*sizeof(char)) * vectorWidth * 2);
+    if ((vforBody == NULL) || (vforBodyTemp == NULL) || (vforGeneratedBody == NULL))
+    {
+        free(vforBody);
+        free(vforBodyTemp);
+        free(vforGeneratedBody);
+        printf("KPRINTF: handleVFOR: Out of memory...\n");
+        throw -1;
+    }
+    memcpy(vforBody, start, vforBodyLength);
+    vforBody[vforBodyLength] = 0;
+
+    const bool treatAsReal = (isReal == true) || (this->dataType == 'S') || (this->dataType == 'D');
+
+    for(int v=0; v<vectorWidth; v++)
+    {
+        kprintf *child = new kprintf(this->dataType, this->vectorWidth, this->doVLOAD, this->doVSTORE);
+
+        child->put("%VFORINDEX", itoa(v));
+        if (vectorWidth == 1)
+        {
+            child->put("%VFORSUFFIX", "");
+        } else if (treatAsReal) {
+            //
+            // Treat like REAL type
+            //
+            child->put("%VFORSUFFIX", vecIndicesWithDot[v]);
+        } else {
+            // Complex Data Type Involved
+            child->put("%VFORSUFFIX", vecComplexIndicesWithDot[v]);
+        }
+
+        // spit() is handed a scratch copy so the pristine body can be reused for the
+        // next iteration.
+        strcpy(vforBodyTemp, vforBody);
+        child->spit(vforGeneratedBody, vforBodyTemp);
+
+        // Copy (terminator included) straight to the write cursor instead of appending
+        // with strcat, which would rely on *dst already holding a NUL.
+        size_t generatedLength = strlen(vforGeneratedBody);
+        memcpy(*dst, vforGeneratedBody, generatedLength + 1);
+        *dst += generatedLength;
+
+        delete child;
+    }
+
+    *src = end;
+
+    free(vforBody);
+    free(vforBodyTemp);
+    free(vforGeneratedBody);
+    return;
+}
+
+void kprintf::handleReductionFramework(char **_src, char **_dst, REDUCTION_TYPE reductionType)
+{
+    /*
+     *  Syntax: %REDUCTION_BY_SUM( privateVariableName ); or
+     *          %REDUCTION_BY_MAX( privateVariableName ); or
+     *          %REDUCTION_BY_MAX( privateVariableName, privateVariableName2, privateVarName3); or
+     *          %REDUCTION_BY_MIN( privateVariableName ); or
+     *          %REDUCTION_BY_HYPOT( privateVariableName ); or
+     *          %REDUCTION_BY_SSQ( scale, ssq );
+     *  Reduces all elements in a workgroup by taking value from 'privateVariableName' of each work-item
+     *  and places the reduced item in 'privateVariableName' of the first work-item (work-item 0)
+     *
+    */
+
+    int numCharsWritten = 0;
+    // Value, Index, Implementation
+    char privateVarName[256], privateVarName2[256], privateVarName3[256];
+    char tempStr[512];
+
+    char * ptr;
+    char *src = *_src;
+    char *dst = *_dst;
+    bool reductionWithIndex = false;
+    RedWithIndexImpl impl;
+
+    ptr = mystrtok( src, "(,)");
+    ptr = mystrtok( NULL, "(,)"); // Get first ID
+    strcpy( privateVarName, ptr);
+    // After the first parameter is parsed, extract everything till you encounter ';'
+    // Store this substring in a temp string. Then check if any extra parameter(overloaded) was passed using this substring
+    ptr = mystrtok( NULL, ";");
+    *_src = ptr + strlen(ptr) + 1;  // 'src' string parsing is over at this point
+
+    tempStr[0] = '(';
+    tempStr[1] = 0;
+    strcat(tempStr, ptr);
+    ptr = mystrtok( tempStr, "(,)");
+    ptr = mystrtok( NULL, "(,)");       // extract 2nd parameter from tempStr. Will be empty if 2nd parameter was not passed
+    strcpy( privateVarName2, ptr);
+    ptr = mystrtok( NULL, "(,)");
+    strcpy( privateVarName3, ptr);
+
+
+    // This indicates that there was a second parameter in the call
+    // Overloaded call of REDUCTION_BY_MAX for MAX_WITH_INDEX
+    //
+    if(strcmp(privateVarName3, "") != 0)
+    {
+        reductionWithIndex = true;
+
+        if(!strcmp(privateVarName3, "0"))
+        {
+            impl = ATOMIC_FLI;
+        }
+        else if(!strcmp(privateVarName3, "1"))
+        {
+            impl = REG_FLI;
+        }
+        else if(!strcmp(privateVarName3, "2"))
+        {
+            impl = ATOMIC_FHI;
+        }
+        else if(!strcmp(privateVarName3, "3"))
+        {
+            impl = REG_FHI;
+        }
+        else
+        {
+            std::cerr << "ERROR: Invalid Reduction Type implementation";
+        }
+    }
+
+    char ldsVarName[8], ldsVarName2[8], localId[8], selected[8];
+    char p1[8], p2[8], p3[8], p4[8], p5[8];
+    getRandomString(ldsVarName, 5);
+    getRandomString(ldsVarName2, 5);
+    getRandomString(localId, 5);
+    getRandomString(selected, 5);
+    getRandomString(p1, 5);
+    getRandomString(p2, 5);
+    getRandomString(p3, 5);
+    getRandomString(p4, 5);
+    getRandomString(p5, 5);
+
+    if(reductionWithIndex)
+    {
+        numCharsWritten = sprintf(dst, "uint %s;\n", selected);
+        dst += numCharsWritten;
+        numCharsWritten = sprintf(dst, "__local %s %s [ %d ];\n", (get("%PTYPE").value), ldsVarName, (this->wgSize));
+        dst += numCharsWritten;
+        numCharsWritten = sprintf(dst, "\tuint %s = get_local_id(0);\n\t%s [ %s ] = %s;\n",
+                            localId, ldsVarName, localId, privateVarName);
+        dst += numCharsWritten;
+
+        switch(impl)
+        {
+            case REG_FLI:
+            numCharsWritten = sprintf(dst, "\t__local uint %s [ %d ];\n", ldsVarName2, (this->wgSize));
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\t%s [ %s ] = %s;\n",
+                                ldsVarName2, localId, privateVarName2);
+            dst += numCharsWritten;
+            break;
+
+            case ATOMIC_FLI:
+            numCharsWritten = sprintf(dst, "\t__local uint %s[1];\n", ldsVarName2);
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\tif(%s == 0){%s[0] = UINT_MAX;}\n", localId, ldsVarName2);
+            dst += numCharsWritten;
+            break;
+
+        }
+    }
+    else
+    {
+        if(reductionType == REDUCTION_BY_SSQ)
+        {
+            numCharsWritten = sprintf(dst, "__local %s %s [ %d ], %s [ %d ];\n", (get("%PTYPE").value),
+                                ldsVarName, (this->wgSize), ldsVarName2, (this->wgSize) );
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\tuint %s = get_local_id(0);\n\t %s [ %s ] = %s; %s [ %s ] = %s;\n",
+                            localId, ldsVarName, localId, privateVarName, ldsVarName2, localId, privateVarName2);
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\t%s %s, %s, %s, %s, %s;\n", (get("%PTYPE").value), p1, p2, p3, p4, p5);
+            dst += numCharsWritten;
+        }
+        else
+        {
+    numCharsWritten = sprintf(dst, "__local %s %s [ %d ];\n", (get("%TYPE").value), ldsVarName, (this->wgSize));
+    dst += numCharsWritten;
+    numCharsWritten = sprintf(dst, "\tuint %s = get_local_id(0);\n\t %s [ %s ] = %s;\n",
+                            localId, ldsVarName, localId, privateVarName);
+    dst += numCharsWritten;
+        }
+    }
+
+    numCharsWritten = sprintf(dst, "\tbarrier(CLK_LOCAL_MEM_FENCE);\n\n");
+    dst += numCharsWritten;
+
+    // selected = (ldsVal[lid+32] > ldsVal[lid]) ? lid + 32 : lid;
+    // selected = (ldsVal[lid+32] == ldsVal[lid]) ? (ldsIndex[lid+32] < ldsIndex[lid] ? lid + 32 : lid) : selected;
+    for( int i=(this->wgSize/2); i>=2; i=(i/2) )
+    {
+        if(reductionWithIndex)
+        {
+            switch(impl)
+            {
+                //case ATOMIC_FLI:
+                //case ATOMIC_FHI:
+                case REG_FLI:
+                //case REG_FHI:
+                numCharsWritten = sprintf(dst, "\tif( %s < %d ) {\n ", localId, i);
+        dst += numCharsWritten;
+                numCharsWritten = sprintf(dst,
+                                     "\n\t%s = (%s[%s + %d] > %s[%s]) ? %s + %d  : %s;",
+                                     selected, ldsVarName, localId, i, ldsVarName, localId,
+                                     localId, i, localId);
+                dst += numCharsWritten;
+
+                numCharsWritten = sprintf(dst,
+                                     "\n\t%s = (%s[%s + %d] == %s[%s]) ? ((%s[%s + %d] < %s[%s]) ? %s + %d : %s) : %s;",
+                                     selected, ldsVarName, localId, i, ldsVarName, localId,
+                                     ldsVarName2, localId, i, ldsVarName2, localId, localId, i, localId, selected);
+                dst += numCharsWritten;
+                numCharsWritten = sprintf(dst, "\t%s[%s] = %s[%s];\n\t %s[%s] = %s[%s];\n",
+                                       ldsVarName, localId, ldsVarName, selected,
+                                       ldsVarName2, localId, ldsVarName2, selected);
+                dst += numCharsWritten;
+                break;
+
+                case ATOMIC_FLI:
+                numCharsWritten = sprintf(dst, "\tif( %s < %d ) {\n ", localId, i);
+                dst += numCharsWritten;
+                numCharsWritten = sprintf(dst,
+                                     "\n\t%s[%s] = fmax(%s[%s + %d], %s[%s]);",
+                                     ldsVarName, localId, ldsVarName, localId, i, ldsVarName, localId);
+                dst += numCharsWritten;
+                break;
+
+            }
+        }
+        else
+        {
+            numCharsWritten = sprintf(dst, "\tif( %s < %d ) {\n\t\t",
+                                localId, i);
+            dst += numCharsWritten;
+
+        switch( reductionType )
+        {
+                case REDUCTION_BY_SUM : numCharsWritten = sprintf(dst, " %s [ %s ] = %s [ %s ] + %s [ %s + %d ];\n",
+                                            ldsVarName, localId, ldsVarName, localId, ldsVarName, localId, i);
+                        dst += numCharsWritten;
+                        break;
+
+                case REDUCTION_BY_MAX : numCharsWritten = sprintf(dst, " %s [ %s ] = fmax( %s [ %s ] , %s [ %s + %d ] );\n",
+                                           ldsVarName, localId, ldsVarName, localId, ldsVarName, localId, i);
+                        dst += numCharsWritten;
+                        break;
+
+                case REDUCTION_BY_MIN : numCharsWritten = sprintf(dst, " %s [ %s ] = fmin( %s [ %s ] , %s [ %s + %d ] );\n",
+                                           ldsVarName, localId, ldsVarName, localId, ldsVarName, localId, i);
+                        dst += numCharsWritten;
+                        break;
+
+                case REDUCTION_BY_HYPOT : numCharsWritten = sprintf(dst, " %s [ %s ] = hypot( %s [ %s ] , %s [ %s + %d ] );\n",
+                                           ldsVarName, localId, ldsVarName, localId, ldsVarName, localId, i);
+                                        dst += numCharsWritten;
+                                        break;
+
+                case REDUCTION_BY_SSQ : numCharsWritten = sprintf(dst, " %s = %s = %s [ %s ];\n", p1, p2, ldsVarName, localId);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = %s [ %s ];\n", p3, ldsVarName2, localId);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = %s [ %s + %d];\n\t %s = %s [ %s + %d];\n",
+                                                p4, ldsVarName, localId, i, p5, ldsVarName2, localId, i);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = fmax( %s, %s );\n", p2, p2, p4);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = (isnotequal(%s, (%s)0.0))?\n", p3, p2, (get("%PTYPE").value));
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t (((%s / %s) * (%s / %s) * %s) + ((%s / %s) * (%s / %s) * %s)) : %s;\n",
+                                                                           p1, p2, p1, p2, p3, p4, p2, p4, p2, p5, p3);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s [ %s ] = %s;\n %s [ %s ] = %s;\n",
+                                                ldsVarName, localId, p2, ldsVarName2, localId, p3);
+                                        dst += numCharsWritten;
+                                        break;
+
+            default   : printf("\nInvalid reduction operator!!\n");
+                        throw -1;
+                        break;
+        }
+        }
+        numCharsWritten = sprintf(dst, "\t}\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n\n");
+        dst += numCharsWritten;
+    }
+
+    if(reductionWithIndex)
+    {
+       switch(impl)
+       {
+            case REG_FLI:
+            numCharsWritten = sprintf(dst, "\tif( %s == 0 ) {\n\t%s = (%s[1] > %s[0]) ? 1 : 0;\n",
+                                        localId, selected, ldsVarName, ldsVarName);
+    dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\t%s = (%s[1] == %s[0]) ? ((%s[1] < %s[0]) ? 1 : 0) : %s;\n",
+                                        selected, ldsVarName, ldsVarName, ldsVarName2, ldsVarName2, selected);
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\t%s = %s[%s];\n\t %s = %s[%s];}\n",
+                                   privateVarName, ldsVarName, selected, privateVarName2, ldsVarName2, selected);
+            dst += numCharsWritten;
+            break;
+
+            case ATOMIC_FLI:
+            numCharsWritten = sprintf(dst, "\tif(%s == 0){%s[0] = fmax(%s[1], %s[0]);}\n",
+                                        localId, ldsVarName, ldsVarName, ldsVarName);
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\tbarrier(CLK_LOCAL_MEM_FENCE);\n");
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\tif(%s == %s[0]){atomic_min((%s + 0), %s);}\n",
+                                        privateVarName, ldsVarName, ldsVarName2, privateVarName2);
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\tbarrier(CLK_LOCAL_MEM_FENCE);\n");
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\tif(%s == 0){%s = %s[0]; %s = %s[0];}\n",
+                                    localId, privateVarName2, ldsVarName2, privateVarName, ldsVarName);
+            dst += numCharsWritten;
+            numCharsWritten = sprintf(dst, "\tbarrier(CLK_LOCAL_MEM_FENCE);\n");
+            dst += numCharsWritten;
+            break;
+
+        }
+    }
+    else
+    {
+        numCharsWritten = sprintf(dst, "\tif( %s == 0 ) {\n\t", localId);
+        dst += numCharsWritten;
+
+    switch( reductionType )
+    {
+            case REDUCTION_BY_SUM : numCharsWritten = sprintf(dst, "%s = %s [0] + %s [1];\n\t}",
+                                                        privateVarName, ldsVarName, ldsVarName);
+                    dst += numCharsWritten;
+                    break;
+
+            case REDUCTION_BY_MAX : numCharsWritten = sprintf(dst, "%s = fmax( %s [0] , %s [1] );\n\t}",
+                                                        privateVarName, ldsVarName, ldsVarName);
+                    dst += numCharsWritten;
+                    break;
+
+            case REDUCTION_BY_MIN : numCharsWritten = sprintf(dst, "%s = fmin( %s [0] , %s [1] );\n\t}",
+                                                        privateVarName, ldsVarName, ldsVarName);
+                    dst += numCharsWritten;
+                    break;
+
+            case REDUCTION_BY_HYPOT : numCharsWritten = sprintf(dst, "%s = hypot( %s [0] , %s [1] );\n\t}",
+                                                          privateVarName, ldsVarName, ldsVarName);
+                                    dst += numCharsWritten;
+                                    break;
+
+            case REDUCTION_BY_SSQ : numCharsWritten = sprintf(dst, " %s = %s = %s [0];\n", p1, p2, ldsVarName);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = %s [0];\n", p3, ldsVarName2);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = %s [1];\n\t %s = %s [1];\n",
+                                                p4, ldsVarName, p5, ldsVarName2);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = fmax( %s, %s );\n", p2, p2, p4);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = (isnotequal(%s, (%s)0.0))?\n", p3, p2, (get("%PTYPE").value));
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t (((%s / %s) * (%s / %s) * %s) + ((%s / %s) * (%s / %s) * %s)) : %s;\n",
+                                                                           p1, p2, p1, p2, p3, p4, p2, p4, p2, p5, p3);
+                                        dst += numCharsWritten;
+                                        numCharsWritten = sprintf(dst, "\t %s = %s;\n\t %s = %s;\n\t}",
+                                                privateVarName, p2, privateVarName2, p3);
+                                        dst += numCharsWritten;
+                                        break;
+
+        default   : printf("\nInvalid reduction operator!!\n");
+                    throw -1;
+                    break;
+    }
+    }
+    *_dst = dst;
+}
+
+//
+// Expands a %VABS(<id>) directive.
+//
+// Emits "fabs(<id>)" when s_or_v is SCALAR, otherwise
+// "fabs(<id>.even) + fabs(<id>.odd)".
+//
+// _src : in  - start of the directive text in the template
+//        out - one character past the terminator of the operand token
+// _dst : in  - current write position in the output buffer
+//        out - advanced past the emitted text
+//
+// Throws -1 when the operand token is missing or too long for the local
+// identifier buffer (previously this was an unchecked strcpy).
+//
+void kprintf::handleVABS(char **_src, char **_dst)
+{
+    int numCharsWritten = 0;
+    char id1[256];
+    char *ptr;
+    char *src = *_src;
+    char *dst = *_dst;
+
+    ptr = mystrtok( src, "(,)");  // First token (presumably the directive name) is discarded
+    ptr = mystrtok( NULL, "(,)"); // Get first ID
+    if (ptr == NULL)
+    {
+        std::cerr << "handleVABS: missing operand" << std::endl;
+        throw -1;
+    }
+
+    const size_t idLen = strlen(ptr);
+    if (idLen >= sizeof(id1))
+    {
+        std::cerr << "handleVABS: operand is too long" << std::endl;
+        throw -1;
+    }
+    memcpy(id1, ptr, idLen + 1);  // includes the terminating '\0'
+    *_src = ptr + idLen + 1;
+
+    if(s_or_v == SCALAR)
+    {
+        numCharsWritten = sprintf(dst, "fabs(%s)", id1);
+        dst += numCharsWritten;
+    }
+    else
+    {
+        numCharsWritten = sprintf(dst, "fabs(%s.even) + fabs(%s.odd)", id1, id1);
+        dst += numCharsWritten;
+    }
+
+    *_dst = dst;
+}
+
+//
+// Fills str with a random identifier of `length` characters followed by '\0'.
+//
+// The first character is always a letter; the remaining ones are letters or
+// digits. A length <= 0 is treated as 1 (previously only 0 was handled, so a
+// negative length wrote the terminator before the start of the buffer).
+// The caller must provide room for length + 1 bytes.
+//
+void kprintf::getRandomString(char *str, int length)
+{
+    static const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890";
+    const int numLetters = 52;                          // a-z, A-Z come first in charset
+    const int numChars = (int)(sizeof(charset) - 1);    // letters + digits, without '\0'
+
+    if (length <= 0)
+    {
+        length = 1;
+    }
+
+    str[0] = charset[rand() % numLetters];   // First char has to be alphabet
+    for (int i = 1; i < length; i++)
+    {
+        str[i] = charset[rand() % numChars];
+    }
+
+    str[length] = '\0';
+}
+
+//
+// Common constructor body.
+//
+// Stores the generation options in the object, validates `type` and
+// `vecWidth`, then registers the type-dependent keys followed by every
+// directive key (a key whose value is NULL).
+//
+// type     : "single", "double", "complex" or "doublecomplex"
+// vecWidth : must be greater than zero
+//
+// Prints a message and throws -1 on an unsupported type or a non-positive
+// vecWidth. Also reseeds rand() from the current time.
+//
+void kprintf::doConstruct(const char *type, int vecWidth, bool doVLOAD, bool doVSTORE, int _wgSize)
+{
+    // Directive keys, in registration order.
+    static const char *const directiveKeys[] =
+    {
+        "%VLOAD",
+        "%VSTORE",
+        "%CONJUGATE",
+        "%CLEAR_IMAGINARY",
+        "%COMPLEX_JOIN",
+        "%MAD",
+        "%VMAD",
+        "%VMAD_AND_REDUCE",
+        "%MAD_AND_REDUCE",
+        "%MUL",
+        "%VMUL",
+        "%ADD",
+        "%SUB",
+        "%DIV",
+        "%VDIV",
+        "%MAKEVEC",
+        "%VMAKEVEC",
+        "%INIT",
+        "%VMAKEHVEC",
+        "%VMAKEQVEC",
+        "%VMAKEOVEC",
+        "%VLOADWITHINCX",
+        "%VLOADWITHINCXV2",
+        "%VSTOREWITHINCX",
+        "%REDUCE_SUM",
+        "%REDUCE_SUM_REAL_HV",
+        "%REDUCE_MAX",
+        "%REDUCE_MIN",
+        "%REDUCE_HYPOT",
+        "%IF",
+        "%VFOR_REAL",
+        "%VFOR",
+        "%REDUCTION_BY_SUM",
+        "%REDUCTION_BY_MAX",
+        "%REDUCTION_BY_MIN",
+        "%REDUCTION_BY_HYPOT",
+        "%REDUCTION_BY_SSQ",
+        "%VABS",
+        "%ABS"
+    };
+    const size_t numDirectives = sizeof(directiveKeys) / sizeof(directiveKeys[0]);
+
+    this->doVLOAD = doVLOAD;
+    this->doVSTORE = doVSTORE;
+    this->wgSize = _wgSize;
+
+    const bool isSingle        = (strcmp(type, "single") == 0);
+    const bool isDouble        = (strcmp(type, "double") == 0);
+    const bool isComplex       = (strcmp(type, "complex") == 0);
+    const bool isDoubleComplex = (strcmp(type, "doublecomplex") == 0);
+
+    if (!(isSingle || isDouble || isComplex || isDoubleComplex))
+    {
+        std::cout << "kprint() constructor: Type is not supported" << std::endl;
+        throw -1;
+    }
+
+    if (vecWidth <= 0)
+    {
+        std::cout << "kprint() constructor: vecWidth is <= 0" << std::endl;
+        throw -1;
+    }
+
+    // Must be reset before any key is registered: put() updates it.
+    maxKeySize = 0;
+
+    //
+    // Primitive type, routine prefix and the %TYPE family of keys
+    //
+    if (isSingle)
+    {
+        put("%PTYPE", "float");
+        put("%PREFIX", "S");
+        registerType("float", vecWidth);
+    }
+    else if (isDouble)
+    {
+        put("%PTYPE", "double");
+        put("%PREFIX", "D");
+        registerType("double", vecWidth);
+    }
+    else if (isComplex)
+    {
+        put("%PTYPE", "float");
+        put("%PREFIX", "C");
+        registerType("float2", vecWidth, 2);
+    }
+    else
+    {
+        put("%PTYPE", "double");
+        put("%PREFIX", "Z");
+        registerType("double2", vecWidth, 2);
+    }
+
+    registerVSTORE(); // Get "%VSTORE_VALUE" - This is for internal use to handle %VLOAD
+
+    for (size_t i = 0; i < numDirectives; ++i)
+    {
+        put(directiveKeys[i], NULL);
+    }
+
+    srand((unsigned int)time(NULL));
+}
+
+//
+// Constructs from a one-letter type code: 'S', 'D', 'C' or 'Z'.
+//
+// The code is stored in dataType and translated to the type name that
+// doConstruct() expects. Any other code only prints a warning; doConstruct()
+// is not called in that case.
+//
+kprintf::kprintf(char _type, int vecWidth, bool doVLOAD, bool doVSTORE, int _wgSize)
+{
+    const char *typeName = NULL;
+
+    this->dataType = _type;
+
+    switch(_type)
+    {
+        case 'S': typeName = "single";        break;
+        case 'D': typeName = "double";        break;
+        case 'C': typeName = "complex";       break;
+        case 'Z': typeName = "doublecomplex"; break;
+        default:                              break;
+    }
+
+    if (typeName == NULL)
+    {
+        printf("WARNING: kprintf called with wrong arguments!\n");
+        return;
+    }
+
+    doConstruct(typeName, vecWidth, doVLOAD, doVSTORE, _wgSize);
+}
+
+//
+// Constructs from a type name: "single", "double", "complex" or
+// "doublecomplex".
+//
+// dataType is set to the matching one-letter code; for any other name it is
+// left untouched and doConstruct() rejects the name.
+//
+kprintf::kprintf(const char *type, int vecWidth, bool doVLOAD, bool doVSTORE, int _wgSize)
+{
+    static const struct
+    {
+        const char *name;
+        char        code;
+    } knownTypes[] =
+    {
+        { "single",        'S' },
+        { "double",        'D' },
+        { "complex",       'C' },
+        { "doublecomplex", 'Z' }
+    };
+
+    for (size_t i = 0; i < sizeof(knownTypes) / sizeof(knownTypes[0]); ++i)
+    {
+        if (strcmp(type, knownTypes[i].name) == 0)
+        {
+            this->dataType = knownTypes[i].code;
+            break;
+        }
+    }
+
+    doConstruct(type, vecWidth, doVLOAD, doVSTORE, _wgSize);
+}
+
+//
+// Registers a key/value pair.
+//
+// key   : must start with '%'; otherwise a message is printed and nothing is
+//         registered. The pointer itself is stored, not a copy.
+// value : replacement text, or NULL (doConstruct() registers its directives
+//         this way).
+//
+// maxKeySize is raised when this key is longer than any seen so far.
+//
+void kprintf::put(const char *key, const char *value)
+{
+    if(key[0] != '%')
+    {
+        std::cout << "Addition of key " << key << " failed as it does not start with %" << std::endl;
+        return;
+    }
+
+    const size_t keyLen = strlen(key);
+    if (keyLen > maxKeySize)
+    {
+        maxKeySize = keyLen;
+    }
+
+    struct fmt entry;
+    entry.key = key;
+    entry.value = value;
+    v.push_back(entry);
+}
+
+//
+// PENDING:
+// Needs amendment at a later point in time, when we support MACROS
+//
+//
+// Returns the buffer size needed for `src` after key replacement: every key
+// that get() resolves to a non-NULL value counts as the length of that value,
+// every other character counts as one, plus one for the terminating '\0'.
+// Keys with a NULL value are counted character by character.
+//
+int kprintf::real_strlen(const char *src)
+{
+    int expanded = 0;
+
+    for (const char *cursor = src; *cursor != '\0'; )
+    {
+        const struct fmt match = get(cursor);
+
+        if (match.value == NULL)
+        {
+            // Plain character, or a key without replacement text
+            ++expanded;
+            ++cursor;
+            continue;
+        }
+
+        expanded += (int)strlen(match.value);
+        cursor += strlen(match.key);
+    }
+
+    return expanded + 1; // +1 for the '\0' character
+}
+
+//
+// Expands the template `src` into `dst`.
+//
+// At every position get() is asked for a matching key:
+//   - no match             : the character is copied unchanged
+//   - key with a value     : the value replaces the key
+//   - key with NULL value  : the key is a directive; the matching handler
+//                            consumes its text from src and writes to dst
+// A directive without a handler is reported on std::cerr and its first
+// character is copied through.
+//
+// dst is '\0'-terminated on return. No bounds checking is done on dst here.
+//
+// NOTE(review): "%ABS" is registered by doConstruct() but has no branch below,
+// so it ends up in the "Unable to handle key" path - confirm whether intended.
+//
+void kprintf::spit(char *dst, char *src)
+{
+    while(src[0])
+    {
+        struct fmt f = get(src);
+
+        if ((f.value == NULL) && (f.key == NULL))
+        {
+            // Ordinary character
+            *dst = *src;
+            dst++;
+            src++;
+            continue;
+        }
+
+        if (f.value != NULL)
+        {
+            //
+            // Normal Replacement Would Suffice.
+            // memcpy, not strncpy: the terminator is written once, at the end.
+            //
+            const size_t valueLen = strlen(f.value);
+            memcpy(dst, f.value, valueLen);
+            dst += valueLen;
+            src += strlen(f.key);
+            continue;
+        }
+
+        //
+        // Directive - Function Like Macro
+        //
+        const char *key = f.key;
+
+        if (strcmp(key, "%MAD") == 0)
+        {
+            handleMAD(&src, &dst);
+        }
+        else if (strcmp(key, "%VMAD") == 0)
+        {
+            handleMAD(&src, &dst, true);
+        }
+        else if (strcmp(key, "%VMAD_AND_REDUCE") == 0)
+        {
+            handleVMAD_AND_REDUCE(&src, &dst);
+        }
+        else if (strcmp(key, "%MAD_AND_REDUCE") == 0)
+        {
+            handleMAD_AND_REDUCE(&src, &dst);
+        }
+        else if (strcmp(key, "%CONJUGATE") == 0)
+        {
+            handleCONJUGATE(&src, &dst);
+        }
+        else if (strcmp(key, "%CLEAR_IMAGINARY") == 0)
+        {
+            handleClearImaginary(&src, &dst);
+        }
+        else if (strcmp(key, "%MUL") == 0)
+        {
+            handleMUL(&src, &dst);
+        }
+        else if (strcmp(key, "%VMUL") == 0)
+        {
+            handleMUL(&src, &dst, true);
+        }
+        else if (strcmp(key, "%ADD") == 0)
+        {
+            handleADD_SUB(&src, &dst, '+');
+        }
+        else if (strcmp(key, "%SUB") == 0)
+        {
+            handleADD_SUB(&src, &dst, '-');
+        }
+        else if (strcmp(key, "%DIV") == 0)
+        {
+            handleDIV(&src, &dst);
+        }
+        else if (strcmp(key, "%VDIV") == 0)
+        {
+            handleDIV(&src, &dst, true);
+        }
+        else if (strcmp(key, "%VMAKEVEC") == 0)
+        {
+            handleMakeVector(&src, &dst);
+        }
+        else if (strcmp(key, "%VMAKEHVEC") == 0)
+        {
+            handleMakeVector(&src, &dst, 2);
+        }
+        else if (strcmp(key, "%VMAKEQVEC") == 0)
+        {
+            handleMakeVector(&src, &dst, 4);
+        }
+        else if (strcmp(key, "%VMAKEOVEC") == 0)
+        {
+            handleMakeVector(&src, &dst, 8);
+        }
+        else if ((strcmp(key, "%MAKEVEC") == 0) || (strcmp(key, "%INIT") == 0))
+        {
+            handleMakeVector(&src, &dst, 0); // To handle Scalar case
+        }
+        else if (strcmp(key, "%VLOADWITHINCX") == 0)
+        {
+            handleVLoadWithIncx(&src, &dst);
+        }
+        else if (strcmp(key, "%VLOADWITHINCXV2") == 0)
+        {
+            handleVLoadWithIncx(&src, &dst, true);
+        }
+        else if (strcmp(key, "%VSTOREWITHINCX") == 0)
+        {
+            handleVStoreWithIncx(&src, &dst);
+        }
+        else if (strcmp(key, "%REDUCE_SUM") == 0)
+        {
+            handleReduceSum(&src, &dst);
+        }
+        else if (strcmp(key, "%REDUCE_SUM_REAL_HV") == 0)
+        {
+            handleReduceSumReal(&src, &dst, effectiveVectorWidthOnBaseType/2);
+        }
+        else if (strcmp(key, "%REDUCE_MAX") == 0)
+        {
+            handleReduceMax(&src, &dst);
+        }
+        else if (strcmp(key, "%REDUCE_MIN") == 0)
+        {
+            handleReduceMin(&src, &dst);
+        }
+        else if (strcmp(key, "%REDUCE_HYPOT") == 0)
+        {
+            handleReduceHypot(&src, &dst);
+        }
+        else if (strcmp(key, "%VLOAD") == 0)
+        {
+            handleAlignedDataAccess(&src, &dst);
+        }
+        else if (strcmp(key, "%VSTORE") == 0)
+        {
+            handleAlignedVSTORE(&src, &dst);
+        }
+        else if (strcmp(key, "%IF") == 0)
+        {
+            handlePredicate(&src, &dst);
+        }
+        else if (strcmp(key, "%COMPLEX_JOIN") == 0)
+        {
+            handleComplexJoin(&src, &dst);
+        }
+        else if (strcmp(key, "%VFOR_REAL") == 0)
+        {
+            handleVFOR(&src, &dst, true);
+        }
+        else if (strcmp(key, "%VFOR") == 0)
+        {
+            handleVFOR(&src, &dst, false);
+        }
+        else if (strcmp(key, "%REDUCTION_BY_SUM") == 0)
+        {
+            handleReductionFramework(&src, &dst, REDUCTION_BY_SUM);
+        }
+        else if (strcmp(key, "%REDUCTION_BY_MAX") == 0)
+        {
+            handleReductionFramework(&src, &dst, REDUCTION_BY_MAX);
+        }
+        else if (strcmp(key, "%REDUCTION_BY_MIN") == 0)
+        {
+            handleReductionFramework(&src, &dst, REDUCTION_BY_MIN);
+        }
+        else if (strcmp(key, "%REDUCTION_BY_HYPOT") == 0)
+        {
+            handleReductionFramework(&src, &dst, REDUCTION_BY_HYPOT);
+        }
+        else if (strcmp(key, "%REDUCTION_BY_SSQ") == 0)
+        {
+            handleReductionFramework(&src, &dst, REDUCTION_BY_SSQ);
+        }
+        else if (strcmp(key, "%VABS") == 0)
+        {
+            handleVABS(&src, &dst);
+        }
+        else
+        {
+            std::cerr <<  "Problems in spitting: Internal error. Unable to handle key " << f.key << std::endl;
+            *dst = *src;
+            dst++;
+            src++;
+        }
+    }
+    *dst = '\0';
+}
+
+
diff --git a/src/library/blas/gens/legacy/blas_kgen_legacy.c b/src/library/blas/gens/legacy/blas_kgen_legacy.c
new file mode 100644
index 0000000..4d566cb
--- /dev/null
+++ b/src/library/blas/gens/legacy/blas_kgen_legacy.c
@@ -0,0 +1,625 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * common stuff for blas related
+ * kernel generators, legacy part
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <list.h>
+#include <clblas_stddef.h>
+
+#include <matrix_props.h>
+#include <matrix_dims.h>
+#include <dis_warning.h>
+
+#include "blas_kgen_legacy.h"
+
+/*
+ * Add the typedefs of the clblas enumerations (order, transpose, uplo,
+ * diag, side) to the generated source as a single statement.
+ */
+void
+declareBlasEnums(struct KgenContext *ctx)
+{
+    static const char enumDecls[] =
+        "typedef enum clblasOrderEnum {\n"
+        "   clblasRowMajor,\n"
+        "   clblasColumnMajor\n"
+        "} clblasOrder;\n"
+        "\n"
+        "typedef enum clblasTransposeEnum {\n"
+        "   clblasNoTrans,\n"
+        "   clblasTrans,\n"
+        "   clblasConjTrans\n"
+        "} clblasTranspose;\n"
+        "\n"
+        "typedef enum clblasUploEnum {\n"
+        "   clblasUpper,\n"
+        "   clblasLower\n"
+        "} clblasUplo;\n"
+        "\n"
+        "typedef enum clblasDiagEnum {\n"
+        "   clblasUnit,\n"
+        "   clblasNonUnit\n"
+        "} clblasDiag;\n"
+        "\n"
+        "typedef enum clblasSideEnum {\n"
+        "   clblasLeft,\n"
+        "   clblasRight\n"
+        "} clblasSide;\n\n";
+
+    kgenAddStmt(ctx, enumDecls);
+}
+
+/*
+ * Return the vector length of the temporary used while updating the result.
+ *
+ * The length is 1 for complex types and whenever UPRES_GENERIC or
+ * UPRES_NO_VECTORIZATION is set; *vecName is not touched in that case.
+ * Otherwise it is kextra->vecLenC (with BGF_DISTINCT_VECLEN) or
+ * kextra->vecLen, and vecName is passed on to getVectorTypeName().
+ */
+static unsigned int
+getTmpVecLen(
+    const BlasGenSettings *gset,
+    UpdateResultFlags uflags,
+    const char **vecName)
+{
+    const CLBLASKernExtra *extra = gset->kextra;
+    unsigned int len;
+
+    if (isComplexType(extra->dtype)) {
+        return 1;
+    }
+    if (uflags & (UPRES_GENERIC | UPRES_NO_VECTORIZATION)) {
+        return 1;
+    }
+
+    if (gset->flags & BGF_DISTINCT_VECLEN) {
+        len = extra->vecLenC;
+    }
+    else {
+        len = extra->vecLen;
+    }
+    getVectorTypeName(extra->dtype, len, vecName, NULL);
+
+    return len;
+}
+
+/*
+ * Generate unrolled statements that transfer the result tile between the
+ * register block "c[...]" and the memory addressed through the "uC" pointer
+ * union, using accesses of 'wvlen' elements where possible.
+ *
+ * ctx        - generator context the statements are added to
+ * gset       - generator settings; subdims[1], flags and kextra are read
+ * wvlen      - vector length to access memory with; halved by recursion
+ *              whenever it exceeds the extent being traversed
+ * pitch      - distance, in elements, between consecutive lines of the
+ *              register block
+ * regOff     - element offset into the register block to start from
+ * ldName     - text emitted as the per-line increment of the uC pointer
+ * op, flags  - forwarded to genUpdateResultSingle(); flags also selects
+ *              column-major traversal (UPRES_COLUMN_MAJOR), the transfer
+ *              direction (UPRES_PRIV_DEST) and UPRES_NO_VECTORIZATION
+ * cachedName - if not NULL, used as a printf format taking the line and
+ *              chunk indices; its expansion is appended as " = <text>"
+ *              to the destination expression
+ */
+static void
+updateOptimResultGen(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    unsigned int wvlen,
+    unsigned int pitch,
+    unsigned int regOff,
+    const char *ldName,
+    UpdateResultOp op,
+    UpdateResultFlags flags,
+    const char *cachedName)
+{
+    char tmp[1024];
+    int tra, isDouble;
+    bool useReg = true;
+    char *regRole;
+    char dst[80], src[80];
+    char vchunkTmp[64], vchunkReg[64];
+    unsigned int sizes[2];
+    unsigned int i, j, k;
+    unsigned int off;
+    const char *vfield;
+    DataType dtype = gset->kextra->dtype;
+    bool isPrivDest = ((flags & UPRES_PRIV_DEST) != 0);
+    unsigned int vecLen;     // vector length of the result's register block
+    // vector length to update with at immediate operations
+    unsigned int uplen;
+    // vector length of the temporary storage location
+    unsigned int tmpVecLen;
+    const char *ptrName;
+
+    // sizes[0] is the y extent and sizes[1] the x extent of subdims[1]
+    sizes[0] = (unsigned int)gset->subdims[1].y;
+    sizes[1] = (unsigned int)gset->subdims[1].x;
+
+    j = 0;
+    // tra is used as an index below: sizes[tra] is the number of lines,
+    // sizes[1 - tra] the length of one line
+    tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
+    isDouble = isDoubleBasedType(dtype);
+    vfield = dtypeUPtrField(dtype);
+    vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
+                                                   gset->kextra->vecLen;
+    tmpVecLen = getTmpVecLen(gset, flags, NULL);
+    getVectorTypeName(dtype, wvlen, NULL, &ptrName);
+    if (isComplexType(dtype)) {
+        vecLen = 1;
+    }
+    uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
+
+    /*
+     * Pass recursively over the major dimension with power of 2 vectors.
+     * If the used type size is less than the current vector size,
+     * use assembling/disassembling into/from a temporary vector. This is
+     * for trying to increase effectiveness of operations with the global
+     * memory due to vectorization.
+     */
+    if (wvlen > sizes[1 - tra]) {
+        wvlen /= 2;
+        updateOptimResultGen(ctx, gset, wvlen, pitch, regOff, ldName,
+                             op, flags, cachedName);
+        return;
+    }
+
+    if (wvlen == 1) {
+        kgenAddStmt(ctx, "// Copy with single words\n");
+    }
+    else {
+        const char *s = (isDouble) ? "double" : "float";
+
+        sprintf(tmp, "// Copy with %s%d vectors\n", s, wvlen);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    // one iteration per line of the tile
+    for (i = 0; i < sizes[tra]; i++) {
+        unsigned int roff;
+
+        // register offset of the first element of this line
+        if (tra) {
+            roff = regOff + i;
+        }
+        else {
+            roff = regOff + i * pitch;
+        }
+
+        // one iteration per full chunk of wvlen elements in the line
+        for (j = 0; j < sizes[1 - tra] / wvlen; j++) {
+            if (wvlen > uplen) {
+                if (isPrivDest) {
+                    // load the whole chunk from memory into the temporary
+                    sprintfVecChunk(vchunkTmp, tmpVecLen, wvlen, 0);
+                    sprintf(tmp, "tmp%s = uC.%s[%u];\n",
+                            vchunkTmp, ptrName, j);
+                    kgenAddStmt(ctx, tmp);
+                }
+                else {
+                    // assemble vector
+                    for (k = 0; k < wvlen; k += uplen) {
+                        off = (tra) ? (roff + k * pitch) : (roff + k);
+                        sprintfVecChunk(vchunkTmp, tmpVecLen, uplen, k);
+                        sprintfVecChunk(vchunkReg, vecLen, uplen, off % vecLen);
+                        sprintf(tmp, "tmp%s = c[%u]%s;\n",
+                                vchunkTmp, off / vecLen, vchunkReg);
+                        kgenAddStmt(ctx, tmp);
+                    }
+                }
+            }
+
+            if (isPrivDest && (wvlen > uplen)) {
+                // disassemble temporary vector and do immediate result update
+                for (k = 0; k < wvlen; k += uplen) {
+                    off = (tra) ? (roff + k * pitch) : (roff + k);
+                    sprintfVecChunk(vchunkTmp, tmpVecLen, uplen, k);
+                    sprintfVecChunk(vchunkReg, vecLen, uplen, off % vecLen);
+                    sprintf(src, "tmp%s", vchunkTmp);
+                    sprintf(dst, "c[%u]%s", off / vecLen, vchunkReg);
+                    genUpdateResultSingle(ctx, dst, src, gset, op, flags);
+                }
+            }
+            else {
+                // the source is the assembled temporary instead of a register
+                if (wvlen > uplen) {
+                    sprintfVecChunk(vchunkTmp, tmpVecLen, wvlen, 0);
+                    sprintf(src, "tmp%s", vchunkTmp);
+                    useReg = false;
+                }
+
+                if (!isPrivDest) {
+                    // memory is the destination; the register (if used) is
+                    // the source
+                    sprintf(dst, "uC.%s[%u]", ptrName, j);
+                    if (cachedName) {
+                        char *p = dst + strlen(dst);
+                        strcat(p, " = ");
+                        p = dst + strlen(dst);
+                        sprintf(p, cachedName, i, j);
+                    }
+                    regRole = src;
+                }
+                else {
+                    // registers are the destination; memory is the source
+                    useReg = true;
+                    regRole = dst;
+                    sprintf(src, "uC.%s[%u]", ptrName, j);
+                }
+
+                if (useReg) {
+                    sprintfVecChunk(vchunkReg, vecLen, uplen, roff % vecLen);
+                    sprintf(regRole, "c[%u]%s", roff / vecLen, vchunkReg);
+                }
+
+                genUpdateResultSingle(ctx, dst, src, gset, op, flags);
+            }
+
+            // update register offset
+            if (tra) {
+                roff += wvlen * pitch;
+            }
+            else {
+                roff += wvlen;
+            }
+        }
+
+        // move the destination pointer to the next line
+        if ((i != sizes[tra] - 1)) {
+            sprintf(tmp, "uC.%s += %s;\n", vfield, ldName);
+            kgenAddStmt(ctx, tmp);
+            if (tra) {
+                kgenAddBlankLine(ctx);
+            }
+        }
+    }
+
+    // a tail shorter than wvlen is left in every line: handle it with
+    // half the vector length
+    if (j * wvlen != sizes[1 - tra]) {
+        // increment pointers
+        if (tra) {
+            regOff += j * wvlen * pitch;
+        }
+        else {
+            regOff += j * wvlen;
+        }
+
+        sprintf(tmp, "\n"
+                     "uC.%s = tmpC.%s + %u;\n"
+                     "tmpC = uC;\n",
+                vfield, vfield, j * wvlen);
+        kgenAddStmt(ctx, tmp);
+
+        // go down
+        // NOTE(review): 'sizes' is local and is not passed to the recursive
+        // call, which reads the extents from gset->subdims again - confirm
+        // that the reduced extent is really not needed by the callee.
+        sizes[1 - tra] -= j * wvlen;
+        wvlen /= 2;
+        updateOptimResultGen(ctx, gset, wvlen, pitch, regOff, ldName,
+                             op, flags, cachedName);
+    }
+}
+
+/*
+ * Emit the generic variant of the result update code, i.e. the variant whose
+ * row/column bounds are run-time variables named in 'uvars'.
+ *
+ * ctx        - generator context the statements are appended to
+ * gset       - generator settings; the data type, the vector lengths and the
+ *              tile sizes (subdims[1]) are read from it
+ * pitch      - row stride, in elements, used to index the source tile
+ * uvars      - names of the variables referenced by the generated code
+ * op         - update operation, forwarded to genUpdateResultSingle()
+ * flags      - update flags; they select the code shape produced here and
+ *              are forwarded to genUpdateResultSingle() as well
+ * cachedName - optional printf-like pattern; every "%u" in it is turned into
+ *              "%s", the pattern is expanded with "i" and "[j]" and the
+ *              result is appended to the destination as " = <expansion>".
+ *              Used only when UPRES_INDEXING_WITH_CONSTANTS is not set.
+ *
+ * With UPRES_INDEXING_WITH_CONSTANTS an unrolled sequence of
+ * "if (<bound>) { switch (<bound>) { case N: ... } }" blocks with constant
+ * indices is generated; otherwise two nested "for" loops over i and j.
+ */
+static void
+updateGenericResultGen(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    size_t pitch,
+    UpresVarNames* uvars,
+    UpdateResultOp op,
+    UpdateResultFlags flags,
+    const char *cachedName)
+{
+    char tmp[1024], dst[128], src[128];
+    const char *boundNames[2] = {uvars->nrRows, uvars->nrCols};
+    const char *vecType = NULL;
+    const char *vFieldVectorized;
+    DataType dtype = gset->kextra->dtype;
+    unsigned int wvlen;
+    unsigned int sizes[2];
+    const char*  vfield = dtypeUPtrField(dtype);
+    // normalize every masked flag to 0/1 so that the result does not depend
+    // on how 'bool' is defined
+    bool tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
+    bool row = ((flags & UPRES_TAIL_ROW) != 0);
+    bool col = ((flags & UPRES_TAIL_COL) != 0);
+    bool iwc = ((flags & UPRES_INDEXING_WITH_CONSTANTS) != 0);
+    int l0;
+    int l1;
+    unsigned int vecLen;     // vector length of the result's register block
+    // vector length to update with at immediate operations
+    unsigned int uplen;
+    char vchunkReg[64];
+    bool revert = false;
+
+    vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
+                                                   gset->kextra->vecLen;
+    if (isComplexType(dtype)) {
+        vecLen = 1;
+    }
+    // the generic path always updates element by element
+    uplen = 1;
+
+    sizes[0] = (unsigned int)gset->subdims[1].y;
+    sizes[1] = (unsigned int)gset->subdims[1].x;
+
+    if (iwc) {
+        const char* l0var =  boundNames[tra];
+        revert =  (tra && col) || (!tra && row);
+
+        if (revert) {
+            // start from the last line and walk backwards
+            sprintf(tmp, "uC.%s += (%s-1) * %s;\n", vfield, l0var, uvars->ld);
+        }
+        else {
+            sprintf(tmp, "\n");
+        }
+        kgenAddStmt(ctx, tmp);
+
+    }
+    wvlen = getTmpVecLen(gset, flags, &vecType);
+    getVectorTypeName(dtype, wvlen, NULL, &vFieldVectorized);
+    sprintf(tmp, "res.%s = c;\n", vFieldVectorized);
+    kgenAddStmt(ctx, tmp);
+
+    if (flags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) {
+        if (iwc) {
+            // with constant indexing the tail offset is folded into the
+            // indices generated below, so only a separator is emitted
+            sprintf(tmp, "\n");
+        }
+        else {
+            char offStr[64];
+            char *p = offStr;
+
+            offStr[0] = '\0';
+            if (row) {
+                // size_t has no portable "%lu" mapping, hence the cast
+                sprintf(offStr, " + (%u - %s) * %lu",
+                        sizes[0], uvars->nrRows, (unsigned long)pitch);
+                p += strlen(offStr);
+            }
+            if (col) {
+                sprintf(p, " + (%u - %s)", sizes[1], uvars->nrCols);
+            }
+            sprintf(tmp, "res.%s = res.%s%s;\n", vfield, vfield, offStr);
+        }
+        kgenAddStmt(ctx, tmp);
+
+    }
+    if (iwc) {
+        int l0st = 1; int l0en = sizes[tra];
+        int l1st = 1; int l1en = sizes[1-tra];
+
+        const char* l0var =  boundNames[tra];
+        const char* l1var = boundNames[1-tra];
+
+        for (l0 = l0en; l0 >= l0st; l0--) {
+
+            sprintf(tmp, "if (%s) ",l0var);
+            kgenBeginBranch(ctx, tmp);
+
+            sprintf(tmp, "switch (%s)", l1var);
+            kgenBeginBranch(ctx, tmp);
+
+            for (l1 = l1en; l1 >= l1st; l1--) {
+                int resId;
+
+                sprintf(tmp, "case %d:\n", l1);
+                kgenAddStmt(ctx, tmp);
+
+                // linear index of the source element within the tile
+                if (tra) {
+                    resId = (row)
+                             ? (l1en-l1)*(int)pitch
+                             : (l1-l1st)*(int)pitch;
+
+                    resId += (col)? (l0-l0st): (l0en-l0);
+                }
+                else {
+                    resId = (row)
+                            ? (l0-l0st)*(int)pitch
+                            : (l0en-l0)*(int)pitch;
+                    resId += (col)? (l1en-l1) : (l1-l1st);
+                }
+
+                if ((tra && row) || (!tra && col)) {
+                     sprintf(dst, "uC.%s[(%s+%d) %% %i]",
+                             vfield, l1var, (l1en - l1),  (int)l1en);
+                }
+                else {
+                   sprintf(dst, "uC.%s[%d]", vfield, (l1-l1st));
+                }
+                sprintfVecChunk(vchunkReg, vecLen, uplen, resId % vecLen);
+                sprintf(src, "c[%u]%s", resId / vecLen, vchunkReg);
+
+                if (flags & UPRES_PRIV_DEST) {
+                    genUpdateResultSingle(ctx, src, dst, gset, op, flags);
+                }
+                else {
+                    genUpdateResultSingle(ctx, dst, src, gset, op, flags);
+                }
+            }
+            kgenEndBranch(ctx, NULL);
+
+            if (revert) {
+                sprintf(tmp, "uC.%s -= %s;\n", vfield, uvars->ld);
+            }
+            else {
+                sprintf(tmp, "uC.%s += %s;\n", vfield, uvars->ld);
+            }
+
+            kgenAddStmt(ctx, tmp);
+
+            sprintf(tmp, "%s--;\n", l0var);
+            kgenAddStmt(ctx, tmp);
+            kgenEndBranch(ctx, NULL);
+        }
+
+    }
+    else {
+
+        sprintf(tmp, "for (i = 0; i < %s; i++)", boundNames[tra]);
+        kgenBeginBranch(ctx, tmp);
+        sprintf(tmp, "for (j = 0; j < %s; j++)", boundNames[1 - tra]);
+        kgenBeginBranch(ctx, tmp);
+        sprintf(dst, "uC.%s[i * %s + j]", vfield, uvars->ld);
+        if (cachedName) {
+            size_t i, len;
+            /*
+             * NOTE(review): the pattern is copied unbounded into an 80 byte
+             * buffer and its expansion is appended to 'dst' (128 bytes);
+             * callers are expected to pass short patterns - confirm.
+             */
+            char tmpcachedName[80] = " = ";
+            strcat(tmpcachedName, cachedName);
+            // the length is not affected by the in-place replacement below
+            len = strlen(tmpcachedName);
+            for (i = 3; i < len; i++) {
+                if (strncmp(tmpcachedName+i, "%u", 2) == 0) {
+                    tmpcachedName[i+1] = 's';
+                }
+            }
+            sprintf(tmp, tmpcachedName, "i", "[j]");
+            strcat(dst, tmp);
+        }
+        if (tra) {
+            sprintf(src, "res.%s[j * %lu + i]", vfield, (unsigned long)pitch);
+        }
+        else {
+            sprintf(src, "res.%s[i * %lu + j]", vfield, (unsigned long)pitch);
+        }
+        if (flags & UPRES_PRIV_DEST) {
+            genUpdateResultSingle(ctx, src, dst, gset, op, flags);
+        }
+        else {
+            genUpdateResultSingle(ctx, dst, src, gset, op, flags);
+        }
+        kgenEndBranch(ctx, NULL);
+        kgenEndBranch(ctx, NULL);
+    }
+}
+
+/*
+ * Generate the code that updates a result matrix from the accumulated tile,
+ * either as a standalone function or inline (UPRES_INLINE).
+ *
+ * ctx       - generator context the code is appended to
+ * gset      - generator settings; data type, vector lengths and the tile
+ *             sizes (subdims[1]) are read from it
+ * op        - update operation; must be UPRES_SUM when UPRES_WITH_BETA is set
+ * flags     - update flags selecting the generated variant
+ * uvarNames - variable names for the inline variant; it is dereferenced
+ *             unconditionally when UPRES_INLINE is set. For the function
+ *             variant only its 'cachedName' member is read, and only if the
+ *             pointer is not NULL.
+ *
+ * Returns -EINVAL for UPRES_WITH_BETA combined with an operation other than
+ * UPRES_SUM, -EOVERFLOW when the last checked generator call reports a
+ * failure, and 0 otherwise.
+ */
+int
+updateResultGenOld(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    UpdateResultOp op,
+    UpdateResultFlags flags,
+    const UpresVarNames *uvarNames)
+{
+    char tmp[1024];
+    char *p = tmp;
+    const char *typeName;
+    const char *vecType = NULL;
+    const char *vfield;
+    const char *suff1;
+    const char *suff2;
+    int ret = 0;
+    unsigned int sizes[2];
+    bool generic, tra;
+    unsigned int wvlen;     // length of vectors to copy with
+    unsigned int uplen;     // length of vectors to update result with
+    size_t pitch;
+    char LG;
+    DataType dtype = gset->kextra->dtype;
+    unsigned int vecLen;
+    bool isInlined = (flags & UPRES_INLINE);
+    UpresVarNames uvars;
+
+    vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC :
+                                                   gset->kextra->vecLen;
+    sizes[0] = (unsigned int)gset->subdims[1].y;
+    sizes[1] = (unsigned int)gset->subdims[1].x;
+
+    // complex types are always handled with a vector length of 1
+    if (isComplexType(dtype)) {
+        vecLen = 1;
+    }
+
+    // beta is only accepted together with the summation operation
+    if ((flags & UPRES_WITH_BETA) && (op != UPRES_SUM)) {
+        return -EINVAL;
+    }
+
+    tra = ((flags & UPRES_COLUMN_MAJOR) != 0);
+    generic = ((flags & UPRES_GENERIC) != 0);
+    typeName = dtypeBuiltinType(dtype);
+    vfield = dtypeUPtrField(dtype);
+    // tile row length rounded up to a whole number of vectors
+    pitch = roundUp(sizes[1], vecLen);
+
+    // select write vectorization
+    wvlen = getTmpVecLen(gset, flags, &vecType);
+    uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen;
+
+    // pieces of the generated function name: updateResult<suff1><suff2><LG>
+    suff1 = (generic) ? "Generic" : "";
+    suff2 = (flags & UPRES_PRIV_DEST) ? "Rev" : "";
+    LG = (flags & UPRES_USE_LDS) ? 'L' : 'G';
+
+    if (!isInlined) {
+        const char *outTypeName;
+        const char *memPref = (flags & UPRES_USE_LDS) ? "__local" :
+                                                           "__global";
+
+        getResultGPRsInfo(dtype, NULL, vecLen, NULL, &outTypeName);
+
+        // define the function
+        sprintf(tmp, "void\n"
+                     "updateResult%s%s%c(\n"
+                     "    %s %s *C,\n"
+                     "    %s *c,\n"
+                     "    %s alpha,\n"
+                     "    uint startRow,\n"
+                     "    uint startCol,\n"
+                     "    uint ld",
+                     suff1, suff2, LG, memPref, typeName,
+                     outTypeName, typeName);
+
+        // 'p' tracks the end of the declaration built so far
+        p += strlen(p);
+        if (flags & UPRES_WITH_BETA) {
+            sprintf(p, ",\n    %s beta", typeName);
+            p += strlen(p);
+        }
+        if (generic) {
+            sprintf(p, ",\n    uint nrRows,\n"
+                       "    uint nrCols");
+        }
+
+        // the function variant refers to its own parameter names
+        uvars.result = "C";
+        uvars.ld = "ld";
+        uvars.startRow = "startRow";
+        uvars.startCol = "startCol";
+        uvars.nrRows = "nrRows";
+        uvars.nrCols = "nrCols";
+
+        strcat(p, ")\n");
+        kgenDeclareFunction(ctx, tmp);
+        kgenBeginFuncBody(ctx);
+    }
+    else {
+        // the inline variant uses the names supplied by the caller
+        memcpy(&uvars, uvarNames, sizeof(uvars));
+    }
+
+    // declare local variables
+    sprintf(tmp, "%cPtr uC;\n", LG);
+    kgenAddStmt(ctx, tmp);
+    if (generic) {
+        kgenAddStmt(ctx, "int i, j;\n"
+                         "PPtr res;\n");
+    }
+    else {
+        /*
+         * temporary pointer to pass correctly over the
+         * destination array since destination rows can be
+         * not aligned on a vector bound
+         */
+        if (sizes[1 - tra] % wvlen != 0) {
+            sprintf(tmp, "%cPtr tmpC;\n", LG);
+            kgenAddStmt(ctx, tmp);
+        }
+        // a temporary vector is declared when copying is wider than updating
+        if (wvlen > uplen) {
+            sprintf(tmp, "%s tmp;\n", vecType);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+    if (isComplexType(dtype) && !(flags & UPRES_WITHOUT_ALPHA)) {
+        declareComplexMultParts(ctx, "alpha", typeName);
+        if (flags & UPRES_WITH_BETA) {
+            declareComplexMultParts(ctx, "beta", typeName);
+        }
+
+    }
+    kgenAddBlankLine(ctx);
+
+    // point uC at the first element to update; for the column major case
+    // the roles of the start row and the start column are swapped
+    if (tra) {
+        sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n",
+                vfield, uvars.result, uvars.startCol, uvars.ld,
+                uvars.startRow);
+    }
+    else {
+        sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n",
+                vfield, uvars.result, uvars.startRow, uvars.ld,
+                uvars.startCol);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    // same condition as the one guarding the tmpC declaration above
+    if ((sizes[1 - tra] % wvlen != 0) && !generic) {
+        kgenAddStmt(ctx, "tmpC = uC;\n");
+    }
+    ret = kgenAddBlankLine(ctx);
+
+    if (generic) {
+        updateGenericResultGen(ctx, gset, pitch, &uvars, op, flags,
+                               uvarNames ? uvarNames->cachedName : NULL);
+    }
+    else {
+        updateOptimResultGen(ctx, gset, wvlen, (unsigned int)pitch, 0, uvars.ld,
+                           op, flags, uvarNames ? uvarNames->cachedName : NULL);
+    }
+
+    // for the function variant this overwrites the status obtained above
+    if (!isInlined) {
+        ret = kgenEndFuncBody(ctx);
+    }
+
+    return (ret) ? -EOVERFLOW : 0;
+}
diff --git a/src/library/blas/gens/legacy/blas_kgen_legacy.h b/src/library/blas/gens/legacy/blas_kgen_legacy.h
new file mode 100644
index 0000000..cd419db
--- /dev/null
+++ b/src/library/blas/gens/legacy/blas_kgen_legacy.h
@@ -0,0 +1,195 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef BLAS_KGEN_LEGACY_H_
+#define BLAS_KGEN_LEGACY_H_
+
+#include "../blas_kgen.h"
+
+/**
+ * @internal
+ * @brief Block multiplier flags
+ *
+ * Every non-zero value occupies its own bit, so the flags can be combined
+ * with bitwise OR and tested with bitwise AND. Note that the enum tag
+ * (BlkmulFlags) and the typedef name (BlkMulFlags) differ in letter case.
+ * @ingroup BLAS_MAJOR_GENS
+ */
+typedef enum BlkmulFlags {
+    BLKMUL_NO_FLAGS,            /**< No flags */
+    BLKMUL_TRANSPOSE = 0x01,    /**< Transpose result */
+    BLKMUL_IMAGE_PACKED = 0x02, /**< Data in image are packed */
+    /**
+     * Accumulate multiplication results to a
+     * private location provided by caller
+     */
+    BLKMUL_OUTPUT_PRIVATE = 0x04,
+    BLKMUL_SKEW_ROW = 0x08,     /**< Use skew over block rows */
+    BLKMUL_SKEW_COLUMN = 0x10,  /**< Use skew over block columns */
+    BLKMUL_INLINE = 0x20,       /**< Generate an inline version */
+    BLKMUL_TRANSPOSED_B = 0x40, /**< Block B is transposed */
+    /** Don't use "&" operation in cyclic address evaluation, use always "%" */
+    BLKMUL_AVOID_AND = 0x80
+} BlkMulFlags;
+
+/**
+ * @internal
+ * @brief Block multiplier core
+ *
+ * Stored in BlkMulOpts::core. The enumerators are mutually exclusive
+ * alternatives rather than combinable flags.
+ * @ingroup BLAS_MAJOR_GENS
+ */
+typedef enum BlkmulCore {
+    /** Use separate multiplication and summation implemented by hand */
+    BLKMUL_SEPARATE_MULADD,
+    /** Use the 'dot' function */
+    BLKMUL_DOT,
+    /** Use the 'mad' function */
+    BLKMUL_MAD
+} BlkmulCore;
+
+/**
+ * @internal
+ * @brief Argument names for the inline version of the block
+ *        multiplier
+ *
+ * Every member is the textual name of a variable in the generated code,
+ * not the value of that variable.
+ * @ingroup BLAS_MAJOR_GENS
+ */
+typedef struct BlkmulArgNames {
+    const char *coordA;     /**< Name of the matrix A start coordinates */
+    const char *coordB;     /**< Name of the matrix B start coordinates */
+    const char *skewRow;    /**< Name of the skew over rows */
+    const char *skewCol;    /**< Name of the skew over columns */
+    const char *k;          /**< Counter name in the loop over K */
+    const char *vectBoundK; /**< Name of the bound in the loop over K */
+} BlkmulArgNames;
+
+/**
+ * @internal
+ * @brief Options for matrix block multiplication
+ *        generator
+ * @ingroup BLAS_MAJOR_GENS
+ */
+typedef struct BlkMulOpts {
+    /** OpenCL memory object type storing matrix (whole or its blocks) A */
+    CLMemType aMobj;
+    /** OpenCL memory object type storing matrix (whole or its blocks) B */
+    CLMemType bMobj;
+    BlkMulFlags flags;      /**< Specific flags */
+    BlkmulCore core;        /**< Multiply and add core */
+    /** List of argument names for the inline version */
+    BlkmulArgNames argNames;
+} BlkMulOpts;
+
+/*
+ * NOTE(review): the definition is not part of this header; presumably it
+ * emits the declarations of the BLAS enumerations into the source kept in
+ * 'ctx' - confirm against the implementation.
+ */
+void
+declareBlasEnums(struct KgenContext *ctx);
+
+/**
+ * @internal
+ * @brief Matrix block multiplication generator
+ *
+ * @param[out] ctx          Generator context
+ * @param[in] subdims       Subproblem dimensions; the first level reflects
+ *                          dimensions of the large blocks processed with the
+ *                          whole work group, and the second level
+ *                          reflects sizes of immediately multiplied small
+ *                          blocks within the single work item
+ * @param[in] dtype         Data type the multiplying function will be
+ *                          generated for
+ * @param[in] opts          Block multiplication options
+ *
+ * Generated functions have the following definitions: \n
+ *\n
+ * For the buffer based version:
+ * @code
+ * void
+ * funcName(
+ *     <type> alpha,
+ *     LPtr A,
+ *     LPtr B,
+ *     LPtr C,
+ *     [,int2 skewRow]
+ *     [,int skewCol]);
+ * @endcode
+ *
+ * Function naming rule:
+ * (type prefix)gemmBlock[Transp]_<width>_<height>
+ *\n
+ * It's assumed A, B and C point to start of data to be
+ * processed during this step.
+ *\n
+ * For the image based version: \n
+ * @code
+ * void
+ * funcName(
+ *     <type> alpha,
+ *     __read_only image2d_t A,
+ *     int2 coordA,
+ *     __read_only image2d_t B,
+ *     int2 coordB,
+ *     LPtr C,
+ *     [,int2 skewRow],
+ *     [,int skewCol]);
+ * @endcode
+ *
+ * Where coordA and coordB mean start image coordinates to fetch data from.
+ *\n
+ * For the image based version a mixed variant is possible when
+ * either A or B blocks are passed through the local memory.
+ *\n
+ * The 'skewRow' and 'skewCol' arguments are present only if the
+ * 'BLKMUL_SKEW_ROW' and 'BLKMUL_SKEW_COLUMN' flags are specified,
+ * respectively. The 'y' field of the row skew is for the block A, and the
+ * 'x' field is for the block B.
+ *\n
+ * The output result can be put directly into a private location provided by
+ * the caller instead of the local one. This is achieved by specifying the
+ * 'BLKMUL_OUTPUT_PRIVATE' flag.
+ *\n
+ * Pointer to this location should have the following types depending on the type
+ * of processed data: \n
+ * - float4 - for float
+ * - float2 - for complex float
+ * - double2 - for double and complex double
+ *\n\n
+ * Alpha is not taken in this case.
+ *\n
+ * The multiplier can be generated either as a dedicated function or in an
+ * inline form inserted into a kernel. \n In the inline version the block
+ * multiplier in fact becomes a tile multiplier. In this case the caller
+ * should provide the iteration over K.
+ *
+ * @return 0 on success, -EOVERFLOW on source buffer overflowing
+ */
+
+/**
+ * @internal
+ * @defgroup BLAS_MAJOR_GENS BLAS specific generators
+ * @ingroup MAJOR_GENS
+ */
+/*@{*/
+/*
+ * Matrix block multiplication generator; the detailed description of the
+ * generated functions is given in the "Matrix block multiplication
+ * generator" comment earlier in this header.
+ */
+int
+blkMulGen(
+    struct KgenContext *ctx,
+    const SubproblemDim subdims[2],
+    DataType dtype,
+    const BlkMulOpts *opts);
+
+/*
+ * Legacy generator of the result update code.
+ *
+ * NOTE(review): the following is taken from the definition in
+ * blas_kgen_legacy.c - keep it in sync. Returns -EINVAL when
+ * UPRES_WITH_BETA is combined with an operation other than UPRES_SUM,
+ * -EOVERFLOW when the generator context reports a failure and 0 on success.
+ * 'uvarNames' must not be NULL when UPRES_INLINE is set.
+ */
+int
+updateResultGenOld(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    UpdateResultOp op,
+    UpdateResultFlags flags,
+    const UpresVarNames *uvarNames);
+
+/*@}*/
+
+#endif /* BLAS_KGEN_LEGACY_H_ */
diff --git a/src/library/blas/gens/legacy/blkmul.c b/src/library/blas/gens/legacy/blkmul.c
new file mode 100644
index 0000000..8b78d80
--- /dev/null
+++ b/src/library/blas/gens/legacy/blkmul.c
@@ -0,0 +1,823 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * TODO: throw away this generator and replace it with tileMulGen() in all
+ *       kernel generators
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <defbool.h>
+#include <clblas_stddef.h>
+#include <sys/types.h>
+#include <kerngen.h>
+#include <matrix_props.h>
+#include <matrix_dims.h>
+#include <dis_warning.h>
+
+#include "../blas_kgen.h"
+#include "blas_kgen_legacy.h"
+
+#define MAX_LENGTH 4096
+#define BITS_INT (sizeof(int) * 8)
+
+/*
+ * Kind of the vector product generated by genVecMul() and genMadMul() and
+ * accumulated by genVecSum(). The value is also used as an index into the
+ * swizzle suffix tables of the two former generators.
+ */
+typedef enum VectMulType {
+    /* 'b' operand is taken as is */
+    VECT_MULT_REAL,
+    /* 'b' taken as is; genVecSum() subtracts the odd components */
+    VECT_MULT_COMPLEX_REAL,
+    /* 'b' operand is swizzled with ".yxwz" */
+    VECT_MULT_IMAG_FLOAT,
+    /* 'b' operand is swizzled with ".yx" */
+    VECT_MULT_IMAG_DOUBLE
+} VectMulType;
+
+/* Tell whether 'a' is a non-zero value having exactly one bit set. */
+static __inline bool
+isPower2(size_t a)
+{
+    /* zero has no bit set at all */
+    if (a == 0) {
+        return 0;
+    }
+
+    /* clearing the lowest set bit leaves nothing only for a power of 2 */
+    return (((a - 1) & a) == 0);
+}
+
+/*
+ * Find the widest chunk that can be copied at 'offset': starting from
+ * 'vecLen' the candidate size is halved until it divides the offset or
+ * drops to 1 (or to 0 when 'vecLen' is 0).
+ */
+static unsigned int
+vecChunkSize(size_t offset, size_t vecLen)
+{
+    size_t len = vecLen;
+
+    while ((len > 1) && (offset % len != 0)) {
+        len /= 2;
+    }
+
+    return (unsigned int)len;
+}
+
+/*
+ * Select the operator and the right hand operand used to wrap an address
+ * around 'bound': "&" with 'bound - 1' when the bound is a power of 2 and
+ * BLKMUL_AVOID_AND is not set, "%" with 'bound' itself otherwise.
+ */
+static void
+getCyclicAddrData(
+    BlkMulFlags flags,
+    const char **op,
+    size_t *value,
+    size_t bound)
+{
+    bool useMask = isPower2(bound) && !(flags & BLKMUL_AVOID_AND);
+
+    *op = (useMask) ? "&" : "%";
+    *value = (useMask) ? (bound - 1) : bound;
+}
+
+/*
+ * Print into 'buf' the expression addressing an element of the A or B input
+ * block in the generated code.
+ *
+ * buf         - output buffer; the caller must provide enough room
+ * mrole       - MATRIX_A selects the A block, any other value the B block
+ * row, col    - constant row and column of the element
+ * vecPitch    - row pitch of the block
+ * bheight     - block height, used as the bound of the row skew
+ * opts        - multiplier options (memory object types and flags)
+ * argNames    - names of the variables referenced by the expression
+ * singleStepK - if set, the column offset is not advanced with the K counter
+ *
+ * For an image the result is an "(int2)(...)" coordinate, for a buffer it
+ * is a linear offset. The row skew is ignored for a transposed block B.
+ */
+static void
+sprintfInputOffset(
+    char *buf,
+    MatrixRole mrole,
+    int row,
+    int col,
+    size_t vecPitch,
+    size_t bheight,
+    const BlkMulOpts *opts,
+    BlkmulArgNames *argNames,
+    bool singleStepK)
+{
+    const char *vfield;
+    const char *coordName;
+    const char *op;
+    size_t bound;
+    char colOff[64], rowOff[64];
+    CLMemType mtype;
+    BlkMulFlags flags = opts->flags;
+
+    vfield = (mrole == MATRIX_A) ? "y" : "x";
+    mtype = (mrole == MATRIX_A) ? opts->aMobj : opts->bMobj;
+    if ((mrole == MATRIX_B) && (flags & BLKMUL_TRANSPOSED_B)) {
+        flags &= ~BLKMUL_SKEW_ROW;
+    }
+
+    /*
+     * size_t values are cast to unsigned long wherever they are printed
+     * with "%lu": the two types differ on LLP64 platforms
+     */
+    if (flags & BLKMUL_SKEW_ROW) {
+        getCyclicAddrData(flags, &op, &bound, bheight);
+        sprintf(rowOff, "((%s.%s + %d) %s %lu)",
+                argNames->skewRow, vfield, row, op, (unsigned long)bound);
+    }
+    else {
+        sprintf(rowOff, "%d", row);
+    }
+
+    if (flags & BLKMUL_SKEW_COLUMN) {
+        getCyclicAddrData(flags, &op, &bound, vecPitch);
+        if (flags & BLKMUL_INLINE) {
+            if (singleStepK) {
+                sprintf(colOff, "%d", col);
+            }
+            else {
+                // the inline version wraps around a run-time bound
+                sprintf(colOff, "(%s + %s + %d) %% %s",
+                        argNames->skewCol, argNames->k, col,
+                        argNames->vectBoundK);
+            }
+        }
+        else {
+            if (singleStepK) {
+                sprintf(colOff, "%s", argNames->skewCol);
+            }
+            else {
+                sprintf(colOff, "((skewCol + k + %d) %s %lu)",
+                        col, op, (unsigned long)bound);
+            }
+        }
+    }
+    else {
+        sprintf(colOff, "%d", col);
+    }
+
+    if (mtype == CLMEM_IMAGE) {
+        coordName = (mrole == MATRIX_A) ? argNames->coordA : argNames->coordB;
+        if (flags & BLKMUL_IMAGE_PACKED) {
+            sprintf(buf, "(int2)(%s.x + mad24(%s, %lu, %s), %s.y)",
+                    coordName, rowOff, (unsigned long)vecPitch, colOff,
+                    coordName);
+        }
+        else {
+            sprintf(buf, "(int2)(%s.x + %s, %s.y + %s)",
+                    coordName, colOff, coordName, rowOff);
+        }
+    }
+    else {
+        if (flags & BLKMUL_SKEW_ROW) {
+            sprintf(buf, "mad24(%s, %lu, %s)", rowOff,
+                    (unsigned long)vecPitch, colOff);
+        }
+        else {
+            sprintf(buf, "%lu + %s", (unsigned long)(row * vecPitch), colOff);
+        }
+    }
+}
+
+/*
+ * Generate the dot() based accumulation of one result element:
+ * "c[<reg>].<comp> += dot(a[k], b[<off>]);" for each of the lenK / vecLen
+ * vectors along K.
+ *
+ * ctx    - generator context the statements are appended to
+ * m, n   - row and column of the result element
+ * nrCols - number of result columns; rounded up to a multiple of vecLen to
+ *          get the register pitch
+ * lenK   - length along K, in elements
+ * vecLen - vector length, in elements
+ */
+static void
+genRealDot(
+    struct KgenContext *ctx,
+    size_t m,
+    size_t n,
+    size_t nrCols,
+    size_t lenK,
+    unsigned int vecLen)
+{
+    size_t k;
+    char tmp[MAX_LENGTH], prefix[MAX_LENGTH];
+    const char *vect = "xyzw";
+    size_t regPitch = nrCols;
+    size_t off;
+
+    if (regPitch % vecLen) {
+        regPitch += vecLen - regPitch % vecLen;
+    }
+
+    // size_t values are cast since they are printed with "%lu"
+    off = m * regPitch + n;
+    sprintf(prefix, "c[%lu].%c += ", (unsigned long)(off / vecLen),
+            vect[off % vecLen]);
+
+    for (k = 0; k < lenK / vecLen; k++) {
+        off = n * lenK / vecLen + k;
+        sprintf(tmp, "%sdot(a[%lu], b[%lu]);\n", prefix, (unsigned long)k,
+                (unsigned long)off);
+        kgenAddStmt(ctx, tmp);
+    }
+}
+
+/*
+ * Generate the vector multiplication expression
+ * "sum = a[0] * b[<off>]<suff> + a[1] * b[<off> + 1]<suff> + ...;"
+ * for the column 'currCol' and 'lenK' vectors along K; 'type' selects the
+ * swizzle suffix applied to the 'b' operands.
+ */
+static void
+genVecMul(
+    struct KgenContext *ctx,
+    size_t currCol,
+    size_t lenK,
+    VectMulType type)
+{
+    size_t k;
+    char tmp[MAX_LENGTH];
+    // write position; the buffer must not be passed to sprintf() as both
+    // the destination and a "%s" argument since overlapping is undefined
+    char *p = tmp;
+    const char *suff[] = {"", "", ".yxwz", ".yx"};
+
+    p += sprintf(p, "sum = a[%d] * b[%lu]%s", 0,
+                 (unsigned long)(currCol * lenK), suff[type]);
+    for (k = 1; k < lenK; k++) {
+        p += sprintf(p, " + a[%lu] * b[%lu]%s", (unsigned long)k,
+                     (unsigned long)(currCol * lenK + k), suff[type]);
+    }
+    strcpy(p, ";\n");
+    kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Generate the vector multiplication using mad()'s: a plain product for the
+ * first vector along K followed by "sum = mad(a[k], b[<off> + k]<suff>, sum);"
+ * for each of the remaining ones. All the statements are added to the
+ * context with a single call.
+ */
+static void
+genMadMul(
+    struct KgenContext *ctx,
+    size_t currCol,
+    size_t lenK,
+    VectMulType type)
+{
+    size_t k;
+    char tmp[MAX_LENGTH];
+    // write position; appending through "%s" of the destination buffer
+    // itself would make sprintf() work on overlapping objects
+    char *p = tmp;
+    const char *suff[] = {"", "", ".yxwz", ".yx"};
+
+    p += sprintf(p, "sum = a[%d] * b[%lu]%s;\n", 0,
+                 (unsigned long)(currCol * lenK), suff[type]);
+    for (k = 1; k < lenK; k++) {
+        p += sprintf(p, "sum = mad(a[%lu], b[%lu]%s, sum);\n",
+                     (unsigned long)k, (unsigned long)(currCol * lenK + k),
+                     suff[type]);
+    }
+    kgenAddStmt(ctx, tmp);
+}
+
+
+/*
+ * Generate the statement accumulating all the components of the 'sum'
+ * vector into one component of the result registers:
+ * "c[<reg>].<comp> += sum.x + sum.y ...;"
+ *
+ * For complex types the summation spans 2 * vecLen components and the
+ * destination component is 0 for VECT_MULT_COMPLEX_REAL and 1 otherwise;
+ * with VECT_MULT_COMPLEX_REAL the odd components are subtracted.
+ * For real types the register pitch is 'nrCols' rounded up to a multiple
+ * of 'vecLen'.
+ */
+static void
+genVecSum(
+    struct KgenContext *ctx,
+    DataType dataType,
+    size_t currRow,
+    size_t currCol,
+    size_t nrCols,
+    unsigned int vecLen,
+    VectMulType mulType)
+{
+    const char *vect = "xyzw";
+    unsigned long vecOff, regOff;
+    char c;
+    unsigned int k;
+    size_t pitch = nrCols;
+    char tmp1[MAX_LENGTH], tmp2[MAX_LENGTH];
+    // write position within tmp1; the buffer is never fed back to
+    // sprintf() as an argument since overlapping is undefined
+    char *p = tmp1;
+    unsigned int sumLen;
+
+    // get offset taking into account alignment
+    if ((pitch % vecLen) && !isComplexType(dataType)) {
+        pitch += vecLen - pitch % vecLen;
+    }
+
+    regOff = (unsigned int)(currRow * pitch + currCol);
+    if (isComplexType(dataType)) {
+        vecOff = (mulType == VECT_MULT_COMPLEX_REAL) ? 0 : 1;
+        sumLen = vecLen * 2;
+    }
+    else {
+        vecOff = regOff % vecLen;
+        regOff /= vecLen;
+        sumLen = vecLen;
+    }
+
+    p += sprintf(p, " sum.x");
+    for (k = 1; k < sumLen; k++) {
+        c = ((mulType == VECT_MULT_COMPLEX_REAL) && (k & 1)) ? '-' : '+';
+        p += sprintf(p, " %c sum.%c", c, vect[k]);
+    }
+
+    sprintf(tmp2, "c[%lu].%c += %s;\n", regOff, vect[vecOff], tmp1);
+    kgenAddStmt(ctx, tmp2);
+}
+
+/*
+ * Generate the multiplication of a row of A by a column of B with mad()
+ * operations accumulating directly into the result registers 'c'.
+ *
+ * ctx              - generator context the statements are appended to
+ * dataType         - type of the processed data
+ * currRow, currCol - coordinates of the result element
+ * nrCols           - number of result columns
+ * lenK             - number of vectors along K
+ * vecLen           - vector length
+ * vectorized       - for real types, produce a whole result vector per
+ *                    statement instead of a single component
+ *
+ * NOTE(review): the complex branch always emits "(float2)" casts, so it
+ * looks usable for the single precision complex type only - confirm.
+ */
+static void
+genMad(
+    struct KgenContext *ctx,
+    DataType dataType,
+    size_t currRow,
+    size_t currCol,
+    size_t nrCols,
+    size_t lenK,
+    unsigned int vecLen,
+    bool vectorized)
+{
+    const char *vect = {"xyzw"};
+    unsigned long vecOff, regOff;
+    unsigned int k;
+    size_t pitch = nrCols;
+    char tmp[MAX_LENGTH];
+    int bIndex;
+
+    // get offset taking into account alignment
+    if ((pitch % vecLen) && !isComplexType(dataType)) {
+        pitch += vecLen - pitch % vecLen;
+    }
+    regOff = (unsigned int)(currRow * pitch + currCol);
+    vecOff = (unsigned int)(regOff % vecLen);
+
+    if (isComplexType(dataType)) {
+        for (k = 0; k < lenK; k++) {
+            int aIndex = k;
+            bIndex = (int)(currCol * lenK + k);
+
+            sprintf(tmp, "c[%lu] = mad(a[%d].xy, (float2)(b[%d].x), c[%lu]);\n",
+                    regOff, aIndex, bIndex, regOff);
+            kgenAddStmt(ctx, tmp);
+            sprintf(tmp, "c[%lu] = mad(a[%d].yx, (float2)(-b[%d].y, b[%d].y), c[%lu]);\n",
+                    regOff, aIndex, bIndex, bIndex, regOff);
+            kgenAddStmt(ctx, tmp);
+            sprintf(tmp, "c[%lu] = mad(a[%d].zw, (float2)(b[%d].z), c[%lu]);\n",
+                    regOff, aIndex, bIndex, regOff);
+            kgenAddStmt(ctx, tmp);
+            sprintf(tmp, "c[%lu] = mad(a[%d].wz, (float2)(-b[%d].w, b[%d].w), c[%lu]);\n",
+                    regOff, aIndex, bIndex, bIndex, regOff);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+    else {
+        // Real case
+        if (vectorized) {
+            const char *tname = (isDoubleBasedType(dataType)) ? "double" : "float";
+
+            regOff = (unsigned int)(currRow * nrCols / vecLen + currCol);
+            for (k = 0; k < lenK * vecLen; k++) {
+                bIndex = (int)(currCol * lenK * vecLen + k);
+                sprintf(tmp, "c[%lu] = mad((%s%u)a[%u].%c, b[%d], c[%lu]);\n",
+                        regOff, tname, vecLen, k / vecLen, vect[k % vecLen],
+                        bIndex, regOff);
+                kgenAddStmt(ctx, tmp);
+            }
+        }
+        else {
+            // number of components summed up per input vector
+            unsigned int sumLen = vecLen;
+            int dimNum;
+
+            regOff /= vecLen;
+            if (isDoubleBasedType(dataType)) {
+                dimNum = 2;
+            }
+            else {
+                dimNum = 4;
+            }
+
+            for (k = 0; k < sumLen*lenK; k++) {
+                // the index into b is a size_t; cast it for "%lu"
+                sprintf(tmp, "c[%lu].%c = mad(a[%u].%c, b[%lu].%c, "
+                                                    "c[%lu].%c);\n",
+                        regOff, vect[vecOff], k / sumLen, vect[k % dimNum],
+                        (unsigned long)(currCol * lenK + (k / sumLen)),
+                        vect[k % dimNum],
+                        regOff, vect[vecOff]);
+                kgenAddStmt(ctx, tmp);
+            }
+            kgenAddBlankLine(ctx);
+        }
+    }
+}
+
+/*
+ * Generate the statements moving to the next step over K.
+ *
+ * Without BLKMUL_SKEW_COLUMN the A and B positions are advanced: the image
+ * coordinates "coordA"/"coordB" for images, the "A.<ptrNameIn>" and
+ * "B.<ptrNameIn>" pointers otherwise. With the column skew and
+ * subK == vecLen the "skewCol" variable is advanced cyclically instead;
+ * nothing is generated in the remaining case.
+ *
+ * ctx            - generator context the statements are appended to
+ * opts           - multiplier options (memory object types and flags)
+ * subK           - step over K, in elements
+ * pitchA, pitchB - pitches of the A and B blocks
+ * vecLen         - vector length
+ * ptrNameIn      - name of the pointer field used to access the inputs
+ */
+static void
+getUpdateSkewCoords(
+    struct KgenContext *ctx,
+    const BlkMulOpts *opts,
+    size_t subK,
+    size_t pitchA,
+    size_t pitchB,
+    unsigned int vecLen,
+    const char *ptrNameIn)
+{
+    char tmp[1024];
+    bool trb = ((opts->flags & BLKMUL_TRANSPOSED_B) != 0);
+
+    // size_t values are cast below since they are printed with "%lu"
+    if (!(opts->flags & BLKMUL_SKEW_COLUMN)) {
+        kgenAddBlankLine(ctx);
+        if (opts->aMobj == CLMEM_IMAGE) {
+            sprintf(tmp, "coordA.x += %lu;\n",
+                    (unsigned long)(subK / vecLen));
+        }
+        else {
+            sprintf(tmp, "A.%s += %lu;\n", ptrNameIn,
+                    (unsigned long)(subK / vecLen));
+        }
+        kgenAddStmt(ctx, tmp);
+
+        // a transposed B is stepped in elements, not in vectors
+        if (!trb) {
+            subK /= vecLen;
+        }
+        if (opts->bMobj == CLMEM_IMAGE) {
+            const char *vfield = (trb) ? "y" : "x";
+
+            sprintf(tmp, "coordB.%s += %lu;\n", vfield, (unsigned long)subK);
+        }
+        else {
+            size_t u = (trb) ? (subK * pitchB / vecLen) : subK;
+
+            sprintf(tmp, "B.%s += %lu;\n", ptrNameIn, (unsigned long)u);
+        }
+        kgenAddStmt(ctx, tmp);
+    }
+    else if (subK == vecLen) {
+        if (isPower2(pitchA / vecLen)) {
+            sprintf(tmp, "\nskewCol = (skewCol + 1) & %lu;\n",
+                    (unsigned long)(pitchA / vecLen - 1));
+        }
+        else {
+            sprintf(tmp, "\nskewCol = (skewCol + 1) %% %lu;\n",
+                    (unsigned long)(pitchA / vecLen));
+        }
+        kgenAddStmt(ctx, tmp);
+    }
+}
+
+// MUST BE LATER DEPRECATED
+/*
+ * Generate the statements scaling the accumulated m x n register tile 'c'
+ * by alpha and adding it to the output block 'tempC'.
+ *
+ * ctx       - generator context the statements are appended to
+ * dtype     - type of the processed data
+ * m, n      - tile height and width
+ * outPitch  - pitch of the output block
+ * vecLen    - vector length of the register tile
+ * transpose - store the tile transposed, element by element
+ *
+ * NOTE(review): for real types the chunk length 'k' is min(vecChunkSize(),
+ * n - j) and selects the pointer field from 'ptrNames'; a length of 3 maps
+ * to an empty field name - presumably it cannot occur with the tile sizes
+ * in use, confirm.
+ */
+static void
+genScaleAccResults(
+    struct KgenContext *ctx,
+    DataType dtype,
+    size_t m,
+    size_t n,
+    size_t outPitch,
+    unsigned int vecLen,
+    bool transpose)
+{
+    char s[MAX_LENGTH];
+    const char *vect = "xyzw";
+    char vecChunk[6];
+    size_t inOff = 0, outOff, vecOff;
+    size_t regPitch = n;
+    size_t i, j, k;
+    bool isDouble;
+    const char *ptrNames[2][4] = {
+        {"f", "f2v", "", "f4v"},
+        {"d", "d2v", "", ""}};
+
+    if ((regPitch % vecLen) && !isComplexType(dtype)) {
+        regPitch += vecLen - regPitch % vecLen;
+    }
+
+    isDouble = isDoubleBasedType(dtype);
+    for (i = 0; i < m; i++) {
+        j = 0;
+        inOff = i * regPitch;
+
+        do {
+            /*
+             * get power of 2 size vector element to copy
+             * in the case without transposing and copy
+             * just with single element in the case with
+             * transposing
+             */
+
+            if (transpose) {
+                k = 1;
+                outOff = (j * outPitch + i);
+            }
+            else {
+                if (isComplexType(dtype)) {
+                    k = 1;
+                }
+                else {
+                    k = vecChunkSize(j, vecLen);
+                    k = szmin(k, n - j);
+                }
+                outOff = (i * outPitch + j);
+            }
+
+            // size_t values are cast since they are printed with "%lu"
+            if (isComplexType(dtype)) {
+                sprintf(s, "tempC.%s[%lu] += "
+                           "c[%lu] * alphaR + c[%lu].yx * alphaI;\n",
+                        ptrNames[isDouble][1], (unsigned long)outOff,
+                        (unsigned long)inOff, (unsigned long)inOff);
+            }
+            else {
+                if (k == vecLen) {
+                    strcpy(vecChunk, "");
+                }
+                else {
+                    // pick 'k' components starting at the current one
+                    vecOff = inOff % vecLen;
+                    strcpy(vecChunk, ".");
+                    strncat(vecChunk, &vect[vecOff], k);
+                }
+
+                sprintf(s, "tempC.%s[%lu] += c[%lu]%s * alpha;\n",
+                        ptrNames[isDouble][k - 1],
+                        (unsigned long)(outOff / k),
+                        (unsigned long)(inOff / vecLen), vecChunk);
+            }
+
+            kgenAddStmt(ctx, s);
+
+            j += k;
+            inOff += k;
+        } while (j < n);
+    }
+}
+
+/*
+ * Declare the block multiplication function
+ * "<prefix>gemmBlock[Transp]_<m>_<n>(...)" in the generator context and
+ * fill 'argNames' with the names of its coordinate and skew arguments.
+ *
+ * ctx      - generator context the declaration is added to
+ * dtype    - type of the processed data; defines the name prefix and the
+ *            argument types
+ * m, n     - block sizes embedded into the function name
+ * opts     - multiplier options; select the argument list
+ * argNames - receives "coordA", "coordB", "skewRow" and "skewCol"; the
+ *            other members are left untouched
+ */
+static void
+declareBlkMul(
+    struct KgenContext *ctx,
+    DataType dtype,
+    size_t m,
+    size_t n,
+    const BlkMulOpts *opts,
+    BlkmulArgNames *argNames)
+{
+    char s[MAX_LENGTH];
+    /*
+     * write position within 's'; each piece is appended here rather than
+     * through sprintf(s, "%s...", s), which passes overlapping objects and
+     * is therefore undefined
+     */
+    char *p = s;
+    const char *s1;
+    char c;
+    const char *typeName;
+    bool isPriv = ((opts->flags & BLKMUL_OUTPUT_PRIVATE) != 0);
+
+    c = dtypeToBlasPrefix(dtype);
+    typeName = dtypeBuiltinType(dtype);
+    s1 = (opts->flags & BLKMUL_TRANSPOSE) ? "Transp" : "";
+
+    // fill argument names
+    argNames->coordA = "coordA";
+    argNames->coordB = "coordB";
+    argNames->skewRow = "skewRow";
+    argNames->skewCol = "skewCol";
+
+    p += sprintf(p, "void\n"
+                    "%cgemmBlock%s_%lu_%lu(\n",
+                 c, s1, (unsigned long)m, (unsigned long)n);
+
+    // alpha is not taken when the output goes to a private location
+    if (!isPriv) {
+        p += sprintf(p, "    %s alpha,\n", typeName);
+    }
+    if (opts->aMobj == CLMEM_IMAGE) {
+        p += sprintf(p, "    __read_only image2d_t A,\n"
+                        "    int2 coordA,\n");
+    }
+    else {
+        p += sprintf(p, "    LPtr A,\n");
+    }
+    if (opts->bMobj == CLMEM_IMAGE) {
+        p += sprintf(p, "    __read_only image2d_t B,\n"
+                        "    int2 coordB,\n");
+    }
+    else {
+        p += sprintf(p, "    LPtr B,\n");
+    }
+
+    if (isPriv) {
+        if (isDoubleBasedType(dtype)) {
+            typeName = "double2";
+        }
+        else {
+            typeName = (dtype == TYPE_COMPLEX_FLOAT) ? "float2" : "float4";
+        }
+        p += sprintf(p, "    %s *c", typeName);
+    }
+    else {
+        p += sprintf(p, "    LPtr tempC");
+
+    }
+
+    if (opts->flags & BLKMUL_SKEW_ROW) {
+        p += sprintf(p, ",\n    int2 skewRow");
+    }
+    if (opts->flags & BLKMUL_SKEW_COLUMN) {
+        p += sprintf(p, ",\n    int skewCol");
+    }
+    strcpy(p, ")\n");
+
+    kgenDeclareFunction(ctx, (const char*)s);
+}
+// Generate the block multiplication code (a separate function unless BLKMUL_INLINE is set); returns 0, -EINVAL or -EOVERFLOW
+int
+blkMulGen(
+    struct KgenContext *ctx,
+    const SubproblemDim subdims[2],
+    DataType dtype,
+    const BlkMulOpts *opts)
+{
+    char s[MAX_LENGTH], s1[MAX_LENGTH];
+    const char *tNameIn, *tNameOut, *ptrNameIn;
+    size_t vecLen, vlenJ, vlenK;
+    size_t i, j, k;
+    size_t m, n, subK;
+    unsigned int nrRegs;
+    int ret = 0;
+    bool isReal, isDouble;
+    bool isImageA, isImageB;
+    size_t off;
+    size_t pitchA, pitchB, pitchC;
+    unsigned int tsize = dtypeSize(dtype);
+    bool transpose = (opts->flags & BLKMUL_TRANSPOSE);
+    bool trb = ((opts->flags & BLKMUL_TRANSPOSED_B) != 0);
+    bool isPriv = (opts->flags & BLKMUL_OUTPUT_PRIVATE);
+    bool isInlined = (opts->flags & BLKMUL_INLINE);
+    BlkmulCore core = opts->core;
+    BlkmulArgNames argNames;
+    // code to fetch from images for double and float based types
+    const char *imageFetch[2] = {
+        "%c[%lu] = as_float4(read_imageui(%s, sampler, %s));\n",
+        "%c[%lu] = as_double2(read_imageui(%s, sampler, %s));\n"};
+
+    if (trb && (opts->flags & BLKMUL_SKEW_COLUMN)) {
+        return -EINVAL;
+    }
+
+    memcpy(&argNames, &opts->argNames, sizeof(BlkmulArgNames));
+    strcpy(s, "");
+
+    isImageA = (opts->aMobj == CLMEM_IMAGE);
+    isImageB = (opts->bMobj == CLMEM_IMAGE);
+
+    m = subdims[1].y;
+    n = subdims[1].x;
+    subK = subdims[1].bwidth;
+    tsize = dtypeSize(dtype);
+
+    // matrix block pitches
+    pitchA = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
+    k = (trb) ? subdims[0].x : subdims[0].bwidth;
+    pitchB = fl4RowWidth(k, tsize) * sizeof(cl_float4) / tsize;
+    pitchC = matrBlockPitch(subdims, MATRIX_C, dtype, clblasLeft);
+
+    isReal = !isComplexType(dtype);
+    isDouble = isDoubleBasedType(dtype);
+
+    vecLen = FLOAT4_VECLEN * sizeof(cl_float) / tsize;
+    if (isDouble) {
+        tNameIn = "double2";
+        ptrNameIn = "d2v";
+    }
+    else {
+        tNameIn = "float4";
+        ptrNameIn = "f4v";
+    }
+
+    getResultGPRsInfo(dtype, &subdims[1], (unsigned int)vecLen, &nrRegs, &tNameOut);
+
+    if (!isInlined) {
+        declareBlkMul(ctx, dtype, m, n, opts, &argNames);
+        kgenBeginFuncBody(ctx);
+    }
+
+    //variables declaration; 's' is appended in place, never passed as its own "%s" source (overlapping sprintf is undefined)
+    if (isImageA || isImageB) {
+        kgenAddStmt(ctx, "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE "
+            "| CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n");
+    }
+    if (!isInlined) {
+        strcpy(s, "uint k;\n");
+    }
+    sprintf(s + strlen(s), "%s a[%lu], b[%lu];\n", tNameIn, subK / vecLen,
+            n * subK / vecLen);
+
+    if (!isPriv) {
+        // declare registers for result
+        sprintf(s + strlen(s), "%s c[%u];\n", tNameOut, nrRegs);
+    }
+
+    // 'dot' function can't be used for complex types
+    if (isComplexType(dtype) && (core == BLKMUL_DOT)) {
+        core = BLKMUL_SEPARATE_MULADD;
+    }
+
+    if ((core == BLKMUL_SEPARATE_MULADD) || isComplexType(dtype)) {
+        sprintf(s + strlen(s), "%s sum;\n", tNameIn);
+    }
+
+    kgenAddStmt(ctx, s);
+
+    if (!isPriv && !isReal) {
+        declareComplexMultParts(ctx, "alpha", tNameOut);
+    }
+    kgenAddBlankLine(ctx);
+
+    // zeroing temporary multiplication data stored to registers
+    if (!isPriv) {
+        sprintf(s, "for (k = 0; k < %u; k++) {\n"
+                   "   c[k] = 0;\n"
+                   "}\n\n", nrRegs);
+        kgenAddStmt(ctx, s);
+    }
+
+    //main loop start
+    if (!isInlined) {
+        // initial skew correction
+        if ((opts->flags & BLKMUL_SKEW_COLUMN) && (subK == vecLen)) {
+            if (isPower2(pitchA / vecLen) &&
+                    !(opts->flags & BLKMUL_AVOID_AND)) {
+                sprintf(s, "skewCol = skewCol & %lu;\n", pitchA / vecLen - 1);
+            }
+            else {
+                sprintf(s, "\nskewCol = skewCol %% %lu;\n", pitchA / vecLen);
+            }
+            kgenAddStmt(ctx, s);
+        }
+        sprintf(s, "\nfor (k = 0; k < %lu; k += %lu)",
+                subdims[0].bwidth / vecLen, subK / vecLen);
+        ret = kgenBeginBranch(ctx, s);
+    }
+
+    if (trb) {
+        vlenJ = vecLen;
+        vlenK = 1;
+    }
+    else {
+        vlenJ = 1;
+        vlenK = vecLen;
+    }
+
+    for (j = 0; j < n / vlenJ; j++) {
+        // fetch elements of matrix B
+        for (k = 0; k < subK / vlenK; k++) {
+            size_t coords[2] = {k, j};
+            if (trb) {
+                off = j * subK + k;
+            }
+            else {
+                off = j * subK / vecLen + k;
+            }
+            sprintfInputOffset(s1, MATRIX_B, (int)coords[1 - trb],
+                              (int)coords[trb], pitchB / vecLen,
+                               subdims[1].x, opts, &argNames, (subK == vecLen));
+            if (isImageB) {
+                sprintf(s, imageFetch[isDouble], 'b', off, "B", s1);
+            }
+            else {
+                sprintf(s, "b[%lu] = B.%s[%s];\n", off, ptrNameIn, s1);
+            }
+            ret = kgenAddStmt(ctx, s);
+        }
+    }
+
+    for (i = 0; i < m; i++) {
+        kgenAddBlankLine(ctx);
+        // fetch elements of matrix A from single row
+        for (k = 0; k < subK / vecLen; k++) {
+            sprintfInputOffset(s1, MATRIX_A, (int)i,
+                               (int)k, pitchA / vecLen, subdims[1].y, opts,
+                               &argNames, (subK == vecLen));
+            if (isImageA) {
+                sprintf(s, imageFetch[isDouble], 'a', k, "A", s1);
+            }
+            else {
+                sprintf(s,"a[%lu] = A.%s[%s];\n", k, ptrNameIn, s1);
+            }
+            ret = kgenAddStmt(ctx, s);
+        }
+
+        // multiply matrix A row on matrix B block
+        for (j = 0; j < n / vlenJ; j++) {
+            if (isReal) { //real case
+                switch (core) {
+                case BLKMUL_DOT:
+                    genRealDot(ctx, i, j, n, subK, (unsigned int)vecLen);
+                    break;
+                case BLKMUL_MAD:
+                    genMad(ctx, dtype, i, j, n, subK / vecLen,
+                           (unsigned int)vecLen, trb);
+                    break;
+                case BLKMUL_SEPARATE_MULADD:
+                    genVecMul(ctx, j, subK / vecLen, VECT_MULT_REAL);
+                    genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen,
+                            VECT_MULT_REAL);
+                    break;
+                }
+            }
+            else { //complex case
+                VectMulType mulType = (dtype == TYPE_COMPLEX_FLOAT) ?
+                        VECT_MULT_IMAG_FLOAT : VECT_MULT_IMAG_DOUBLE;
+
+                if (core == BLKMUL_MAD) {
+                    //real part
+                    genMadMul(ctx, j, subK / vecLen, VECT_MULT_COMPLEX_REAL);
+                    genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen,
+                            VECT_MULT_COMPLEX_REAL);
+
+                    //imaginary part
+                    genMadMul(ctx, j, subK / vecLen, mulType);
+                    genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen, mulType);
+                }
+                else {
+                    //real part
+                    genVecMul(ctx, j, subK / vecLen, VECT_MULT_COMPLEX_REAL);
+                    genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen,
+                              VECT_MULT_COMPLEX_REAL);
+
+                    //imaginary part
+                    genVecMul(ctx, j, subK / vecLen, mulType);
+                    genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen, mulType);
+                }
+            }
+        }
+    }
+
+    // update coordinates/skews and end the loop
+    if (!isInlined) {
+        getUpdateSkewCoords(ctx, opts, subK, pitchA, pitchB,
+                            (unsigned int)vecLen, ptrNameIn);
+        kgenEndBranch(ctx, NULL);
+    }
+
+    if (!isPriv) {
+        kgenAddBlankLine(ctx);
+        genScaleAccResults(ctx, dtype, m, n, pitchC, (unsigned int)vecLen, transpose);
+    }
+
+    if (!isInlined) {
+        ret = kgenEndFuncBody(ctx);
+    }
+
+    return ret ? -EOVERFLOW : 0;
+}
diff --git a/src/library/blas/gens/legacy/gemm_img.c b/src/library/blas/gens/legacy/gemm_img.c
new file mode 100644
index 0000000..9fa19c1
--- /dev/null
+++ b/src/library/blas/gens/legacy/gemm_img.c
@@ -0,0 +1,758 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * gemm image based generators
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include <clBLAS.h>
+#include <matrix_dims.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <dis_warning.h>
+
+#include "blas_kgen_legacy.h"
+#include "../gen_helper.h"
+#include "gen_helper_legacy.h"
+
+static CLBLASMpatExtra mpatExtra; // pattern specific data, filled in initGemmImgPattern()
+// sprintf templates of the preparation kernel declarations: "%c" is the BLAS type prefix, "%s" the built-in type name
+static const char *prepareImagesGemmDeclA =
+    "void __kernel\n"
+    "%cprepareImageA(\n"
+    "    clblasOrder order,\n"
+    "    clblasTranspose transA,\n"
+    "    uint M,\n"
+    "    uint K,\n"
+    "    __global %s *A,\n"
+    "    uint lda,\n"
+    "    __write_only image2d_t imgA,\n"
+    "    uint offsetA)\n";
+
+static const char *prepareImagesGemmDeclB =
+    "void __kernel\n"
+    "%cprepareImageB(\n"
+    "    clblasOrder order,\n"
+    "    clblasTranspose transB,\n"
+    "    uint N,\n"
+    "    uint K,\n"
+    "    __global %s *B,\n"
+    "    uint ldb,\n"
+    "    __write_only image2d_t imgB,\n"
+    "    uint offsetB)\n";
+
+// sprintf template of the computing kernel declaration: the two "%lu" are the work group sizes, "%s" the built-in type name
+static const char *imgGemmDecl =
+    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
+    "void __kernel\n"
+    "%cgemmImg(\n"
+    "    const uint M,\n"
+    "    const uint N,\n"
+    "    const uint K,\n"
+    "    const %s alpha,\n"
+    "    const __read_only image2d_t A,\n"
+    "    const __read_only image2d_t B,\n"
+    "    const %s beta,\n"
+    "    __global %s *C,\n"
+    "    const uint ldc,\n"
+    "    const uint offsetC)\n";
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static ssize_t
+preparator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+// Source generation callback installed in imgSops: selects the generator matching the requested kernel type
+static ssize_t
+genWrapper(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra)
+{
+    const CLBLASKernExtra *kernelInfo = (const CLBLASKernExtra*)extra;
+    const int isComputing =
+        (kernelInfo->kernType == CLBLAS_COMPUTING_KERNEL);
+
+    // every other kernel type is handled by the image preparation generator
+    return isComputing ? generator(buf, buflen, subdims, pgran, extra)
+                       : preparator(buf, buflen, subdims, pgran, extra);
+}
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static SolverFlags
+solverFlags(void);
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static int
+imgGetPerf(
+    unsigned int kflags,
+    const void *args);
+
+static SolverOps imgSops = { // callbacks of the image based gemm pattern; a NULL entry means no callback is supplied
+    genWrapper, // dispatches to generator() or preparator()
+    assignKargs,
+    isFitToLDS,
+    imgGetPerf,
+    NULL,
+    calcNrThreads,
+    NULL,
+    solverFlags,
+    NULL, //fixupKargs
+    NULL, //getDefaultDecomp
+    NULL, //getDecompList
+    NULL,
+    NULL
+};
+
+// Generate the kernel copying blocks of matrix A or B (per kextra->kernType) to an image; returns source size + 1, -ENOMEM or -EOVERFLOW
+static ssize_t
+preparator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    struct KgenContext *ctx;
+    char tmp[4096], conjStr[1024];
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    CopyImgFuncs copyImgFuncs;
+    DataType dtype = kextra->dtype;
+    BlasGenSettings gset;
+    unsigned int vecLen;
+    unsigned int tsize;
+    const char *typeName;
+    char fpref;
+    bool b;
+    size_t localBufSize;
+    ssize_t ret;
+    const char *conjCond;
+
+    const char *functionHeadA =
+        "int tra, aligned;\n"
+        "const uint bpr = (K + %lu) / %lu;\n"
+        "uint m = (gid / bpr) * %lu;\n"
+        "uint k = (gid %% bpr) * %lu;\n"
+        "uint x, y;\n"
+        "__local %s temp[%lu];\n"
+        "\n"
+        "A += offsetA;\n"
+        "tra = (!transA && order == clblasColumnMajor) ||\n"
+        "      (transA && order == clblasRowMajor);\n"
+        "if (m >= M) {\n"
+        "     return;\n"
+        "}\n";
+
+    const char *functionHeadB =
+        "int trb, aligned;\n"
+        "const uint bpr = (K + %lu) / %lu;\n"
+        "const uint n = (gid / bpr) * %lu;\n"
+        "const uint k = (gid %% bpr) * %lu;\n"
+        "uint x, y;\n"
+        "__local %s temp[%lu];\n"
+        "\n"
+        "B += offsetB;\n"
+        "trb = (!transB && order == clblasRowMajor) ||\n"
+        "      (transB && order == clblasColumnMajor);\n"
+        "if (n >= N) {\n"
+        "    return;\n"
+        "}\n";
+
+    // Distribute blocks across compute units and copy matrix A to image.
+    // Transposition and filling with zeros in unaligned cases is made using
+    // buffer in local memory.
+    const char *copyToImageA =
+        "//copy matrix A block\n"
+        "y = m + %u <= M ? %u : M - m;\n"
+        "x = k + %u <= K ? %u : K - k;\n"
+        "aligned = (x == %u) && (y == %u) && %d;\n"
+        "int atcase = aligned * 10 + tra;\n"
+        "%s" // conjugated check
+        "if (atcase != 10) {\n"
+        "    %s((__local float4*)temp);\n"
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+        "}\n"
+        "switch(atcase) {\n"
+        "case 10: //aligned, not transposed\n"
+        "    %s(imgA, k / %u, m, (GPtr)A, m, k, lda);\n"
+        "    break;\n"
+        "%s" // conjugated case
+        "case 1: //not aligned, transposed\n"
+        "    // generic transposed global to local\n"
+        "    %s((LPtr)temp, (GPtr)A, k, m, x, y, %u, lda);\n"
+        "    break;\n"
+        "case 0: //not aligned, not transposed\n"
+        "    // generic global to local\n"
+        "    %s((LPtr) temp, (GPtr)A, m, k, y, x, %u, lda);\n"
+        "    break;\n"
+        "case 11: //aligned, transposed\n"
+        "    // optimized transposed global to local\n"
+        "    %s((LPtr) temp, (GPtr)A, k, m, lda);\n"
+        "    break;\n"
+        "}\n"
+        "if (atcase != 10) {\n"
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+        "    %s(imgA, k / %u, m, (LPtr) temp);\n"
+        "}\n"
+        "\n";
+
+    const char *copyToImageB =
+            "//copy matrix B block\n"
+            "y = n + %u <= N ? %u : N - n;\n"
+            "x = k + %u <= K ? %u : K - k;\n"
+            "aligned = (x == %u) && (y == %u) && %d;\n"
+            "int atcase = aligned * 10 + trb;\n"
+            "%s" // conjugated check
+            "if (atcase != 10) {\n"
+            "    %s((__local float4*)temp);\n"
+            "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+            "}\n"
+            "switch (atcase) {\n"
+            "case 10: //aligned, not transposed\n"
+            "    %s(imgB, k / %u, n, (GPtr)B, n, k, ldb);\n"
+            "    break;\n"
+            "%s" // conjugated case
+            "case 1: //not aligned, transposed\n"
+            "    // generic transposed global to local\n"
+            "    %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n"
+            "    break;\n"
+            "case 0: //not aligned, not transposed\n"
+            "    // generic global to local\n"
+            "    %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n"
+            "    break;\n"
+            "case 11: //transposed, aligned\n"
+            "    // optimized transposed global to local\n"
+            "    %s((LPtr)temp, (GPtr)B, k, n, ldb);\n"
+            "    break;\n"
+            "}\n"
+            "if (atcase != 10) {\n"
+            "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+            "    %s(imgB, k / %u, n, (LPtr)temp);\n"
+            "}\n"
+            "\n";
+
+    memset(&copyImgFuncs, 0, sizeof(copyImgFuncs));
+    memset(&gset, 0, sizeof(gset));
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    tsize = dtypeSize(dtype);
+
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+    declareBlasEnums(ctx);
+
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+
+    // generate necessary memory to image copying functions
+    generateImageCopyFuncs(&copyImgFuncs, ctx, CLBLAS_GEMM, &gset);
+
+    kgenAddBlankLine(ctx);
+    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+    typeName = dtypeBuiltinType(dtype);
+    fpref = dtypeToBlasPrefix(dtype);
+
+    if (kextra->kernType == CLBLAS_PREP_A_KERNEL) {
+        sprintf(tmp, prepareImagesGemmDeclA, fpref, typeName);
+        kgenDeclareFunction(ctx, tmp);
+        ret = kgenBeginFuncBody(ctx);
+
+        // same local buffer is used for both matrix A and matrix B blocks
+        localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize);
+        localBufSize *= vecLen;
+
+        kgenDeclareGroupID(ctx, "gid", pgran);
+        sprintf(tmp, functionHeadA,
+                subdims[1].bwidth - 1, subdims[1].bwidth,
+                subdims[1].y, subdims[1].bwidth,
+                typeName, localBufSize);
+        kgenAddStmt(ctx, tmp);
+
+        if (isComplexType(dtype)) {
+            conjCond = "atcase += ((atcase == 10) && "
+                    "(transA == clblasConjTrans)) ? 100 : 0;\n";
+            sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n"
+                             "    %s((LPtr)temp, (GPtr)A, m, k, lda);\n"
+                             "    break;\n",
+                    copyImgFuncs.globalToLocal[MATRIX_A]);
+        }
+        else {
+            conjCond = "";
+            strcpy(conjStr, "");
+        }
+
+        // the block sizes are consumed by "%u" conversions, so they are cast to match
+        sprintf(tmp, copyToImageA,
+                (unsigned int)subdims[1].y, (unsigned int)subdims[1].y, // y = m + dy <= M ?...
+                (unsigned int)subdims[1].bwidth, (unsigned int)subdims[1].bwidth, // x = k + bw <= K ?...
+                (unsigned int)subdims[1].bwidth, (unsigned int)subdims[1].y, // aligned = (x==bw1)&&(y==dy1)
+                (kextra->flags & KEXTRA_NO_COPY_VEC_A) == 0,
+                conjCond,
+                copyImgFuncs.zeroBlock[MATRIX_A],
+                copyImgFuncs.globalToImage[MATRIX_A],
+                vecLen,
+                conjStr,
+                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_A],
+                (unsigned int)subdims[1].bwidth,
+                copyImgFuncs.globalToLocalGeneric[MATRIX_A],
+                (unsigned int)subdims[1].bwidth,
+                copyImgFuncs.globalToLocalTransposed[MATRIX_A],
+                copyImgFuncs.localToImage[MATRIX_A],
+                vecLen);
+        kgenAddStmt(ctx, tmp);
+    }
+    else { // PREP_B
+        sprintf(tmp, prepareImagesGemmDeclB, fpref, typeName);
+        kgenDeclareFunction(ctx, tmp);
+        ret = kgenBeginFuncBody(ctx);
+
+        // same local buffer is used for both matrix A and matrix B blocks
+        localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize);
+        localBufSize *= vecLen;
+
+        kgenDeclareGroupID(ctx, "gid", pgran);
+        sprintf(tmp, functionHeadB,
+                subdims[1].bwidth - 1, subdims[1].bwidth,
+                subdims[1].x, subdims[1].bwidth,
+                typeName, localBufSize);
+        kgenAddStmt(ctx, tmp);
+
+        if (isComplexType(dtype)) {
+            conjCond = "atcase += ((atcase == 10) && "
+                    "(transB == clblasConjTrans)) ? 100 : 0;\n";
+            sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n"
+                             "    %s((LPtr)temp, (GPtr)B, n, k, ldb);\n"
+                             "    break;\n",
+                    copyImgFuncs.globalToLocal[MATRIX_B]);
+        }
+        else {
+            conjCond = "";
+            strcpy(conjStr, "");
+        }
+
+        // the block sizes are consumed by "%u" conversions, so they are cast to match
+        sprintf(tmp, copyToImageB,
+                (unsigned int)subdims[1].x, (unsigned int)subdims[1].x, // y = n + dy <= N ?...
+                (unsigned int)subdims[1].bwidth, (unsigned int)subdims[1].bwidth, // x = k + bw <= K ?...
+                (unsigned int)subdims[1].bwidth, (unsigned int)subdims[1].x, // aligned = (x==bw1)&&(y==dx1)
+                (kextra->flags & KEXTRA_NO_COPY_VEC_B) == 0,
+                conjCond,
+                copyImgFuncs.zeroBlock[MATRIX_B],
+                copyImgFuncs.globalToImage[MATRIX_B],
+                vecLen,
+                conjStr,
+                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_B],
+                (unsigned int)subdims[1].bwidth,
+                copyImgFuncs.globalToLocalGeneric[MATRIX_B],
+                (unsigned int)subdims[1].bwidth,
+                copyImgFuncs.globalToLocalTransposed[MATRIX_B],
+                copyImgFuncs.localToImage[MATRIX_B],
+                vecLen);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    kgenEndFuncBody(ctx);
+
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+// Fill the kernel variable names used while generating the image based gemm kernel
+static void
+initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
+{
+    // column major access selects the 'x' image coordinate, otherwise 'y'
+    const int colMajA =
+        isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A);
+    const int colMajB =
+        isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B);
+
+    // matrix names
+    kvars->A = "imgA";
+    kvars->B = "imgB";
+
+    kvars->coordA = colMajA ? "coordA.x" : "coordA.y";
+    kvars->coordB = colMajB ? "coordB.x" : "coordB.y";
+
+    // problem size names
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "K";
+}
+
+// image based gemm computing kernel generator; returns source size + 1, -ENOMEM or -EOVERFLOW
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    char tmp[4096], tmp1[4096];
+    char *p;
+    // built-in type name and BLAS prefix used in the kernel declaration
+    const char *typeName;
+    char fpref;
+    DataType dtype = kextra->dtype;
+    ssize_t ret;
+    BlasGenSettings gset;
+    BlkMulOpts mulOpts;
+    unsigned int tsize;
+    unsigned int vecLen, outVecLen;
+    bool b;
+    const char *outTypeName;
+    unsigned int i;
+    unsigned int nrRegs, regPitch;
+    int tra, trb;
+    char vect[2] = {'y', 'x'}; // image coordinate component, indexed by the column major access flag
+
+    const char *coordConstants =
+        "const uint workItemM = get_global_id(0) * %lu;\n"
+        "const uint workItemN = get_global_id(1) * %lu;\n"
+        "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
+        "uint vectK = (K + %u) / %u;\n";
+
+    /*
+     *  template for image based gemm preparation part
+     *  for two dimensional work space
+     */
+    const char *localVariables =
+        "uint k0;\n"
+        "int2 coordA = (int2)(0, workItemM);\n"
+        "int2 coordB = (int2)(0, workItemN);\n"
+        "%s c[%u];\n\n";
+
+    tsize = dtypeSize(dtype);
+    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+    if (isComplexType(dtype)) {
+        regPitch = (unsigned int)subdims[1].x;
+    }
+    else {
+        regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) *
+                    sizeof(cl_float4) / tsize;
+    }
+
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+    initKernelVarNames(&gset.varNames, kextra->flags);
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    // at first, generate needed declarations and auxiliary functions
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+
+    typeName = dtypeBuiltinType(dtype);
+    fpref = dtypeToBlasPrefix(dtype);
+
+    // now, generate the kernel
+
+    sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref,
+            typeName, typeName, typeName);
+    kgenDeclareFunction(ctx, tmp);
+    ret = kgenBeginFuncBody(ctx);
+
+    // constants
+    sprintf(tmp, coordConstants,
+            subdims[1].y, subdims[1].x, subdims[1].y,
+            vecLen - 1, vecLen);
+    kgenAddStmt(ctx, tmp);
+
+    /*
+     * Query the number and the type of the registers holding
+     * the result tile, and then declare local
+     * variables
+     */
+    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
+
+    sprintf(tmp, localVariables, outTypeName, nrRegs);
+    kgenAddStmt(ctx, tmp);
+
+    // check if offset exceeds matrix
+    kgenAddStmt(ctx, "if ((workItemM >= M) ||"
+                         "(workItemN >= N)) {\n"
+                     "    return;\n"
+                     "}\n");
+
+    kgenAddStmt(ctx, "C += offsetC;\n");
+
+    // zero C block
+    sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
+                 "    c[k0] = 0;\n"
+                 "}\n\n",
+            nrRegs);
+    kgenAddStmt(ctx, tmp);
+
+    // block multiplication inlined function
+    sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)",
+            subdims[1].bwidth / vecLen);
+    kgenBeginBranch(ctx, tmp);
+
+    mulOpts.aMobj = CLMEM_IMAGE;
+    mulOpts.bMobj = CLMEM_IMAGE;
+    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE;
+    if (isComplexType(dtype)) {
+        mulOpts.core = BLKMUL_SEPARATE_MULADD;
+    }
+    else {
+        mulOpts.core = BLKMUL_MAD;
+    }
+    mulOpts.argNames.coordA = "coordA";
+    mulOpts.argNames.coordB = "coordB";
+    mulOpts.argNames.skewCol = "skewCol";
+    mulOpts.argNames.skewRow = "skewRow";
+    mulOpts.argNames.k = "k0";
+    mulOpts.argNames.vectBoundK = "vectK";
+    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
+    if (ret) {
+        destroyKgenContext(ctx);
+        return -EOVERFLOW;
+    }
+
+    // update image coordinates
+    sprintf(tmp, "\ncoordA.x += %lu;\n"
+                 "coordB.x += %lu;\n",
+            subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
+    kgenAddStmt(ctx, tmp);
+
+    kgenEndBranch(ctx, NULL);
+
+    // reorder the given solution: the emitted loop cyclically shifts the rows of 'c' by one position, skewRow.y times
+    outVecLen = isComplexType(dtype) ? 1 : vecLen;
+    p = tmp1; // NOTE(review): snippets are appended without a bound check; tmp1 and tmp must be large enough for them
+    for (i = 0; i < regPitch / outVecLen; i++) {
+        unsigned int k = (unsigned int)(subdims[1].y - 1) *
+                         regPitch / outVecLen + i;
+
+        sprintf(p,  "\n"
+                    "    tmp = c[%u];\n"
+                    "    for (j = %lu; j >= 0; j--) {\n"
+                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
+                    "    }\n"
+                    "    c[%u] = tmp;\n",
+                k, subdims[1].y - 2, regPitch / outVecLen,
+                i, regPitch / outVecLen, i, i);
+        p += strlen(p);
+    }
+    sprintf(tmp, "\n"
+                 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
+                 "    int j;\n"
+                 "    %s tmp;\n"
+                 "%s"
+                 "}\n"
+                 "\n",
+                 outTypeName, tmp1);
+    kgenAddStmt(ctx, tmp);
+
+    tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A);
+    trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B);
+    sprintf(tmp, "coordA.%c = workItemM;\n"
+                 "coordB.%c = workItemN;\n\n",
+            vect[tra], vect[trb]);
+    kgenAddStmt(ctx, tmp);
+
+    // write back the tile evaluated
+    generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL);
+
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+// Fill the kernel argument list for the kernel type given in the BLAS arguments
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
+
+    (void)extra;
+
+    switch (blasArgs->kernType) {
+    case CLBLAS_COMPUTING_KERNEL:
+        // arguments for computational kernel, in the order of the imgGemmDecl parameters
+        initSizeKarg(&args[0], blasArgs->M);
+        initSizeKarg(&args[1], blasArgs->N);
+        initSizeKarg(&args[2], blasArgs->K);
+        assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype);
+        INIT_KARG(&args[4], blasArgs->scimage[0]); // image A
+        INIT_KARG(&args[5], blasArgs->scimage[1]); // image B
+        assignScalarKarg(&args[6], &(blasArgs->beta), blasArgs->dtype);
+        initMemobjKarg(&args[7], blasArgs->C, NULL, 0, 0);
+        initSizeKarg(&args[8], blasArgs->ldc.matrix);
+        initSizeKarg(&args[9], blasArgs->offCY); // offsetC
+        break;
+    case CLBLAS_PREP_A_KERNEL: // in the order of the prepareImagesGemmDeclA parameters
+        INIT_KARG(&args[0], blasArgs->order);
+        INIT_KARG(&args[1], blasArgs->transA);
+        initSizeKarg(&args[2], blasArgs->M);
+        initSizeKarg(&args[3], blasArgs->K);
+        initMemobjKarg(&args[4], blasArgs->A, NULL, 0, 0);
+        initSizeKarg(&args[5], blasArgs->lda.matrix);
+        INIT_KARG(&args[6], blasArgs->scimage[0]); // imgA
+        initSizeKarg(&args[7], blasArgs->offA);
+        break;
+    case CLBLAS_PREP_B_KERNEL: // in the order of the prepareImagesGemmDeclB parameters
+        INIT_KARG(&args[0], blasArgs->order);
+        INIT_KARG(&args[1], blasArgs->transB);
+        initSizeKarg(&args[2], blasArgs->N);
+        initSizeKarg(&args[3], blasArgs->K);
+        initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0);
+        initSizeKarg(&args[5], blasArgs->ldb.matrix);
+        INIT_KARG(&args[6], blasArgs->scimage[1]); // imgB
+        initSizeKarg(&args[7], blasArgs->offBX); // offsetB
+        break;
+    default:
+        //this should not happen; no argument is assigned in this case
+        break;
+    }
+}
+// Check whether the matrix C block of the second level subproblem fits in ldsSize bytes
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs*)kernelArgs;
+    // block size as reported by matrBlockSize(); scaled by the element size below
+    const cl_ulong blockSize = matrBlockSize(&dim[1], MATRIX_C, dtype, blasArgs->side);
+    return (blockSize * dtypeSize(dtype) <= ldsSize);
+}
+// Compute the global work size for the computing kernel or for one of the image preparation kernels
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra)
+{
+    const CLBlasKargs *blasArgs = args;
+    size_t rows, rowStep, bw, nrBlocks;
+
+    (void)extra;
+
+    if (blasArgs->kernType == CLBLAS_COMPUTING_KERNEL) {
+        calcGlobalThreads(threads, &subdims[0], pgran,
+                          blasArgs->M, blasArgs->N);
+        return;
+    }
+
+    // preparation kernels: one work group is launched per matrix block
+    if (blasArgs->kernType == CLBLAS_PREP_A_KERNEL) {
+        rows = blasArgs->M;
+        rowStep = subdims[0].itemY;
+    }
+    else {
+        rows = blasArgs->N;
+        rowStep = subdims[0].itemX;
+    }
+    bw = subdims[0].bwidth;
+
+    // ceil(rows / rowStep) * ceil(K / bw) blocks in total
+    nrBlocks = (rows / rowStep + (rows % rowStep != 0)) *
+               (blasArgs->K / bw + (blasArgs->K % bw != 0));
+    threads[0] = pgran->wgSize[0] * nrBlocks;
+    threads[1] = pgran->wgSize[1];
+}
+// Solver flags callback: this pattern reports SF_WSPACE_2D (the kernel uses get_global_id(0) and (1))
+static SolverFlags
+solverFlags(void)
+{
+    return SF_WSPACE_2D;
+}
+// Initialize the memory pattern descriptor of the image based block gemm
+void
+initGemmImgPattern(MemoryPattern *mempat)
+{
+    // pattern specific part: both input matrices are passed as images
+    mpatExtra.mobjA = CLMEM_IMAGE;
+    mpatExtra.mobjB = CLMEM_IMAGE;
+    mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
+    mempat->extra = &mpatExtra;
+    mempat->sops = &imgSops;
+    mempat->name = "Image based block gemm";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+// Performance estimation callback: the image based pattern always reports PPERF_POOR
+static int
+imgGetPerf(
+    unsigned int kflags,
+    const void *args)
+{
+    // neither the kernel flags nor the arguments influence the estimate
+    (void)kflags;
+    (void)args;
+    return PPERF_POOR;
+}
diff --git a/src/library/blas/gens/legacy/gemm_lds.c b/src/library/blas/gens/legacy/gemm_lds.c
new file mode 100644
index 0000000..61e3154
--- /dev/null
+++ b/src/library/blas/gens/legacy/gemm_lds.c
@@ -0,0 +1,562 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * LDS based block GEMM generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <matrix_dims.h>
+#include <dis_warning.h>
+
+#include "../init.h"
+#include "blas_kgen_legacy.h"
+#include "gen_helper_legacy.h"
+#include "../gen_helper.h"
+
+static CLBLASMpatExtra mpatExtra;   // filled and published via mempat->extra in initGemmLdsPattern()
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static SolverFlags
+solverFlags(void);
+
+static int
+ldsGetPerf(
+    unsigned int kflags,
+    const void *args);
+
+static SolverOps solverOps = {  // callback table of the LDS based GEMM pattern
+    generator,      // kernel source generator
+    assignKargs,    // kernel arguments setup
+    isFitToLDS,     // check of the block sizes against the LDS size
+    ldsGetPerf,     // performance estimation, always PPERF_POOR here
+    NULL,           // callback not provided by this pattern
+    NULL,           // callback not provided by this pattern
+    NULL,           // callback not provided by this pattern
+    solverFlags,    // solver flags query
+    NULL, //fixupKargs
+    NULL, //getDefaultDecomp
+    NULL, //getDecompList
+    NULL,           // callback not provided by this pattern
+    NULL            // callback not provided by this pattern
+};
+
+/*
+ * Emit the declaration of the GEMM kernel.
+ *
+ * The declaration consists of the reqd_work_group_size attribute built from
+ * the work group size in pgran and of the "<prefix>gemm" prototype, where
+ * the BLAS prefix and the element type name are both derived from dtype.
+ */
+static void
+declareKernel(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const PGranularity *pgran)
+{
+    const char *elemType = dtypeBuiltinType(dtype);
+    const char blasPrefix = dtypeToBlasPrefix(dtype);
+    char decl[1024];
+
+    sprintf(decl,
+            "__attribute__((reqd_work_group_size(%u, %u, 1)))\n"
+            "void __kernel\n"
+            "%cgemm(\n"
+            "    uint M,\n"
+            "    uint N,\n"
+            "    uint K,\n"
+            "    %s alpha,\n"
+            "    __global %s *A,\n"
+            "    uint lda,\n"
+            "    __global %s *B,\n"
+            "    uint ldb,\n"
+            "    %s beta,\n"
+            "    __global %s *C,\n"
+            "    uint ldc,\n"
+            "    const uint offA,\n"
+            "    const uint offB,\n"
+            "    const uint offC)\n",
+            pgran->wgSize[0], pgran->wgSize[1],
+            blasPrefix,
+            elemType,       // alpha
+            elemType,       // A
+            elemType,       // B
+            elemType,       // beta
+            elemType);      // C
+
+    kgenDeclareFunction(ctx, decl);
+}
+
+/*
+ * Declare the variables used by the kernel body: the LDS buffers "tempA"
+ * and "tempB" for the current blocks of A and B, the private accumulator
+ * array "c", the loop and coordinate helpers, and the local and group IDs
+ * ("lid" and "gid").
+ *
+ * ctx   - generator context the declarations are added to
+ * dtype - data type of the matrix elements
+ * dims  - subproblem dimensions; dims[0] sizes the LDS buffers and dims[1]
+ *         is passed to getResultGPRsInfo() to size the accumulator
+ * pgran - work group granularity, used to declare the IDs
+ */
+static void
+declareLocalVariables(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const SubproblemDim *dims,
+    const PGranularity *pgran)
+{
+    char tmp[1024];
+    const char *inTypeName, *outTypeName;
+    size_t pitchAB;
+    unsigned int nrRegs;
+    unsigned int vecLen;
+
+    inTypeName = dtypeBuiltinType(dtype);
+    pitchAB = matrBlockPitch(dims, MATRIX_A, dtype, clblasLeft);
+    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+
+    getResultGPRsInfo(dtype, &dims[1], vecLen, &nrRegs, &outTypeName);
+
+    /*
+     * The buffer sizes are size_t products; convert them explicitly so that
+     * the arguments match the "%lu" conversions on every platform.
+     */
+    sprintf(tmp, "uint m0, k0;\n"
+                 "__local %s tempA[%lu];\n"
+                 "__local %s tempB[%lu];\n"
+                 "%s c[%u];\n"
+                 "uint currM, currN, groupsPan;\n"
+                 "uint2 coordA, coordB;\n"
+                 "uint x, y;\n",
+             inTypeName, (unsigned long)(pitchAB * dims[0].y),
+             inTypeName, (unsigned long)(pitchAB * dims[0].x),
+             outTypeName, nrRegs);
+
+    kgenAddStmt(ctx, tmp);
+    kgenDeclareLocalID(ctx, "lid", pgran);
+    kgenDeclareGroupID(ctx, "gid", pgran);
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Generate the code reading the current block of matrix A into "tempA".
+ *
+ * ctx       - generator context the statement is added to
+ * dim       - subproblem dimensions; y and bwidth give the block size
+ * dtype     - data type of the matrix elements
+ * copyFuncs - names of the optimized and the generic reading functions
+ * zeroFuncs - names of the zeroing functions
+ * flags     - kernel flags; select the access order and the tails handling
+ */
+static void
+genPrepareBlockA(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags flags)
+{
+    char tmp[1024];
+    size_t pitch;
+    const char *coordName[2] = {"currM", "k0"};
+    const char *sizeName[2] = {"y", "x"};
+    /*
+     * Block sizes are printed with "%lu"; keep them as unsigned long so
+     * that the argument type matches the conversion on every platform.
+     */
+    const unsigned long bsize[2] = {(unsigned long)dim->y,
+                                    (unsigned long)dim->bwidth};
+    int tra;
+
+    // "tra" selects the order the coordinates and sizes are passed in
+    tra = isMatrixAccessColMaj(CLBLAS_GEMM, flags, MATRIX_A);
+    pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
+
+    /*
+     * If the (sub)problem is integrally divisible,
+     * skip any checks, and just read with optimal blocks,
+     * otherwise check for tails and then read with a
+     * fast function in the case of optimal blocks, and with
+     * the slow one in the case of tails respectively
+     */
+
+    if (!(flags & (KEXTRA_TAILS_M | KEXTRA_TAILS_K))) {
+        sprintf(tmp, "%s((LPtr)tempA, (GPtr)A, %s, %s, lda);\n",
+                copyFuncs->read[MATRIX_A], coordName[tra], coordName[1 - tra]);
+    }
+    else {
+        sprintf(tmp,
+                "y = (currM + %lu <= M) ? %lu : M - currM;\n"
+                "x = (k0 + %lu <= K) ? %lu : K - k0;\n"
+                "if ((y == %lu) && (x == %lu)) {\n"
+                     // fast read
+                "    %s((LPtr)tempA, (GPtr)A, %s, %s, lda);\n"
+                "}\n"
+                "else {\n"
+                "    %s((__local float4*)tempA);\n"           // zeroing
+                "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+                     // slow read
+                "    %s((LPtr)tempA, (GPtr)A, %s, %s, %s, %s, %lu, lda);\n"
+                "}\n\n",
+                bsize[0], bsize[0], bsize[1], bsize[1], bsize[0], bsize[1],
+                copyFuncs->read[MATRIX_A], coordName[tra], coordName[1 - tra],
+                zeroFuncs->names[MATRIX_A],
+                copyFuncs->readGeneric[MATRIX_A], coordName[tra],
+                coordName[1 - tra], sizeName[tra], sizeName[1 - tra],
+                (unsigned long)pitch);
+    }
+
+    kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Generate the code reading the current block of matrix B into "tempB".
+ *
+ * Without tails along N and K the optimized reading function is called
+ * unconditionally. Otherwise the generated code computes the actual block
+ * size, and either calls the optimized function for a full block or zeroes
+ * the buffer and calls the generic function.
+ *
+ * ctx       - generator context the statement is added to
+ * dim       - subproblem dimensions; x and bwidth give the block size
+ * dtype     - data type of the matrix elements
+ * copyFuncs - names of the optimized and the generic reading functions
+ * zeroFuncs - names of the zeroing functions
+ * flags     - kernel flags; select the access order and the tails handling
+ */
+static void
+genPrepareBlockB(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags flags)
+{
+    char tmp[1024];
+    size_t pitch;
+    const char *coordName[2] = {"currN", "k0"};
+    const char *sizeName[2] = {"y", "x"};
+    /*
+     * Block sizes are printed with "%lu"; keep them as unsigned long so
+     * that the argument type matches the conversion on every platform.
+     */
+    const unsigned long bsize[2] = {(unsigned long)dim->x,
+                                    (unsigned long)dim->bwidth};
+    int trb;
+
+    pitch = matrBlockPitch(dim, MATRIX_B, dtype, clblasLeft);
+    // "trb" selects the order the coordinates and sizes are passed in
+    trb = isMatrixAccessColMaj(CLBLAS_GEMM, flags, MATRIX_B);
+
+    if (!(flags & (KEXTRA_TAILS_N | KEXTRA_TAILS_K))) {
+        sprintf(tmp, "%s((LPtr)tempB, (GPtr)B, %s, %s, ldb);\n",
+                copyFuncs->read[MATRIX_B], coordName[trb],
+                coordName[1 - trb]);
+    }
+    else {
+        sprintf(tmp,
+                "y = (currN + %lu <= N) ? %lu : N - currN;\n"
+                "x = (k0 + %lu <= K) ? %lu : K - k0;\n"
+                "if ((y == %lu) && (x == %lu)) {\n"
+                     // fast read
+                "    %s((LPtr)tempB, (GPtr)B, %s, %s, ldb);\n"
+                "}\n"
+                "else {\n"
+                "    %s((__local float4*)tempB);\n"           // zeroing
+                "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+                     // slow read
+                "    %s((LPtr)tempB, (GPtr)B, %s, %s, %s, %s, %lu, ldb);\n"
+                "}\n\n",
+                bsize[0], bsize[0], bsize[1], bsize[1], bsize[0], bsize[1],
+                copyFuncs->read[MATRIX_B], coordName[trb], coordName[1 - trb],
+                zeroFuncs->names[MATRIX_B],
+                copyFuncs->readGeneric[MATRIX_B], coordName[trb],
+                coordName[1 - trb], sizeName[trb], sizeName[1 - trb],
+                (unsigned long)pitch);
+    }
+
+    kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Generate the loop zeroing the private accumulator array "c".
+ * The number of elements to clear is reported by getResultGPRsInfo()
+ * for the dims[1] subproblem.
+ */
+static void
+genZeroResult(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const SubproblemDim *dims)
+{
+    char stmt[1024];
+    unsigned int nrRegs;
+    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+
+    getResultGPRsInfo(dtype, &dims[1], vecLen, &nrRegs, NULL);
+
+    sprintf(stmt,
+            "\n"
+            "for (x = 0; x < %u; x++) {\n"
+            "    c[x] = 0;\n"
+            "}\n\n",
+            nrRegs);
+
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Set up the names of the kernel variables used by the shared generator
+ * helpers. The coordinate component used for A and B depends on whether
+ * the respective matrix is accessed in the column major order.
+ */
+static void
+initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
+{
+    kvars->A = "A";
+    kvars->B = "B";
+
+    kvars->coordA = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A) ?
+                    "coordA.x" : "coordA.y";
+    kvars->coordB = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B) ?
+                    "coordB.x" : "coordB.y";
+
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "K";
+}
+
+/*
+ * Generate the source of the LDS based block GEMM kernel.
+ *
+ * The auxiliary functions (block copying, result updating, zeroing and
+ * block multiplication) are generated first, followed by the kernel itself:
+ * panel coordinates evaluation, the loop over K reading blocks of A and B
+ * into the local memory and multiplying them, and the final result update.
+ *
+ * buf     - buffer for the generated source, passed to createKgenContext()
+ * buflen  - size of the buffer
+ * subdims - subproblem dimensions of both decomposition levels
+ * pgran   - work group granularity
+ * extra   - pointer to the CLBLASKernExtra with the data type and flags
+ *
+ * Returns the size of the generated source plus one on success, -ENOMEM if
+ * the generator context cannot be created, and -EOVERFLOW if blkMulGen()
+ * fails or kgenEndFuncBody() reports a negative status.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    char tmp[1024];
+    char blkmul[128];
+    char updateResFn[FUNC_NAME_MAXLEN];
+    char updateResGenericFn[FUNC_NAME_MAXLEN];
+    CopyBufFuncs copyFuncs;
+    ZeroFuncs zeroFuncs;
+    DataType dtype = kextra->dtype;
+    ssize_t ret;
+    BlasGenSettings gset;
+    BlkMulOpts mulOpts;
+    size_t pitchAB;
+    const char *s;
+    bool b;
+    int tra, trb;
+    unsigned int l1Pans;
+    unsigned int vecLen;
+    char vect[2] = {'y', 'x'};
+    UpdateResultFlags upFlags;
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    // at first, generate needed declarations and auxiliary functions
+
+    pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+
+    // generator settings initialization
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+    initKernelVarNames(&gset.varNames, kflags);
+
+    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_GEMM, &gset,
+                         BCHF_MATRIX_A | BCHF_MATRIX_B);
+
+    generateUpresFuncs(ctx, CLBLAS_GEMM, &gset, updateResFn,
+                       updateResGenericFn);
+
+    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
+                         ZF_MATRIX_A | ZF_MATRIX_B);
+
+    // block multiplication function
+    mulOpts.aMobj = CLMEM_BUFFER;
+    mulOpts.bMobj = CLMEM_BUFFER;
+    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_COLUMN;
+    if (isComplexType(dtype)) {
+        mulOpts.core = BLKMUL_SEPARATE_MULADD;
+    }
+    else {
+        mulOpts.core = BLKMUL_MAD;
+    }
+    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
+    if (ret) {
+        destroyKgenContext(ctx);
+
+        return -EOVERFLOW;
+    }
+
+    kgenAddBlankLine(ctx);
+    kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);
+
+    // now, generate the kernel
+    declareKernel(ctx, dtype, pgran);
+    kgenBeginFuncBody(ctx);
+    declareLocalVariables(ctx, dtype, subdims, pgran);
+
+    // Shift matrices' origins according to offA, offB and offC.
+    kgenAddBlankLine(ctx);
+    tmp[0] = '\0';
+    strcat(tmp, "A += offA;\n");
+    strcat(tmp, "B += offB;\n");
+    strcat(tmp, "C += offC;\n");
+
+    kgenAddStmt(ctx, tmp);
+    kgenAddBlankLine(ctx);
+
+    /*
+     * Output matrix is divided into squares, each work group
+     * gets such a square. Get current panel coordinates
+     * depending on which matrix must be outer.
+     * Assign different inner matrix's panels processed
+     * at the same time to different work groups in order to
+     * reduce global memory bank conflicts. Use cyclic
+     * addressing for this purpose
+     *
+     * The sizes below are converted explicitly so that the arguments
+     * match the "%lu" conversions on every platform.
+     */
+    sprintf(tmp, // number of outer panels
+                 "groupsPan = N / %lu;\n"
+                 "if (N %% %lu) {\n"
+                 "    groupsPan++;\n"
+                 "}\n"
+                 "x = gid %% groupsPan;\n"  // outer panel number
+                 "y = gid / groupsPan;\n"   // inner panel number, not shifted
+                 "currN = x * %lu;\n"
+                 "\n"
+                 // number of inner panels
+                 "groupsPan = M / %lu;\n"
+                 "if (M %% %lu) {\n"
+                 "    groupsPan++;\n"
+                 "}\n"
+                 // inner panel number using cyclic addressing
+                 "y = (x + y) %% groupsPan;\n"
+                 "currM = y * %lu;\n"
+                 "\n",
+            (unsigned long)subdims[0].itemX, (unsigned long)subdims[0].itemX,
+            (unsigned long)subdims[0].itemX,
+            (unsigned long)subdims[0].itemY, (unsigned long)subdims[0].itemY,
+            (unsigned long)subdims[0].itemY);
+    // an overflow of the context is reported by kgenEndFuncBody() below
+    kgenAddStmt(ctx, tmp);
+
+    tra = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A);
+    trb = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B);
+    sprintf(tmp, "coordA.%c = currM;\n"
+                 "coordA.%c = 0;\n"
+                 "coordB.%c = currN;\n"
+                 "coordB.%c = 0;\n\n",
+            vect[tra], vect[1 - tra], vect[trb], vect[1 - trb]);
+    kgenAddStmt(ctx, tmp);
+
+    genZeroResult(ctx, dtype, subdims);
+
+    // loop over K
+    sprintf(tmp, "for (k0 = 0; k0 < K; k0 += %lu)",
+            (unsigned long)subdims[0].bwidth);
+    kgenBeginBranch(ctx, tmp);
+
+    genPrepareBlockA(ctx, subdims, dtype, &copyFuncs,
+                     &zeroFuncs, kflags);
+    genPrepareBlockB(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
+                     kflags);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
+
+    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+    // and eventually multiply the blocks and update the current result
+    getResultGPRsInfo(dtype, &subdims[1], vecLen, NULL, &s);
+    sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu),\n"
+                 "   (LPtr)(tempB + (lid %% %u * %lu) * %lu),\n"
+                 "   (%s*)c, lid);\n",
+            blkmul, l1Pans, (unsigned long)subdims[1].y,
+            (unsigned long)pitchAB, l1Pans,
+            (unsigned long)subdims[1].x, (unsigned long)pitchAB, s);
+    kgenAddStmt(ctx, tmp);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    kgenEndBranch(ctx, NULL);       // loop over K
+
+    // update result logic
+    sprintf(tmp, "coordA.%c += lid / %u * %lu;\n"
+                 "coordB.%c += lid %% %u * %lu;\n",
+            vect[tra], l1Pans, (unsigned long)subdims[1].y,
+            vect[trb], l1Pans, (unsigned long)subdims[1].x);
+    kgenAddStmt(ctx, tmp);
+    if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) {
+        sprintf(tmp, "if (coordA.%c >= M || coordB.%c >= N) {\n"
+                     "  return;\n"
+                     "}\n", vect[tra], vect[trb]);
+        kgenAddStmt(ctx, tmp);
+    }
+    kgenAddBlankLine(ctx);
+
+    upFlags = kextraToUpresFlags(CLBLAS_GEMM, kflags);
+    upFlags |= UPRES_EXCEED_PROBLEM_CONDITION;
+    genResultUpdateWithFlagsOld(ctx, CLBLAS_GEMM, &gset, upFlags, updateResFn,
+                                updateResGenericFn, NULL);
+
+    ret = kgenEndFuncBody(ctx);
+    if (!ret) {
+        // the reported size includes one extra byte besides the source
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+ * Fill the kernel argument array from the BLAS level arguments.
+ * The order of the arguments follows the kernel prototype emitted by
+ * declareKernel(): M, N, K, alpha, A, lda, B, ldb, beta, C, ldc and the
+ * three offsets.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    unsigned int i = 0;
+
+    (void)extra;
+
+    // problem sizes
+    initSizeKarg(&args[i++], blasArgs->M);
+    initSizeKarg(&args[i++], blasArgs->N);
+    initSizeKarg(&args[i++], blasArgs->K);
+
+    // alpha, then the matrices A and B with their leading dimensions
+    assignScalarKarg(&args[i++], &(blasArgs->alpha), blasArgs->dtype);
+    initMemobjKarg(&args[i++], blasArgs->A, NULL, 0, 0);
+    initSizeKarg(&args[i++], blasArgs->lda.matrix);
+    initMemobjKarg(&args[i++], blasArgs->B, NULL, 0, 0);
+    initSizeKarg(&args[i++], blasArgs->ldb.matrix);
+
+    // beta and the matrix C with its leading dimension
+    assignScalarKarg(&args[i++], &(blasArgs->beta), blasArgs->dtype);
+    initMemobjKarg(&args[i++], blasArgs->C, NULL, 0, 0);
+    initSizeKarg(&args[i++], blasArgs->ldc.matrix);
+
+    // offsets of the matrices
+    initSizeKarg(&args[i++], blasArgs->offA);
+    initSizeKarg(&args[i++], blasArgs->offBX);
+    initSizeKarg(&args[i++], blasArgs->offCY);
+}
+
+/*
+ * Check whether the blocks of the subproblem fit to the local memory.
+ * The block sizes of A, B and C reported by matrBlockSize() are summed up,
+ * multiplied by the element size and compared against ldsSize.
+ * The kernel arguments are not used.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    cl_ulong nrElems = 0;
+
+    (void)kernelArgs;
+
+    nrElems += matrBlockSize(dim, MATRIX_A, dtype, clblasLeft);
+    nrElems += matrBlockSize(dim, MATRIX_B, dtype, clblasLeft);
+    nrElems += matrBlockSize(dim, MATRIX_C, dtype, clblasLeft);
+
+    return (nrElems * dtypeSize(dtype) <= ldsSize);
+}
+
+/* Report the solver flags of this pattern: only SF_WSPACE_2D is set. */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags flags = SF_WSPACE_2D;
+
+    return flags;
+}
+
+/*
+ * Fill the memory pattern descriptor of the LDS based block GEMM:
+ * its name, the level settings, the solver operations and the extra
+ * pattern information kept in the file scope "mpatExtra" object.
+ */
+void
+initGemmLdsPattern(MemoryPattern *mempat)
+{
+    // both A and B are marked as buffer memory objects
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_LDS;
+    mpatExtra.bMset = CLMEM_LEVEL_LDS;
+
+    mempat->name = "LDS based block gemm";
+    mempat->sops = &solverOps;
+    mempat->extra = &mpatExtra;
+
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+/*
+ * Performance estimation callback of the LDS based pattern.
+ * Both arguments are ignored; the result is always PPERF_POOR.
+ */
+static int
+ldsGetPerf(
+    unsigned int kflags,
+    const void *args)
+{
+    (void)kflags;
+    (void)args;
+
+    return PPERF_POOR;
+}
diff --git a/src/library/blas/gens/legacy/gen_helper_legacy.c b/src/library/blas/gens/legacy/gen_helper_legacy.c
new file mode 100644
index 0000000..47505fc
--- /dev/null
+++ b/src/library/blas/gens/legacy/gen_helper_legacy.c
@@ -0,0 +1,448 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include "gen_helper_legacy.h"
+#include "blas_kgen_legacy.h"
+#include "../gen_helper.h"
+
+typedef struct CopyPattern {    // describes one function for cpyImgGenCallback() to generate
+    SubproblemDim dim;          // block dimensions; not passed on when "generic" is set
+    const PGranularity *pgran;  // granularity forwarded to the block generators
+    DataType dtype;             // element type, used for copying functions only
+    DBlockCopyDirection dir;    // copying direction, used for copying functions only
+    DBlockCopyFlags flags;      // copying flags, used for copying functions only
+    bool generic;               // true: the generator gets NULL instead of the dimensions
+    bool zeroing;               // true: generate a zeroing function rather than a copying one
+} CopyPattern;
+
+/*
+ * Generation callback registered with the guard created in
+ * generateImageCopyFuncs(): emits the function described by a CopyPattern.
+ * A zeroing pattern is generated with f4zeroBlockGen(), any other one with
+ * copyDataBlockGen(). For a generic pattern NULL is passed instead of the
+ * block dimensions.
+ */
+static int
+cpyImgGenCallback(struct KgenContext *ctx, const void *pattern)
+{
+    const CopyPattern *cpat = (CopyPattern*)pattern;
+    const void *blockDim = NULL;
+
+    if (!cpat->generic) {
+        blockDim = &cpat->dim;
+    }
+
+    if (!cpat->zeroing) {
+        return copyDataBlockGen(ctx, blockDim, cpat->pgran, cpat->dtype,
+                                cpat->dir, cpat->flags);
+    }
+
+    return f4zeroBlockGen(ctx, blockDim, cpat->pgran, "__local");
+}
+/*
+ * Generate the block copying and zeroing functions used by image based
+ * kernels and store their names in copyFuncs.
+ *
+ * A single CopyPattern object is modified step by step and passed to
+ * findGenerateFunction() for each function; fields not reassigned before
+ * a call keep the value of the previous step, so the order of the
+ * statements matters.
+ *
+ * copyFuncs - receives the function names; entries are indexed with
+ *             MATRIX_A and MATRIX_B. Entries whose function is skipped
+ *             for the given function ID and flags are not written.
+ * ctx       - generator context
+ * funcID    - BLAS function; CLBLAS_TRMM gets a reduced set of functions
+ * gset      - generator settings providing the subproblem dimensions,
+ *             the kernel flags, the data type and the granularity
+ *
+ * Returns -ENOMEM if the guard cannot be created, otherwise the result of
+ * the last kgenAddBlankLine() call. The results of the
+ * findGenerateFunction() calls are not checked.
+ */
+int
+generateImageCopyFuncs(
+    CopyImgFuncs *copyFuncs,
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset)
+{
+    const SubproblemDim *dims = gset->subdims;
+    KernelExtraFlags kflags = gset->kextra->flags;
+    DataType dtype = gset->kextra->dtype;
+    const PGranularity *pgran = gset->pgran;
+    CopyPattern pattern;
+    // mandatory flags for global to local copying: [0] for A, [1] for B
+    DBlockCopyFlags glcpFlags[2] = {0, 0};
+    struct KgenGuard *guard;
+    unsigned int tsize;
+    int ret = 0;
+    bool isTra, areTails, isConjA;
+    bool customize;
+
+    if (kflags & KEXTRA_NO_COPY_VEC_A) {
+        glcpFlags[0] = DBLOCK_COPY_NOT_VECTORIZE;
+    }
+    if (kflags & KEXTRA_NO_COPY_VEC_B) {
+        glcpFlags[1] = DBLOCK_COPY_NOT_VECTORIZE;
+    }
+
+    tsize = dtypeSize(dtype);
+    isTra = isMatrixAccessColMaj(funcID, kflags, MATRIX_A);
+    isConjA = isMatrixConj(kflags, MATRIX_A);
+    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
+    customize = (funcID == CLBLAS_TRMM);    // only TRMM skips some of the functions below
+
+    guard = createKgenGuard(ctx, cpyImgGenCallback, sizeof(CopyPattern));
+    if (guard == NULL) {
+        return -ENOMEM;
+    }
+
+    memset(&pattern, 0, sizeof(pattern));
+
+    pattern.zeroing = false;
+    pattern.dim = dims[0];
+    pattern.dir = DBLOCK_GLOBAL_TO_IMAGE;
+    pattern.dtype = dtype;
+    pattern.flags = 0;
+    pattern.generic = false;
+    pattern.pgran = pgran;
+
+    if (!(customize && (isTra || isConjA))) {   // global to image, matrix A
+        pattern.dim.x = dims[0].bwidth;
+        pattern.dim.y = dims[0].y;
+        findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_A],
+                             FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    pattern.dim.x = dims[0].bwidth;             // global to image, matrix B
+    pattern.dim.y = dims[0].x;
+    findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_B],
+                         FUNC_NAME_MAXLEN);
+    kgenAddBlankLine(ctx);
+
+    pattern.dim.x = dims[0].bwidth;             // local to image, matrix A
+    pattern.dim.y = dims[1].y;
+    pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
+    findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_A],
+                         FUNC_NAME_MAXLEN);
+    kgenAddBlankLine(ctx);
+
+    pattern.dim.x = dims[0].bwidth;             // local to image, matrix B
+    pattern.dim.y = dims[1].x;
+    pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
+    findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_B],
+                         FUNC_NAME_MAXLEN);
+    kgenAddBlankLine(ctx);
+
+    // Global to local optimized; generated for TRMM or complex types only
+    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
+    if (customize || isComplexType(dtype)) {
+        pattern.flags = (!customize || isConjA) ? DBLOCK_COPY_CONJUGATE : 0;
+        pattern.flags |= glcpFlags[0];
+        pattern.dim.x = dims[0].bwidth;
+        pattern.dim.y = dims[1].y;
+        findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_A],
+                             FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    if ((funcID == CLBLAS_GEMM) && isComplexType(dtype)) {
+        pattern.flags = DBLOCK_COPY_CONJUGATE | glcpFlags[1];
+        pattern.dim.x = dims[0].bwidth;
+        pattern.dim.y = dims[1].x;
+        findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_B],
+                             FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    // Global to local generic; the callback does not pass the dimensions on
+    pattern.dim = dims[0];
+    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
+    pattern.generic = true;
+    if (!customize || areTails) {
+        pattern.flags = (isConjA) ? DBLOCK_COPY_CONJUGATE : 0;
+        pattern.flags |= glcpFlags[0];
+        findGenerateFunction(guard, &pattern,
+                             copyFuncs->globalToLocalGeneric[MATRIX_A],
+                             FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    pattern.flags = (kflags & KEXTRA_CONJUGATE_B) ? DBLOCK_COPY_CONJUGATE : 0;
+    pattern.flags |= glcpFlags[1];
+    findGenerateFunction(guard, &pattern,
+                         copyFuncs->globalToLocalGeneric[MATRIX_B],
+                         FUNC_NAME_MAXLEN);
+    kgenAddBlankLine(ctx);
+
+    // Global to local transposed functions
+    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
+    pattern.flags = (kflags & KEXTRA_NO_COPY_VEC_A) ?
+                    DBLOCK_COPY_NOT_VECTORIZE : 0;
+    pattern.flags |= glcpFlags[0];      // same flag as assigned just above
+    if (!customize || isTra) {
+        pattern.generic = false;
+        if (isConjA) {
+            pattern.flags |= DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE;
+        }
+        else {
+            pattern.flags |= DBLOCK_COPY_TRANSPOSE;
+        }
+        pattern.dim.x = dims[1].y;
+        pattern.dim.y = dims[0].bwidth;
+
+        findGenerateFunction(guard, &pattern,
+                             copyFuncs->globalToLocalTransposed[MATRIX_A],
+                             FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    if (!customize || (isTra && areTails)) {    // generic transposed copying for A
+        pattern.generic = true;
+        pattern.dim.x = 0;
+        pattern.dim.y = 0;
+        findGenerateFunction(guard, &pattern,
+                         copyFuncs->globalToLocalTransposedGeneric[MATRIX_A],
+                         FUNC_NAME_MAXLEN);
+        kgenAddBlankLine(ctx);
+    }
+
+    pattern.generic = false;                    // transposed copying for B
+    pattern.dim.x = dims[1].x;
+    pattern.dim.y = dims[0].bwidth;
+    if (kflags & KEXTRA_CONJUGATE_B) {
+        pattern.flags = DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE;
+    }
+    else {
+        pattern.flags = DBLOCK_COPY_TRANSPOSE;
+    }
+    pattern.flags |= glcpFlags[1];
+    findGenerateFunction(guard, &pattern,
+                         copyFuncs->globalToLocalTransposed[MATRIX_B],
+                         FUNC_NAME_MAXLEN);
+    kgenAddBlankLine(ctx);
+
+    pattern.generic = true;                     // generic transposed copying for B
+    pattern.dim.x = 0;
+    pattern.dim.y = 0;
+    findGenerateFunction(guard, &pattern,
+                         copyFuncs->globalToLocalTransposedGeneric[MATRIX_B],
+                         FUNC_NAME_MAXLEN);
+    kgenAddBlankLine(ctx);
+
+    // generate two local zeroing functions for matrix A and matrix B blocks;
+    // each is described as a single row of fl4RowWidth() * rows elements
+    pattern.zeroing = true;
+    pattern.dim = dims[0];
+    pattern.generic = false;
+    pattern.flags = 0;
+    pattern.dim.y = 1;
+    pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].y;
+
+    findGenerateFunction(guard, &pattern,
+                         copyFuncs->zeroBlock[MATRIX_A],
+                         FUNC_NAME_MAXLEN);
+    kgenAddBlankLine(ctx);
+
+    pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].x;
+    findGenerateFunction(guard, &pattern,
+                         copyFuncs->zeroBlock[MATRIX_B],
+                         FUNC_NAME_MAXLEN);
+    ret = kgenAddBlankLine(ctx);    // the only status propagated to the caller
+
+    destroyKgenGuard(guard);
+    return ret;
+}
+
+/*
+ * Generate the result update code using the update flags derived from
+ * the kernel flags with kextraToUpresFlags(). No cached name is passed
+ * to genResultUpdateWithFlagsOld().
+ */
+int
+generateResultUpdateOld(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    const char *optFuncName,
+    const char *genericFuncName)
+{
+    const UpdateResultFlags upresFlags =
+        kextraToUpresFlags(funcID, gset->kextra->flags);
+
+    return genResultUpdateWithFlagsOld(ctx, funcID, gset, upresFlags,
+                                       optFuncName, genericFuncName, NULL);
+}
+
+/*
+ * Generate the code updating the result matrix with the accumulated tile.
+ *
+ * ctx             - generator context
+ * funcID          - BLAS function; selects the result matrix and its
+ *                   leading dimension ("B"/"C" with "ldb" for functions
+ *                   with a triangular matrix, "C" with "ldc" otherwise)
+ * gset            - generator settings: kernel flags, variable names and
+ *                   the subproblem dimensions (the second level is used)
+ * flags           - update flags
+ * optFuncName     - name of the optimized update function; if NULL, the
+ *                   update is generated inline
+ * genericFuncName - name of the generic update function; it is used only
+ *                   when optFuncName is given and lower level tails are
+ *                   flagged, and must not be NULL in that case
+ * cachedName      - stored to the "cachedName" field of the variable names
+ *                   passed to updateResultGenOld()
+ *
+ * Returns 0 on success and -EOVERFLOW if any of the checked generation
+ * steps fails. The first failure is kept: closing the enclosing branch
+ * does not discard an error reported by an earlier step.
+ */
+int
+genResultUpdateWithFlagsOld(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    UpdateResultFlags flags,
+    const char *optFuncName,
+    const char *genericFuncName,
+    const char *cachedName)
+{
+    KernelExtraFlags kflags = gset->kextra->flags;
+    UpdateResultOp op;
+    char tmp[1024];
+    int ret = 0;
+    int stepRet;
+    const char *coordY, *coordX;
+    UpresVarNames uvars;
+    const KernelVarNames *kvarNames = &gset->varNames;
+    const SubproblemDim *dim = &gset->subdims[1];
+    /*
+     * The tile sizes are printed with "%lu"; keep them as unsigned long so
+     * that the argument type matches the conversion on every platform.
+     */
+    const unsigned long tileRows = (unsigned long)dim->y;
+    const unsigned long tileCols = (unsigned long)dim->x;
+    bool areTails, useCondition;
+
+    memset(&uvars, 0, sizeof(uvars));
+
+    coordX = kvarNames->coordB;
+    coordY = kvarNames->coordA;
+
+    if (funcHasTriangMatrix(funcID)) {
+        if (flags & UPRES_TRIANG_WRITE_C) {
+            uvars.result = "C";
+        }
+        else {
+            uvars.result = "B";
+        }
+        uvars.ld = "ldb";
+    }
+    else {
+        uvars.result = "C";
+        uvars.ld = "ldc";
+    }
+
+    uvars.cachedName = cachedName;
+
+    /* For now, kernels that do not use UPRES_EXCEED_PROBLEM_CONDITION
+     * must return in case problem exceeds more precise lower level conditions
+     * (KEXTRA_TAILS_M_LOWER, KEXTRA_TAILS_N_LOWER) before updating result
+    */
+    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
+    useCondition = areTails && ((flags & UPRES_EXCEED_PROBLEM_CONDITION) != 0);
+    if (useCondition) {
+        bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
+        bool tailN = (kflags & KEXTRA_TAILS_N) != 0;
+
+        if (tailM) {
+            if (tailN) {
+                sprintf(tmp, "if ((%s < %s) && (%s < %s))",
+                        coordY, kvarNames->sizeM, coordX, kvarNames->sizeN);
+            }
+            else {
+                sprintf(tmp, "if (%s < %s)", coordY, kvarNames->sizeM);
+            }
+        }
+        else {
+            // here tailN is true
+            sprintf(tmp, "if (%s < %s)", coordX, kvarNames->sizeN);
+        }
+        kgenBeginBranch(ctx, tmp);
+    }
+    else {
+        kgenAddBlankLine(ctx);
+    }
+
+    if (optFuncName) {
+        const char *betaStr;
+        betaStr = (flags & UPRES_WITH_BETA) ? ", beta" : "";
+
+        // update with functions invoking
+        if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) {
+            sprintf(tmp, "%s(%s, c, alpha, %s, %s, %s%s);\n",
+                    optFuncName, uvars.result, coordY, coordX,
+                    uvars.ld, betaStr);
+        }
+        else {
+            // full tiles go to the optimized function, tails to the generic
+            sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
+                         "uint x = min(%luu, %s - (uint)%s);\n"
+
+                         "if ((y == %lu) && (x == %lu)) {\n"
+                         "    %s(%s, c, alpha, %s, %s, %s%s);\n"
+                         "}\n"
+                         "else {\n"
+                         "    %s(%s, c, alpha, %s, %s, %s%s, y, x);\n"
+                         "}\n",
+                     tileRows, kvarNames->sizeM, coordY,
+                     tileCols, kvarNames->sizeN, coordX,
+                     tileRows, tileCols,
+                     optFuncName, uvars.result, coordY, coordX, uvars.ld,
+                     betaStr,
+                     genericFuncName, uvars.result, coordY, coordX, uvars.ld,
+                     betaStr);
+        }
+
+        kgenAddStmt(ctx, tmp);
+    }
+    else {
+        // inline result update
+        flags |= UPRES_INLINE;
+
+        op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;
+
+        uvars.startRow = coordY;
+        uvars.startCol = coordX;
+        uvars.nrRows = "y";
+        uvars.nrCols = "x";
+
+        if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) {
+            ret = updateResultGenOld(ctx, gset, op, flags, &uvars);
+        }
+        else {
+            sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
+                         "uint x = min(%luu, %s - (uint)%s);\n",
+                    tileRows, kvarNames->sizeM, coordY,
+                    tileCols, kvarNames->sizeN, coordX);
+            kgenAddStmt(ctx, tmp);
+
+            sprintf(tmp, "if ((y == %lu) && (x == %lu))",
+                    tileRows, tileCols);
+            kgenBeginBranch(ctx, tmp);
+            // optimized update
+            ret = updateResultGenOld(ctx, gset, op, flags, &uvars);
+            kgenEndBranch(ctx, NULL);
+
+            flags |= UPRES_GENERIC;
+            kgenBeginBranch(ctx, "else ");
+            // not optimized update
+            stepRet = updateResultGenOld(ctx, gset, op, flags, &uvars);
+            if (!ret) {
+                ret = stepRet;
+            }
+            stepRet = kgenEndBranch(ctx, NULL);
+            if (!ret) {
+                ret = stepRet;
+            }
+        }
+    }
+
+    if (useCondition) {
+        // close the problem size check without losing an earlier error
+        stepRet = kgenEndBranch(ctx, NULL);
+        if (!ret) {
+            ret = stepRet;
+        }
+    }
+
+    return (ret) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Generate the functions updating the result matrix.
+ *
+ * The optimized function is always generated and its name is stored to
+ * optFuncName. The generic function is generated, and its name stored to
+ * genericFuncName, only if the kernel flags report tails along M or N;
+ * otherwise genericFuncName is left untouched.
+ *
+ * ctx   - generator context
+ * gset  - generator settings providing the kernel flags
+ * flags - update flags; UPRES_WITH_BETA selects UPRES_SUM instead of
+ *         UPRES_SET
+ *
+ * Returns 0 on success and -EOVERFLOW if adding the blank line after
+ * either of the generated functions fails.
+ */
+int
+genUpresFuncsWithFlags(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    UpdateResultFlags flags,
+    char optFuncName[FUNC_NAME_MAXLEN],
+    char genericFuncName[FUNC_NAME_MAXLEN])
+{
+    KernelExtraFlags kflags = gset->kextra->flags;
+    UpdateResultOp op;
+    int ret;
+
+    op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;
+
+    updateResultGenOld(ctx, gset, op, flags, NULL);
+    ret = kgenAddBlankLine(ctx);
+    if (ret) {
+        return -EOVERFLOW;
+    }
+
+    kgenGetLastFuncName(optFuncName, FUNC_NAME_MAXLEN, ctx);
+
+    if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) {
+        flags |= UPRES_GENERIC;
+        updateResultGenOld(ctx, gset, op, flags, NULL);
+        // keep the status so that the check below covers this function too
+        ret = kgenAddBlankLine(ctx);
+        kgenGetLastFuncName(genericFuncName, FUNC_NAME_MAXLEN, ctx);
+    }
+
+    return (ret) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Generate the result updating functions using the update flags derived
+ * from the kernel flags with kextraToUpresFlags().
+ */
+int
+generateUpresFuncs(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    char optFuncName[FUNC_NAME_MAXLEN],
+    char genericFuncName[FUNC_NAME_MAXLEN])
+{
+    const UpdateResultFlags upresFlags =
+        kextraToUpresFlags(funcID, gset->kextra->flags);
+
+    return genUpresFuncsWithFlags(ctx, gset, upresFlags,
+                                  optFuncName, genericFuncName);
+}
diff --git a/src/library/blas/gens/legacy/gen_helper_legacy.h b/src/library/blas/gens/legacy/gen_helper_legacy.h
new file mode 100644
index 0000000..8b1acde
--- /dev/null
+++ b/src/library/blas/gens/legacy/gen_helper_legacy.h
@@ -0,0 +1,77 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef GEN_HELPER_LEGACY_H_
+#define GEN_HELPER_LEGACY_H_
+
+#include <kerngen.h>
+#include <dblock_kgen.h>
+#include <matrix_props.h>
+
+#include "../blas_kgen.h"
+
+/*
+ * Names of generated copy helper functions. Every field holds two names,
+ * each up to FUNC_NAME_MAXLEN characters long.
+ * NOTE(review): the two slots presumably correspond to the matrices A and B;
+ * confirm against generateImageCopyFuncs().
+ */
+typedef struct CopyImgFuncs {
+    char localToImage[2][FUNC_NAME_MAXLEN];
+    char globalToImage[2][FUNC_NAME_MAXLEN];
+    char globalToLocalTransposed[2][FUNC_NAME_MAXLEN];
+    char globalToLocalTransposedGeneric[2][FUNC_NAME_MAXLEN];
+    char globalToLocal[2][FUNC_NAME_MAXLEN];
+    char globalToLocalGeneric[2][FUNC_NAME_MAXLEN];
+    char zeroBlock[2][FUNC_NAME_MAXLEN];
+} CopyImgFuncs;
+
+/*
+ * NOTE(review): presumably generates the image copy helpers and stores their
+ * names to 'copyFuncs'; the implementation is not part of this header, so
+ * confirm the exact contract and the meaning of the return value there.
+ */
+int
+generateImageCopyFuncs(
+    CopyImgFuncs *copyFuncs,
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset);
+
+/*
+ * NOTE(review): presumably emits the result update code using the previously
+ * generated optimized and generic update functions given by name; confirm
+ * against the implementation.
+ */
+int
+generateResultUpdateOld(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    const char *optFuncName,
+    const char *genericFuncName);
+
+/*
+ * Variant of generateResultUpdateOld() taking explicit update flags and an
+ * additional 'cachedName' argument.
+ * NOTE(review): the semantics of 'cachedName' are not visible from this
+ * header; confirm against the implementation.
+ */
+int
+genResultUpdateWithFlagsOld(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    UpdateResultFlags flags,
+    const char *optFuncName,
+    const char *genericFuncName,
+    const char *cachedName);
+
+/*
+ * Generate the result update functions for the BLAS function 'funcID'.
+ * The update flags are derived from 'funcID' and the kernel extra flags of
+ * 'gset', after that the call is forwarded to genUpresFuncsWithFlags().
+ */
+int generateUpresFuncs(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    const BlasGenSettings *gset,
+    char optFuncName[FUNC_NAME_MAXLEN],
+    char genericFuncName[FUNC_NAME_MAXLEN]);
+
+/*
+ * Generate the result update functions for explicitly given update flags.
+ * The name of the first generated function is stored to 'optFuncName'.
+ * 'genericFuncName' is filled only if the kernel extra flags of 'gset'
+ * contain KEXTRA_TAILS_M or KEXTRA_TAILS_N, in which case a second function
+ * is generated with the UPRES_GENERIC flag added.
+ * Returns 0 on success, -EOVERFLOW on failure.
+ */
+int
+genUpresFuncsWithFlags(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    UpdateResultFlags flags,
+    char optFuncName[FUNC_NAME_MAXLEN],
+    char genericFuncName[FUNC_NAME_MAXLEN]);
+
+#endif /* GEN_HELPER_LEGACY_H_ */
diff --git a/src/library/blas/gens/legacy/tests/CMakeLists.txt b/src/library/blas/gens/legacy/tests/CMakeLists.txt
new file mode 100644
index 0000000..9c5a0f3
--- /dev/null
+++ b/src/library/blas/gens/legacy/tests/CMakeLists.txt
@@ -0,0 +1,63 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# Sources of the standalone block multiplication generator test: the legacy
+# generator itself, the kernel generator sources it is built together with,
+# and the test driver (t_blkmul.c).
+set(SRC_BLKMUL
+    ../blkmul.c
+    ${clBLAS_SOURCE_DIR}/library/common/kerngen_core.c
+    ${clBLAS_SOURCE_DIR}/library/common/kgen_basic.c
+    ${clBLAS_SOURCE_DIR}/library/common/kgen_loop_helper.c
+    ${clBLAS_SOURCE_DIR}/library/common/misc.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/blas_kgen.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/tile.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/tile_iter.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/legacy/blas_kgen_legacy.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/gen_helper.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/legacy/gen_helper_legacy.c
+    ${clBLAS_SOURCE_DIR}/library/blas/generic/blas_funcs.c
+    ${clBLAS_SOURCE_DIR}/library/blas/generic/matrix_dims.c
+    ${clBLAS_SOURCE_DIR}/library/blas/generic/matrix_props.c
+    ${clBLAS_SOURCE_DIR}/library/common/gens/dblock_kgen.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/tilemul.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/fetch.c
+    ${clBLAS_SOURCE_DIR}/library/common/kgen_guard.c
+    ${clBLAS_SOURCE_DIR}/library/common/list.c
+    ${clBLAS_SOURCE_DIR}/library/common/mutex.c
+    ${clBLAS_SOURCE_DIR}/library/common/trace_malloc.c
+    t_blkmul.c
+)
+
+# Header search paths: OpenCL plus the clBLAS source tree directories
+include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include
+                    ${clBLAS_SOURCE_DIR}/library/blas/include ${clBLAS_SOURCE_DIR}/library/blas/gens)
+
+# The test is a single executable linked against the OpenCL libraries
+add_executable(t_blkmul ${SRC_BLKMUL})
+target_link_libraries(t_blkmul ${OPENCL_LIBRARIES})
+
+# Install destinations carry a 64 or 32 suffix depending on TARGET_PLATFORM
+if( TARGET_PLATFORM EQUAL 64 )
+	# CPack configuration; include the executable into the package
+	install( TARGETS t_blkmul
+			RUNTIME DESTINATION bin64
+			LIBRARY DESTINATION lib64
+			ARCHIVE DESTINATION lib64/import
+			)
+else()
+	# CPack configuration; include the executable into the package
+	install( TARGETS t_blkmul
+			RUNTIME DESTINATION bin32
+			LIBRARY DESTINATION lib32
+			ARCHIVE DESTINATION lib32/import
+			)
+endif()
diff --git a/src/library/blas/gens/legacy/tests/t_blkmul.c b/src/library/blas/gens/legacy/tests/t_blkmul.c
new file mode 100644
index 0000000..4983ce0
--- /dev/null
+++ b/src/library/blas/gens/legacy/tests/t_blkmul.c
@@ -0,0 +1,733 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <CL/cl.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <kerngen.h>
+#include <blas_kgen.h>
+#include "../blas_kgen_legacy.h"
+
+// Compile-time parameters of the test
+enum {
+    // divisors used for the block height when image sizes are evaluated
+    // for the packed layout; their product gives the work group size
+    ITEM_WORK_M = 8,
+    ITEM_WORK_N = 8,
+    // used as both the global and the local work size of the test kernel
+    GROUP_SIZE = ITEM_WORK_M * ITEM_WORK_N,
+    // multiplier applied to the block width of the subproblem dimensions
+    BLOCKS_K = 2,
+    // factor used in the packed image layout calculations
+    PACK_RATE = 4,
+    // random matrix elements are generated as rand() % RAND_BOUND
+    RAND_BOUND = 10
+};
+
+// One pointer viewed as a pointer to any of the supported element types;
+// 'v' is the untyped view used for allocation and for OpenCL calls
+typedef union FPtr {
+    void *v;
+    cl_float *f;
+    cl_double *d;
+    cl_float2 *f2;
+    cl_double2 *d2;
+} FPtr;
+
+// Value of any of the supported element types. 'u' is the raw byte view that
+// is passed to clSetKernelArg() together with the element size; it is sized
+// after the largest member so that this view covers the complex types as
+// well (the size of the union itself is not affected).
+typedef union FType {
+    unsigned char u[sizeof(cl_double2)];
+    cl_float f;
+    cl_float2 f2;
+    cl_double d;
+    cl_double2 d2;
+} FType;
+
+/*
+ * Print the command line synopsis. The sizes and all the options are
+ * optional; the option spelling matches what main() actually parses.
+ */
+static void
+usage(void)
+{
+    printf("USAGE: t_blkmul type [M N K] [--iter i] [--imA] [--imB] [--alpha a] "
+           "[--img-packed]\n"
+           "type argument can be a value from the following list: s, d, c, z\n"
+           "iter - number of iterations\n"
+           "imA, imB - image usage for matrix\n"
+           "img-packed - store elements of matrix A or (and) B "
+           "to an image in the packed way\n");
+}
+
+/*
+ * Evaluate the sizes of an image holding a matrix block.
+ *
+ * In the plain layout the width is blockWidth * typeSize / 16 and the height
+ * is the block height. In the packed layout the width is multiplied and the
+ * height is divided by (blockHeight / divisor) * PACK_RATE, where the
+ * divisor is ITEM_WORK_M if 'AB' is not zero and ITEM_WORK_N otherwise.
+ */
+static void
+imageSizes(
+    int *height,
+    int *width,
+    int blockHeight,
+    int blockWidth,
+    int AB,
+    int typeSize,
+    int packed)
+{
+    int w = blockWidth * typeSize / 16;
+    int h = blockHeight;
+
+    if (packed) {
+        const int divisor = (AB) ? ITEM_WORK_M : ITEM_WORK_N;
+        const int factor = (blockHeight / divisor) * PACK_RATE;
+
+        w *= factor;
+        h /= factor;
+    }
+
+    *width = w;
+    *height = h;
+}
+
+/*
+ * Put the leading part of the test program into the generator context:
+ * the unified pointer declarations produced by kgenDeclareUptrs().
+ */
+void
+addTestPrefix(struct KgenContext *ctx, bool isDouble)
+{
+    (void)kgenDeclareUptrs(ctx, isDouble);
+}
+
+/*
+ * Append the "blkmul_test" kernel to the generated source. The kernel
+ * copies A and B to local memory unless they are passed as images, then
+ * 'iter' times zeroes its part of the local C block and calls the generated
+ * <type letter>gemmBlock_<m>_<n> function, and finally stores the local
+ * result to the global C buffer.
+ *
+ * ctx      - generator context the kernel source is added to
+ * subdims  - subproblem dimensions; subdims[1] gives the block sizes
+ *            m, n, k and subdims[0] gives the strides used to address
+ *            the local blocks
+ * type     - element data type; nothing is generated for an unknown type
+ * mulOpts  - block multiplication options: memory object types for A and B
+ *            and the packed image flag
+ *
+ * The scalar kernel arguments M, N, K and iter are declared as 'uint':
+ * OpenCL C does not allow size_t kernel arguments, and the host side sets
+ * them from cl_uint values.
+ */
+void
+addTestSuffix(
+    struct KgenContext *ctx,
+    const SubproblemDim subdims[2],
+    DataType type,
+    BlkMulOpts *mulOpts)
+{
+    char c;
+    char s[300];
+    bool isImageA, isImageB;
+    const char *tName;
+    // kept as unsigned long to match the %lu conversions used below
+    unsigned long m, n, k;
+    unsigned long blockWidth;
+    char imgXA[64], imgYA[64], imgXB[64], imgYB[64];
+    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(type);
+
+    isImageA = (mulOpts->aMobj == CLMEM_IMAGE);
+    isImageB = (mulOpts->bMobj == CLMEM_IMAGE);
+
+    m = (unsigned long)subdims[1].y;
+    n = (unsigned long)subdims[1].x;
+    k = (unsigned long)subdims[1].bwidth;
+    blockWidth = k * BLOCKS_K;
+
+    switch (type) {
+    case TYPE_FLOAT:
+        c = 's';
+        tName = "float";
+        break;
+    case TYPE_DOUBLE:
+        c = 'd';
+        tName = "double";
+        break;
+    case TYPE_COMPLEX_FLOAT:
+        c = 'c';
+        tName = "float2";
+        break;
+    case TYPE_COMPLEX_DOUBLE:
+        c = 'z';
+        tName = "double2";
+        break;
+    default:
+        return;
+    }
+
+    // kernel signature
+    kgenAddBlankLine(ctx);
+    kgenAddStmt(ctx, "__kernel void\n");
+    kgenAddStmt(ctx, "blkmul_test(\n");
+    sprintf(s,"    %s alpha,\n", tName);
+    kgenAddStmt(ctx, s);
+    if (isImageA) {
+        kgenAddStmt(ctx, "    __read_only image2d_t A,\n");
+    }
+    else {
+        sprintf(s,"    __global %s *A,\n", tName);
+        kgenAddStmt(ctx, s);
+    }
+    if (isImageB) {
+        kgenAddStmt(ctx, "    __read_only image2d_t B,\n");
+    }
+    else {
+        sprintf(s,"    __global %s *B,\n", tName);
+        kgenAddStmt(ctx, s);
+    }
+    kgenAddStmt(ctx, "    uint M,\n"
+                     "    uint N,\n"
+                     "    uint K,\n");
+    sprintf(s,"    __global %s *C,\n", tName);
+    kgenAddStmt(ctx, s);
+    kgenAddStmt(ctx, "    uint iter)\n");
+    kgenBeginFuncBody(ctx);
+    kgenAddStmt(ctx, "size_t i, j, it, m0, n0;\n");
+
+    // local storage for A, or image coordinates of the block of A
+    if (!isImageA) {
+        sprintf(s,"__local %s LA[%lu];\n", tName, m * ITEM_WORK_M * blockWidth);
+        kgenAddStmt(ctx, s);
+    }
+    else {
+        if (mulOpts->flags & BLKMUL_IMAGE_PACKED) {
+            sprintf(imgXA, "(m0 / %lu) %% %d * %lu", m, PACK_RATE,
+                    m * blockWidth / vecLen);
+            sprintf(imgYA, "m0 / %lu", m * PACK_RATE);
+        }
+        else {
+            strcpy(imgXA, "0");
+            strcpy(imgYA, "m0");
+        }
+    }
+
+    // local storage for B, or image coordinates of the block of B
+    if (!isImageB) {
+        sprintf(s,"__local %s LB[%lu];\n", tName, n * ITEM_WORK_N * blockWidth);
+        kgenAddStmt(ctx, s);
+    }
+    else {
+        if (mulOpts->flags & BLKMUL_IMAGE_PACKED) {
+            sprintf(imgXB, "(n0 / %lu) %% %d * %lu", n, PACK_RATE,
+                    n * blockWidth / vecLen);
+            sprintf(imgYB, "n0 / %lu", n * PACK_RATE);
+        }
+        else {
+            strcpy(imgXB, "0");
+            strcpy(imgYB, "n0");
+        }
+    }
+
+    sprintf(s,"__local %s LC[%lu];\n", tName, n * m * GROUP_SIZE);
+    kgenAddStmt(ctx, s);
+
+    // starting row and column processed by the work item
+    sprintf(s, "m0 = %lu * (get_global_id(0) / %d);\n"
+               "n0 = %lu * (get_global_id(0) %% %d);\n",
+            m, ITEM_WORK_N, n, ITEM_WORK_N);
+    kgenAddStmt(ctx, s);
+
+    if (!isImageA) {
+        kgenAddBlankLine(ctx);
+        sprintf(s, "for (i = m0; i < m0 + %lu; i++)", m);
+        kgenBeginBranch(ctx, s);
+        kgenBeginBranch(ctx, "for (j = 0; j < K; j++)");
+        kgenAddStmt(ctx,"LA[i * K + j] = A[i * K  + j];\n");
+        kgenEndBranch(ctx, NULL);
+        kgenEndBranch(ctx, NULL);
+    }
+
+    if (!isImageB) {
+        kgenAddBlankLine(ctx);
+        sprintf(s, "for (i = n0; i < n0 + %lu; i++)", n);
+        kgenBeginBranch(ctx, s);
+        kgenBeginBranch(ctx,"for (j = 0; j < K; j++)");
+        kgenAddStmt(ctx,"LB[i * K + j] = B[i * K  + j];\n");
+        kgenEndBranch(ctx, NULL);
+        kgenEndBranch(ctx, NULL);
+    }
+
+    kgenAddBlankLine(ctx);
+
+    // iteration loop: zero the local result and multiply the blocks
+    kgenAddBlankLine(ctx);
+    kgenBeginBranch(ctx,"for (it = 0; it < iter; it++)");
+    sprintf(s, "for (i = m0; i < m0 + %lu; i++)", m);
+    kgenBeginBranch(ctx, s);
+    sprintf(s, "for (j = n0; j < n0 + %lu; j++)", n);
+    kgenBeginBranch(ctx, s);
+    kgenAddStmt(ctx,"LC[i * N + j] = 0;\n");
+    kgenEndBranch(ctx, NULL);
+    kgenEndBranch(ctx, NULL);
+
+    if (isImageA) {
+        if (isImageB) {
+            sprintf(s, "%cgemmBlock_%lu_%lu(alpha, A, (int2)(%s, %s), B, "
+                       "(int2)(%s, %s), (LPtr)(LC + m0 * %lu + n0));\n",
+                    c, m, n, imgXA, imgYA, imgXB, imgYB,
+                    (unsigned long)subdims[0].x);
+        }
+        else {
+            sprintf(s, "%cgemmBlock_%lu_%lu(alpha, A, (int2)(%s, %s), "
+                       "(LPtr)(LB + n0 * %lu), (LPtr)(LC + m0 * %lu + n0));\n",
+                    c, m, n, imgXA, imgYA, (unsigned long)subdims[0].bwidth,
+                    (unsigned long)subdims[0].x);
+        }
+    }
+    else {
+        if (isImageB) {
+            sprintf(s, "%cgemmBlock_%lu_%lu(alpha, (LPtr)(LA + m0 * %lu), B, "
+                       "(int2)(%s, %s), (LPtr)(LC + m0 * %lu + n0));\n",
+                    c, m, n, (unsigned long)subdims[0].bwidth, imgXB, imgYB,
+                    (unsigned long)subdims[0].x);
+        }
+        else {
+            sprintf(s, "%cgemmBlock_%lu_%lu(alpha, (LPtr)(LA + m0 * %lu), "
+                       "(LPtr)(LB + n0 * %lu), (LPtr)(LC + m0 * %lu + n0));\n",
+                    c, m, n, (unsigned long)subdims[0].bwidth,
+                    (unsigned long)subdims[0].bwidth,
+                    (unsigned long)subdims[0].x);
+        }
+    }
+    kgenAddStmt(ctx, s);
+    kgenEndBranch(ctx, NULL);
+
+    // store the local result to the global memory
+    kgenAddBlankLine(ctx);
+    sprintf(s, "for (i = m0; i < m0 + %lu; i++)", m);
+    kgenBeginBranch(ctx, s);
+    sprintf(s, "for (j = n0; j < n0 + %lu; j++)", n);
+    kgenBeginBranch(ctx, s);
+    kgenAddStmt(ctx,"C[i * N  + j] = LC[i * N + j];\n");
+    kgenEndBranch(ctx, NULL);
+    kgenEndBranch(ctx, NULL);
+
+    kgenEndFuncBody(ctx);
+}
+
+/*
+ * Build the generated program, run the "blkmul_test" kernel on the first
+ * GPU device of the first platform and compare its output with a naive CPU
+ * multiplication. Prints "Match" or the first differing element, and the
+ * kernel execution time taken from the event profiling information.
+ *
+ * ker      - source of the program containing the "blkmul_test" kernel
+ * M, N, K  - sizes of the multiplied matrices
+ * alpha    - scalar multiplier, stored in the member matching 'type'
+ * type     - element data type
+ * mulOpts  - memory object types for A and B and the packed image flag
+ * iter     - number of iterations passed to the kernel
+ *
+ * Returns CL_SUCCESS if the kernel has been executed and its result has
+ * been read back (regardless of the comparison result), or the OpenCL error
+ * code of the first failed step. All the OpenCL objects and the host
+ * buffers are released on every path.
+ */
+cl_int
+run (char *ker, cl_uint M, cl_uint N, cl_uint K, FType alpha, DataType type, BlkMulOpts *mulOpts, cl_uint iter)
+{
+    cl_int err;
+    cl_platform_id platform;
+    cl_device_id device;
+    cl_context ctx = NULL;
+    cl_command_queue queue = NULL;
+    cl_program program = NULL;
+    cl_kernel kernel = NULL;
+    cl_mem imA = NULL, imB = NULL, bufC = NULL;
+    cl_event evt = NULL;
+    FType tmp;
+    FPtr A, B, C, C_naive;
+    bool is_complex = type == TYPE_COMPLEX_FLOAT || type == TYPE_COMPLEX_DOUBLE;
+    bool is_double = type == TYPE_DOUBLE || type == TYPE_COMPLEX_DOUBLE;
+    cl_uint nwords = (is_complex) ? 2 : 1;
+    unsigned int tsize = dtypeSize(type);
+    const cl_image_format image_format = {CL_RGBA, CL_FLOAT};
+    // element counts are evaluated in size_t to avoid 32-bit wrap around
+    const size_t elemsA = (size_t)M * K;
+    const size_t elemsB = (size_t)K * N;
+    const size_t elemsC = (size_t)M * N;
+    size_t i, j, k;
+    size_t globalWorkSize[1] = {GROUP_SIZE};
+    size_t localWorkSize[1] = {GROUP_SIZE};
+    char buildLog[100000];
+    cl_ulong sTime = 0, fTime = 0;
+    const char *kernelName = "blkmul_test";
+    int imgWidth, imgHeight;
+    bool packed = ((mulOpts->flags & BLKMUL_IMAGE_PACKED) != 0);
+
+    A.v = NULL;
+    B.v = NULL;
+    C.v = NULL;
+    C_naive.v = NULL;
+
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    /* Prepare OpenCL kernel and its arguments */
+
+    program = clCreateProgramWithSource(ctx, 1, (const char**)&ker, NULL, &err);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
+    if (err != CL_SUCCESS){
+        // keep the log printable even if it cannot be retrieved
+        buildLog[0] = '\0';
+        clGetProgramBuildInfo (program,
+            device,
+            CL_PROGRAM_BUILD_LOG,
+            sizeof(buildLog),
+            buildLog,
+            NULL);
+        printf("%s", buildLog);
+        goto cleanup;
+    }
+
+    kernel = clCreateKernel(program, kernelName, &err);
+    if (err != CL_SUCCESS){
+        goto cleanup;
+    }
+
+    /* Memory allocation */
+
+    A.v = malloc(elemsA * tsize);
+    B.v = malloc(elemsB * tsize);
+    C.v = malloc(elemsC * tsize);
+    C_naive.v = malloc(elemsC * tsize);
+    if ((A.v == NULL) || (B.v == NULL) || (C.v == NULL) ||
+        (C_naive.v == NULL)) {
+        err = CL_OUT_OF_HOST_MEMORY;
+        goto cleanup;
+    }
+
+    srand(0);
+    if (is_double) {
+        for(i = 0; i < elemsA * nwords; i++){
+            A.d[i] = (double)(rand() % RAND_BOUND);
+        }
+        for(i = 0; i < elemsB * nwords; i++){
+            B.d[i] = (double)(rand() % RAND_BOUND);
+        }
+        for(i = 0; i < elemsC * nwords; i++){
+            C.d[i] = 0.0;
+            C_naive.d[i] = 0.0;
+        }
+    }
+    else {
+        for(i = 0; i < elemsA * nwords; i++){
+            A.f[i] = (float)(rand() % RAND_BOUND);
+        }
+        for(i = 0; i < elemsB * nwords; i++){
+            B.f[i] = (float)(rand() % RAND_BOUND);
+        }
+        for(i = 0; i < elemsC * nwords; i++){
+            C.f[i] = 0.0;
+            C_naive.f[i] = 0.0;
+        }
+    }
+
+    /*
+     * A and B are only read by the kernel, hence they are created as
+     * read-only objects; reading a write-only object inside a kernel
+     * is undefined.
+     */
+    if (mulOpts->aMobj == CLMEM_IMAGE) {
+        imageSizes(&imgHeight, &imgWidth, M, K, 0, tsize, packed);
+        imA = clCreateImage2D (ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+            &image_format, imgWidth, imgHeight, 0, A.v, &err);
+    }
+    else {
+        imA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+            elemsA * tsize, A.v, &err);
+    }
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+    if (mulOpts->bMobj == CLMEM_IMAGE) {
+        imageSizes(&imgHeight, &imgWidth, N, K, 0, tsize, packed);
+        imB = clCreateImage2D (ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+            &image_format, imgWidth, imgHeight, 0, B.v, &err);
+    }
+    else {
+        imB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+            elemsB * tsize, B.v, &err);
+    }
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        elemsC * tsize, C.v, &err);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    err = clEnqueueWriteBuffer (queue,
+        bufC,
+        CL_TRUE,
+        0,
+        elemsC * tsize,
+        C.v,
+        0,
+        NULL,
+        NULL);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    /* Argument setting and kernel execution */
+    err = clSetKernelArg(kernel, 0, tsize, alpha.u);
+    err |= clSetKernelArg(kernel, 1, sizeof(imA), &imA);
+    err |= clSetKernelArg(kernel, 2, sizeof(imB), &imB);
+    err |= clSetKernelArg(kernel, 3, sizeof(M), &M);
+    err |= clSetKernelArg(kernel, 4, sizeof(N), &N);
+    err |= clSetKernelArg(kernel, 5, sizeof(K), &K);
+    err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC);
+    err |= clSetKernelArg(kernel, 7, sizeof(iter), &iter);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
+        globalWorkSize, localWorkSize, 0,
+        NULL, &evt);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    err = clFinish(queue);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    err = clEnqueueReadBuffer (queue,
+        bufC,
+        CL_TRUE,
+        0,
+        elemsC * tsize,
+        C.v,
+        0,
+        NULL,
+        NULL);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+
+    /* Naive CPU multiplication */
+    if (is_double) {
+        if (is_complex) {
+            for (i = 0; i < M; i++) {
+                for (j = 0; j < N; j++) {
+                    for (k = 0; k < K; k++) {
+                        C_naive.d[(i * N + j) * 2] +=
+                            A.d[(i * K + k) * 2] * B.d[(j * K + k) * 2] -
+                            A.d[(i * K + k) * 2 + 1] * B.d[(j * K + k) * 2 + 1];
+
+                        C_naive.d[(i * N + j) * 2 + 1] +=
+                            A.d[(i * K + k) * 2] * B.d[(j * K + k) * 2 + 1] +
+                            A.d[(i * K + k) * 2 + 1] * B.d[(j * K + k) * 2];
+                    }
+
+                    tmp.d2.s[0] = C_naive.d[(i * N + j) * 2] * alpha.d2.s[0] -
+                                  C_naive.d[(i * N + j) * 2 + 1] * alpha.d2.s[1];
+                    tmp.d2.s[1] = C_naive.d[(i * N + j) * 2] * alpha.d2.s[1] +
+                                  C_naive.d[(i * N + j) * 2 + 1] * alpha.d2.s[0];
+                    C_naive.d[(i * N + j) * 2] = tmp.d2.s[0];
+                    C_naive.d[(i * N + j) * 2 + 1] = tmp.d2.s[1];
+                }
+            }
+
+            for (i = 0; i < elemsC; i++) {
+                if ((C.d[i * 2] != C_naive.d[i * 2])  ||
+                    (C.d[i * 2 + 1] !=  C_naive.d[i * 2 + 1])) {
+                    printf("Differ at (%lu, %lu): (%lf; %lf) != (%lf; %lf)\n",
+                           (unsigned long)(i / N), (unsigned long)(i % N),
+                           C.d[i * 2], C.d[i * 2 + 1],
+                           C_naive.d[i * 2], C_naive.d[i * 2 + 1]);
+                    break;
+                }
+            }
+            if (i == elemsC) {
+                printf("Match\n");
+            }
+        }
+        else {
+            for (i = 0; i < M; i++) {
+                for (j = 0; j < N; j++) {
+                    for (k = 0; k < K; k++) {
+                        C_naive.d[i * N + j] += A.d[i * K + k] * B.d[j * K + k];
+                    }
+                    C_naive.d[i * N + j] *= alpha.d;
+                }
+            }
+
+            for (i = 0; i < elemsC; i++) {
+                if (C.d[i] != C_naive.d[i]) {
+                    printf("Differ at (%lu, %lu): %lf != %lf\n",
+                           (unsigned long)(i / N), (unsigned long)(i % N),
+                           C.d[i], C_naive.d[i]);
+                    break;
+                }
+            }
+            if (i == elemsC) {
+                printf("Match\n");
+            }
+        }
+    }
+    else {
+        if (is_complex) {
+            for (i = 0; i < M; i++) {
+                for (j = 0; j < N; j++) {
+                    for (k = 0; k < K; k++) {
+                        C_naive.f[(i * N + j) * 2] +=
+                            A.f[(i * K + k) * 2] * B.f[(j * K + k) * 2] -
+                            A.f[(i * K + k) * 2 + 1] * B.f[(j * K + k) * 2 + 1];
+
+                        C_naive.f[(i * N + j) * 2 + 1] +=
+                            A.f[(i * K + k) * 2] * B.f[(j * K + k) * 2 + 1] +
+                            A.f[(i * K + k) * 2 + 1] * B.f[(j * K + k) * 2];
+                    }
+
+                    tmp.f2.s[0] = C_naive.f[(i * N + j) * 2] * alpha.f2.s[0] -
+                                  C_naive.f[(i * N + j) * 2 + 1] * alpha.f2.s[1];
+                    tmp.f2.s[1] = C_naive.f[(i * N + j) * 2] * alpha.f2.s[1] +
+                                  C_naive.f[(i * N + j) * 2 + 1] * alpha.f2.s[0];
+                    C_naive.f[(i * N + j) * 2] = tmp.f2.s[0];
+                    C_naive.f[(i * N + j) * 2 + 1] = tmp.f2.s[1];
+                }
+            }
+
+            for (i = 0; i < elemsC; i++) {
+                if ((C.f[i * 2] != C_naive.f[i * 2]) ||
+                    (C.f[i * 2 + 1] != C_naive.f[i * 2 + 1])) {
+                    printf("Differ at (%lu, %lu): (%lf; %lf) != (%lf; %lf)\n",
+                           (unsigned long)(i / N), (unsigned long)(i % N),
+                           C.f[i * 2], C.f[i * 2 + 1],
+                           C_naive.f[i * 2], C_naive.f[i * 2 + 1]);
+                    break;
+                }
+            }
+            if (i == elemsC) {
+                printf("Match\n");
+            }
+        }
+        else {
+            for (i = 0; i < M; i++) {
+                for (j = 0; j < N; j++) {
+                    for (k = 0; k < K; k++) {
+                        C_naive.f[i * N + j] += A.f[i * K + k] * B.f[j * K + k];
+                    }
+                    C_naive.f[i * N + j] *= alpha.f;
+                }
+            }
+
+            for (i = 0; i < elemsC; i++) {
+                if (C.f[i] != C_naive.f[i]) {
+                    printf("Differ at (%lu, %lu): %lf != %lf\n",
+                           (unsigned long)(i / N), (unsigned long)(i % N),
+                           C.f[i], C_naive.f[i]);
+                    break;
+                }
+            }
+            if (i == elemsC) {
+                printf("Match\n");
+            }
+        }
+    }
+    /* End of naive CPU multiplication */
+
+    if ((clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,
+                                 sizeof(cl_ulong), &sTime, NULL) == CL_SUCCESS) &&
+        (clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,
+                                 sizeof(cl_ulong), &fTime, NULL) == CL_SUCCESS)) {
+        // a zero iteration count must not lead to division by zero
+        printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n",
+            (int)((fTime-sTime)/1000000),
+            (iter) ? (int)((fTime-sTime)/iter) : 0);
+    }
+    else {
+        printf("Profiling information is not available\n");
+    }
+
+cleanup:
+    if (evt != NULL) {
+        clReleaseEvent(evt);
+    }
+    if (bufC != NULL) {
+        clReleaseMemObject(bufC);
+    }
+    if (imB != NULL) {
+        clReleaseMemObject(imB);
+    }
+    if (imA != NULL) {
+        clReleaseMemObject(imA);
+    }
+    if (kernel != NULL) {
+        clReleaseKernel(kernel);
+    }
+    if (program != NULL) {
+        clReleaseProgram(program);
+    }
+    if (queue != NULL) {
+        clReleaseCommandQueue(queue);
+    }
+    clReleaseContext(ctx);
+
+    // the host memory backs the memory objects, so it is freed after them
+    free(C_naive.v);
+    free(C.v);
+    free(B.v);
+    free(A.v);
+
+    return err;
+}
+
+/*
+ * Test entry point: parse the command line, generate the block
+ * multiplication function together with the test kernel around it, and
+ * run the kernel.
+ *
+ * Returns 0 if the test has been executed, and 1 on wrong command line
+ * arguments, on failure to create the generator context or on an OpenCL
+ * error reported by run().
+ */
+int main(int argc, char *argv[])
+{
+    char out[65535];
+    SubproblemDim subdims[2];
+    BlkMulOpts mulOpts;
+    DataType dtype;
+    int i;
+    int val;
+    int sizes[3];
+    cl_uint iter = 1, blockM = 4, blockN = 4, blockK = 8;
+    struct KgenContext *ctx;
+    FType alpha;
+    int cmdAlpha = 0;
+    cl_int err;
+
+    mulOpts.aMobj = CLMEM_BUFFER;
+    mulOpts.bMobj = CLMEM_BUFFER;
+    mulOpts.flags = BLKMUL_NO_FLAGS;
+
+    // no part of alpha may be left uninitialized
+    memset(&alpha, 0, sizeof(alpha));
+
+    // parse command line
+
+    if (argc < 2) {
+        usage();
+        return 1;
+    }
+
+    if (!strcmp(argv[1], "s")) {
+        dtype = TYPE_FLOAT;
+        alpha.f = 1;
+    }
+    else if (!strcmp(argv[1], "d")) {
+        dtype = TYPE_DOUBLE;
+        alpha.d = 1;
+    }
+    else if (!strcmp(argv[1], "c")) {
+        dtype = TYPE_COMPLEX_FLOAT;
+        alpha.f2.s[0] = 1;
+        alpha.f2.s[1] = 0;
+    }
+    else if (!strcmp(argv[1], "z")) {
+        dtype = TYPE_COMPLEX_DOUBLE;
+        alpha.d2.s[0] = 1;
+        alpha.d2.s[1] = 0;
+    }
+    else {
+        printf("Wrong type specified: %s\n", argv[1]);
+        return 1;
+    }
+
+    for (i = 2; i < argc; i++) {
+        if (strcmp(argv[i], "--imA") == 0) {
+            mulOpts.aMobj = CLMEM_IMAGE;
+            continue;
+        }
+        if (strcmp(argv[i], "--imB") == 0) {
+            mulOpts.bMobj = CLMEM_IMAGE;
+            continue;
+        }
+        if (strcmp(argv[i], "--img-packed") == 0) {
+            mulOpts.flags |= BLKMUL_IMAGE_PACKED;
+            continue;
+        }
+
+        if (strcmp(argv[i], "--iter") == 0) {
+            if (i + 1 == argc) {
+                printf("Error: 'iter' argument is not specified\n");
+                usage();
+                return 1;
+            }
+            val = atoi(argv[i + 1]);
+            if (val <= 0) {
+                printf("Error: 'iter' must be a positive integer\n");
+                usage();
+                return 1;
+            }
+            iter = (cl_uint)val;
+            i++;
+            continue;
+        }
+
+        if (strcmp(argv[i], "--alpha") == 0) {
+            if (i + 1 == argc) {
+                printf("Error: 'alpha' argument is not specified\n");
+                usage();
+                return 1;
+            }
+            cmdAlpha = atoi(argv[i + 1]);
+            i++;
+            continue;
+        }
+
+        // anything else must be the three block sizes
+        if (i + 2 >= argc) {
+            printf("Error: Not all sizes are specified\n");
+            usage();
+            return 1;
+        }
+        sizes[0] = atoi(argv[i]);
+        sizes[1] = atoi(argv[i + 1]);
+        sizes[2] = atoi(argv[i + 2]);
+        if ((sizes[0] <= 0) || (sizes[1] <= 0) || (sizes[2] <= 0)) {
+            printf("Error: sizes must be positive integers\n");
+            usage();
+            return 1;
+        }
+        blockM = (cl_uint)sizes[0];
+        blockN = (cl_uint)sizes[1];
+        blockK = (cl_uint)sizes[2];
+        i += 2;
+    }
+
+    // a zero value means that alpha has not been set in the command line
+    if (cmdAlpha) {
+        switch (dtype) {
+        case TYPE_FLOAT:
+            alpha.f = cmdAlpha;
+            break;
+        case TYPE_DOUBLE:
+            alpha.d = cmdAlpha;
+            break;
+        case TYPE_COMPLEX_FLOAT:
+            alpha.f2.s[0] = cmdAlpha;
+            alpha.f2.s[1] = -cmdAlpha / 2;
+            break;
+        case TYPE_COMPLEX_DOUBLE:
+            alpha.d2.s[0] = cmdAlpha;
+            alpha.d2.s[1] = -cmdAlpha / 2;
+            break;
+        default:
+            break;
+        }
+    }
+
+    subdims[0].y = blockM * ITEM_WORK_M;
+    subdims[0].x = blockN * ITEM_WORK_N;
+    subdims[0].bwidth = blockK * BLOCKS_K;
+    subdims[1].y = blockM;
+    subdims[1].x = blockN;
+    subdims[1].bwidth = blockK;
+
+    // the context is created only after the arguments have been validated,
+    // so the error paths above have nothing to release
+    ctx = createKgenContext(out, sizeof(out), 1);
+    if (ctx == NULL) {
+        printf("Error: failed to create the kernel generator context\n");
+        return 1;
+    }
+
+    memset(out, 0, sizeof(out));
+
+    addTestPrefix(ctx, isDoubleBasedType(dtype));
+
+    blkMulGen(ctx, subdims, dtype, &mulOpts);
+
+    addTestSuffix(ctx, subdims, dtype, &mulOpts);
+
+    err = run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha,
+              dtype, &mulOpts, iter);
+
+    destroyKgenContext(ctx);
+
+    if (err != CL_SUCCESS) {
+        printf("Error: the test run failed with OpenCL error %d\n", (int)err);
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/src/library/blas/gens/legacy/trmm_img.c b/src/library/blas/gens/legacy/trmm_img.c
new file mode 100644
index 0000000..9a69a2d
--- /dev/null
+++ b/src/library/blas/gens/legacy/trmm_img.c
@@ -0,0 +1,850 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * trmm image based generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include <ctype.h>
+
+#include <clBLAS.h>
+#include <matrix_dims.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <dis_warning.h>
+
+#include "blas_kgen_legacy.h"
+#include "../gen_helper.h"
+#include "gen_helper_legacy.h"
+#include "trxm_common_legacy.h"
+
+/*
+ * Pattern specific extra data. initTrmmImgPattern() fills it in and attaches
+ * it to the memory pattern it initializes.
+ */
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Produces the source of the computing kernel; genWrapper() calls it when
+ * the kernel type is CLBLAS_COMPUTING_KERNEL.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+/*
+ * Produces the source of an image preparation kernel (for matrix A or for
+ * matrix B); genWrapper() calls it for every non computing kernel type.
+ */
+static ssize_t
+preparator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+/*
+ * Source generation entry point of the solver: looks at the kernel type kept
+ * in the extra data and forwards all arguments either to generator()
+ * (computing kernel) or to preparator() (any other kernel type).
+ * Returns whatever the selected generator returns.
+ */
+static ssize_t
+genWrapper(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra)
+{
+    const CLBLASKernExtra *info = (const CLBLASKernExtra*)extra;
+    const bool isCompute = (info->kernType == CLBLAS_COMPUTING_KERNEL);
+
+    return isCompute ? generator(buf, buflen, subdims, pgran, extra)
+                     : preparator(buf, buflen, subdims, pgran, extra);
+}
+
+/* Sets the kernel arguments for the computing and both preparation kernels. */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+/*
+ * Tells whether the MATRIX_C block of the second decomposition level fits
+ * into 'ldsSize' bytes.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+/* Returns the solver flags of this pattern. */
+static SolverFlags
+solverFlags(void);
+
+/*
+ * Computes the global thread counts for the kernel selected by the kernel
+ * type stored in the arguments.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+/* Reports the performance estimation of this pattern. */
+static int getPerf(
+    unsigned int kflags,
+    const void *args);
+
+/*
+ * Solver operations of the image based trmm pattern; initTrmmImgPattern()
+ * installs this table into the memory pattern. Slots left NULL are
+ * operations this file does not supply.
+ */
+static SolverOps imgSops = {
+    genWrapper,
+    assignKargs,
+    isFitToLDS,
+    getPerf,
+    NULL,
+    calcNrThreads,
+    NULL,
+    solverFlags,
+    NULL, //fixupKargs
+    NULL, //getDefaultDecomp
+    NULL, //getDecompList
+    NULL,
+    NULL
+};
+
+/*
+ * Fill the buffer copy function names used for matrix A from the image copy
+ * function names.
+ *
+ * The 'write' name is taken from the local-to-image function. The 'read' and
+ * 'readGeneric' names for MATRIX_A are taken from the transposed
+ * global-to-local functions when A is accessed in the column major way for
+ * TRMM, and from the non transposed ones otherwise. Every name is copied as
+ * FUNC_NAME_MAXLEN bytes.
+ */
+static void
+imgToCopyBufFuncs(
+    CopyBufFuncs *bufFuncs,
+    const CopyImgFuncs *imgFuncs,
+    KernelExtraFlags kflags)
+{
+    const void *readName;
+    const void *readGenericName;
+
+    // select the reader pair matching the access order of A
+    if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) {
+        readName = imgFuncs->globalToLocalTransposed[MATRIX_A];
+        readGenericName = imgFuncs->globalToLocalTransposedGeneric[MATRIX_A];
+    }
+    else {
+        readName = imgFuncs->globalToLocal[MATRIX_A];
+        readGenericName = imgFuncs->globalToLocalGeneric[MATRIX_A];
+    }
+
+    memcpy(bufFuncs->write, imgFuncs->localToImage, FUNC_NAME_MAXLEN);
+    memcpy(bufFuncs->read[MATRIX_A], readName, FUNC_NAME_MAXLEN);
+    memcpy(bufFuncs->readGeneric[MATRIX_A], readGenericName,
+           FUNC_NAME_MAXLEN);
+}
+
+/*
+ * Generate the "<prefix>prepareImageA" kernel, which packs blocks of the
+ * triangular matrix A into the image 'imgA'.
+ *
+ * ctx          - generator context the kernel source is appended to
+ * subdims      - subproblem dimensions; the second level (subdims[1])
+ *                defines the block processed by one work group
+ * kflags       - kernel extra flags (triangle, access order, conjugation,
+ *                vectorized copying permission)
+ * dtype        - data type of the matrix elements
+ * copyImgFuncs - names of the previously generated copying functions
+ * pgran        - parallelism granularity, used to declare group/local IDs
+ *
+ * Layout of the generated body:
+ *  - if A is accessed neither in the column major way nor conjugated,
+ *    blocks lying entirely in the stored triangle are copied straight from
+ *    the global memory to the image when they are complete and vectorized
+ *    copying is allowed; all the rest is done only if that did not happen;
+ *  - blocks lying entirely in the opposite triangle are zeroed;
+ *  - other blocks are read into the local memory and passed through the
+ *    triangular block processing;
+ *  - finally the local block is written to the image.
+ *
+ * Block sizes are converted explicitly to unsigned long before being printed
+ * with "%lu", so the argument type matches the conversion on any data model.
+ */
+static void
+genPrepKernelA(
+    struct KgenContext *ctx,
+    const SubproblemDim *subdims,
+    KernelExtraFlags kflags,
+    DataType dtype,
+    CopyImgFuncs *copyImgFuncs,
+    const PGranularity *pgran)
+{
+    char tmp[4096];
+    bool isBranch = false;
+    size_t localBufSize;
+    unsigned int tsize, vecLen;
+    const char *typeName;
+    CopyBufFuncs copyBufFuncs;
+    char fpref;
+    // block sizes in the exact type expected by "%lu"
+    const unsigned long blockY = (unsigned long)subdims[1].y;
+    const unsigned long blockBw = (unsigned long)subdims[1].bwidth;
+
+    fpref = dtypeToBlasPrefix(dtype);
+    typeName = dtypeBuiltinType(dtype);
+    tsize = dtypeSize(dtype);
+    vecLen = sizeof(cl_float4) / tsize;
+    localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize);
+    localBufSize *= vecLen;
+    imgToCopyBufFuncs(&copyBufFuncs, copyImgFuncs, kflags);
+
+    sprintf(tmp, "void __kernel\n"
+                 "%cprepareImageA(\n"
+                 "    uint M,\n"
+                 "    __global %s *A,\n"
+                 "    uint lda,\n"
+                 "    __write_only image2d_t imgA,\n"
+                 "    uint startM,\n"
+                 "    uint origM,\n"
+                 "    uint offA)\n",
+            fpref, typeName);
+    kgenDeclareFunction(ctx, tmp);
+    kgenBeginFuncBody(ctx);
+
+    kgenDeclareGroupID(ctx, "gid", pgran);
+    kgenDeclareLocalID(ctx, "lid", pgran);
+    // 'bpr' is origM divided by the block width, rounded up
+    sprintf(tmp, "const uint bpr = (origM + %lu) / %lu;\n"
+                 "uint currM = startM + (gid / bpr) * %lu;\n"
+                 "uint k0 = (gid %% bpr) * %lu;\n"
+                 "uint x, y;\n"
+                 "__local %s tempA[%lu];\n"
+                 "bool processed = false;\n\n",
+            blockBw - 1, blockBw, blockY,
+            blockBw, typeName, (unsigned long)localBufSize);
+    kgenAddStmt(ctx, tmp);
+
+    kgenAddStmt(ctx, "A += offA;\n");
+    if (!(isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A) ||
+          isMatrixConj(kflags, MATRIX_A))) {
+
+        // blocks lying entirely in the stored triangle
+        if (isMatrixUpper(kflags)) {
+            sprintf(tmp, "if (k0 >= currM + %lu)", blockY);
+        }
+        else {
+            sprintf(tmp, "if (k0 + %lu <= currM)", blockBw);
+        }
+        kgenBeginBranch(ctx, tmp);
+        sprintf(tmp, "if ((currM + %lu <= M + startM) && "
+                         "(k0 + %lu <= origM) && %d) {\n"
+                     // write directly to an image from the global memory
+                     "    %s(imgA, k0 / %u, currM - startM, (GPtr)A, "
+                            "currM, k0, lda);\n"
+                     "    processed = true;\n"
+                     "}\n",
+                blockY, blockBw,
+                (kflags & KEXTRA_NO_COPY_VEC_A) == 0,
+                copyImgFuncs->globalToImage[MATRIX_A], vecLen);
+
+        kgenAddStmt(ctx, tmp);
+        kgenEndBranch(ctx, NULL);
+
+        kgenBeginBranch(ctx, "if (!processed)");
+        isBranch = true;
+    }
+
+    // now, zeroing blocks entirely located in the "other" triangle
+    if (isMatrixUpper(kflags)) {
+        sprintf(tmp, "if (k0 + %lu <= currM) {\n"
+                     "    %s((__local float4*)tempA);\n"
+                     "}\n",
+                blockBw, copyImgFuncs->zeroBlock[MATRIX_A]);
+    }
+    else {
+        sprintf(tmp, "if (k0 >= currM + %lu) {\n"
+                     "    %s((__local float4*)tempA);\n"
+                     "}\n",
+                blockY, copyImgFuncs->zeroBlock[MATRIX_A]);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    // useful block path, reading data from the global memory to the local one
+    kgenBeginBranch(ctx, "else");
+    kgenAddStmt(ctx, "M += startM;\n");
+    genPrepareTrxmBlockA(ctx, subdims, dtype, &copyBufFuncs,
+                         (ZeroFuncs*)copyImgFuncs->zeroBlock,
+                         kflags, "origM");
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenAddStmt(ctx, "M -= startM;\n");
+    genTriangMatrBlock(ctx, subdims, dtype, kflags);
+    kgenEndBranch(ctx, NULL);
+
+    // and write to the image
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    sprintf(tmp, "%s(imgA, k0 / %u, currM - startM, (LPtr)tempA);\n",
+            copyImgFuncs->localToImage[MATRIX_A], vecLen);
+    kgenAddStmt(ctx, tmp);
+    // close the "if (!processed)" branch when it has been opened
+    if (isBranch) {
+        kgenEndBranch(ctx, NULL);
+    }
+
+    kgenEndFuncBody(ctx);
+}
+
+/*
+ * Generate the "<prefix>prepareImageB" kernel, which packs blocks of the
+ * matrix B into the image 'imgB'.
+ *
+ * ctx          - generator context the kernel source is appended to
+ * subdims      - subproblem dimensions; subdims[1].x and subdims[1].bwidth
+ *                define the block processed by one work group
+ * dtype        - data type of the matrix elements
+ * copyImgFuncs - names of the previously generated copying functions
+ * pgran        - parallelism granularity, used to declare the group ID
+ * kflags       - kernel extra flags; only KEXTRA_NO_COPY_VEC_B is checked
+ *
+ * In the generated code a complete ("aligned") non transposed block is
+ * copied straight from the global memory to the image. Any other block goes
+ * through the local memory: it is zeroed if it lies in the padding or is
+ * incomplete, filled with the generic or the optimized (transposed) copying
+ * function, and then written to the image.
+ *
+ * Block sizes are converted explicitly to the types required by the "%lu"
+ * and "%u" conversions they are printed with; passing them as they are does
+ * not match the conversions where the widths of the types differ.
+ */
+static void
+genPrepKernelB(
+    struct KgenContext *ctx,
+    const SubproblemDim *subdims,
+    DataType dtype,
+    CopyImgFuncs *copyImgFuncs,
+    const PGranularity *pgran,
+    KernelExtraFlags kflags)
+{
+    char tmp[4096];
+    size_t localBufSize;
+    unsigned int tsize, vecLen;
+    const char *typeName;
+    char fpref;
+    // block sizes for the "%u" conversions of the kernel body template
+    const unsigned int blockX = (unsigned int)subdims[1].x;
+    const unsigned int blockBw = (unsigned int)subdims[1].bwidth;
+    // block sizes for the "%lu" conversions of the kernel head template
+    const unsigned long blockXl = (unsigned long)subdims[1].x;
+    const unsigned long blockBwl = (unsigned long)subdims[1].bwidth;
+
+    const char *funcHead =
+        "bool trb, aligned;\n"
+        "const uint bpr = (origM + %lu) / %lu;\n"
+        "const uint n = startN + (gid / bpr) * %lu;\n"
+        "const uint k = (gid %% bpr) * %lu;\n"
+        "uint x, y;\n"
+        "__local %s temp[%lu];\n"
+        "\n"
+        "B += offB;\n"
+        "trb = (order == clblasRowMajor) ^ (side == clblasRight);\n"
+        "N += startN;\n";
+
+    const char *funcBody =
+        "//copy matrix B block\n"
+        "y = n + %u <= N ? %u : N - n;\n"
+        "x = k + %u <= origM ? %u : origM - k;\n"
+        "aligned = (x == %u) && (y == %u) && %d;\n"
+        "if (aligned && !trb) {\n"
+        "    %s(imgB, k / %u, n - startN, (GPtr)B, n, k, ldb);\n"
+        "}\n"
+        "else {\n"
+        "    if (n >= N) {\n"
+                // just zero, this is padding related part
+        "        %s((__local float4*)temp);\n"
+        "    }\n"
+        "    else if (!aligned) {\n"
+        "        // zero local memory\n"
+        "        %s((__local float4*)temp);\n"
+        "        barrier(CLK_LOCAL_MEM_FENCE);\n"
+        "        if (trb) {\n"
+        "            // generic transposed global to local\n"
+        "            %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n"
+        "        }\n"
+        "        else {\n"
+        "            // generic global to local\n"
+        "            %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n"
+        "        }\n"
+        "    }\n"
+        "    else {\n"
+        "        if (trb) {//transposed, aligned\n"
+        "            // optimized transposed global to local\n"
+        "            %s((LPtr)temp, (GPtr)B, k, n, ldb);\n"
+        "        }\n"
+        "    }\n"
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+        "    %s(imgB, k / %u, n - startN, (LPtr)temp);\n"
+        "}\n"
+        "\n";
+
+    fpref = dtypeToBlasPrefix(dtype);
+    typeName = dtypeBuiltinType(dtype);
+    tsize = dtypeSize(dtype);
+    vecLen = sizeof(cl_float4) / tsize;
+    localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize);
+    localBufSize *= vecLen;
+
+    sprintf(tmp, "void __kernel\n"
+                 "%cprepareImageB(\n"
+                 "    clblasOrder order,\n"
+                 "    clblasSide side,\n"
+                 "    uint N,\n"
+                 "    __global %s *B,\n"
+                 "    uint ldb,\n"
+                 "    __write_only image2d_t imgB,\n"
+                 "    uint startN,\n"
+                 "    uint origM,\n"
+                 "    uint offB)\n",
+            fpref, typeName);
+    kgenDeclareFunction(ctx, tmp);
+    kgenBeginFuncBody(ctx);
+
+    kgenDeclareGroupID(ctx, "gid", pgran);
+    sprintf(tmp, funcHead,
+            blockBwl - 1, blockBwl,
+            blockXl, blockBwl,
+            typeName, (unsigned long)localBufSize);
+    kgenAddStmt(ctx, tmp);
+
+    sprintf(tmp, funcBody,
+            blockX, blockX, // y = n + dy <= N ?...
+            blockBw,
+            blockBw, // x = k + bw <= M ?...
+            blockBw,
+            blockX, // aligned = (x==bw1)&&(y==dx1)
+            (kflags & KEXTRA_NO_COPY_VEC_B) == 0,
+            copyImgFuncs->globalToImage[MATRIX_B],
+            vecLen,
+            copyImgFuncs->zeroBlock[MATRIX_B],
+            copyImgFuncs->zeroBlock[MATRIX_B],
+            copyImgFuncs->globalToLocalTransposedGeneric[MATRIX_B],
+            blockBw,
+            copyImgFuncs->globalToLocalGeneric[MATRIX_B],
+            blockBw,
+            copyImgFuncs->globalToLocalTransposed[MATRIX_B],
+            copyImgFuncs->localToImage[MATRIX_B],
+            vecLen);
+    kgenAddStmt(ctx, tmp);
+
+    kgenEndFuncBody(ctx);
+}
+
+/*
+ * Emit the declaration (attributes, name and parameter list) of the
+ * "<prefix>trmmImg" computing kernel.
+ *
+ * The required work group size is taken from 'pgran'. The names of the two
+ * size parameters and of the two "start" parameters are "M", "N" in this
+ * order, or "N", "M" when KEXTRA_SIDE_RIGHT is set in 'kflags'.
+ */
+static void
+declareMainKernel(
+    struct KgenContext *ctx,
+    DataType dtype,
+    KernelExtraFlags kflags,
+    const PGranularity *pgran)
+{
+    char decl[4048];
+    const char prefix = dtypeToBlasPrefix(dtype);
+    const char *elemType = dtypeBuiltinType(dtype);
+    const bool rightSide = ((kflags & KEXTRA_SIDE_RIGHT) != 0);
+    // coordinate letters in the order they appear in the parameter list
+    const char first = rightSide ? 'N' : 'M';
+    const char second = rightSide ? 'M' : 'N';
+
+    sprintf(decl, "__attribute__((reqd_work_group_size(%u, %u, 1)))\n"
+                  "void __kernel\n"
+                  "%ctrmmImg(\n"
+                  "    uint %c,\n"
+                  "    uint %c,\n"
+                  "    const %s alpha,\n"
+                  "    const __read_only image2d_t A,\n"
+                  "    const __read_only image2d_t B,\n"
+                  "    __global %s *C,\n"
+                  "    uint ldb,\n"
+                  "    const uint start%c,\n"
+                  "    const uint start%c,\n"
+                  "    const uint origM,\n"
+                  "    const uint offB)\n",
+            pgran->wgSize[0], pgran->wgSize[1], prefix,
+            first, second,
+            elemType, elemType,
+            first, second);
+
+    kgenDeclareFunction(ctx, decl);
+}
+
+/*
+ * Source generator of the image preparation kernels.
+ *
+ * Writes into 'buf' (of 'buflen' bytes) the declarations, the memory to
+ * image copying functions and then the preparation kernel for matrix A when
+ * the kernel type is CLBLAS_PREP_A_KERNEL, or for matrix B otherwise.
+ *
+ * Returns the size of the generated source plus one on success, -ENOMEM if
+ * the generator context cannot be created, -EOVERFLOW if appending the final
+ * blank line fails.
+ */
+static ssize_t
+preparator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    struct KgenContext *ctx;
+    CopyImgFuncs imgFuncs;
+    BlasGenSettings settings;
+    ssize_t status;
+    bool isDouble;
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    // generator settings shared with the copying functions generator
+    memset(&imgFuncs, 0, sizeof(imgFuncs));
+    memset(&settings, 0, sizeof(settings));
+    memcpy(settings.subdims, subdims, sizeof(settings.subdims));
+    settings.kextra = kextra;
+    settings.pgran = pgran;
+
+    isDouble = isDoubleBasedType(kextra->dtype);
+    kgenDeclareUptrs(ctx, isDouble);
+    // the B preparation kernel takes the order and the side as arguments
+    if (kextra->kernType == CLBLAS_PREP_B_KERNEL) {
+        declareBlasEnums(ctx);
+    }
+
+    // generate necessary memory to image copying functions
+    generateImageCopyFuncs(&imgFuncs, ctx, CLBLAS_TRMM, &settings);
+    kgenAddBlankLine(ctx);
+
+    if (kextra->kernType != CLBLAS_PREP_A_KERNEL) {
+        genPrepKernelB(ctx, subdims, kextra->dtype, &imgFuncs, pgran,
+                       kextra->flags);
+    }
+    else {
+        genPrepKernelA(ctx, subdims, kextra->flags, kextra->dtype,
+                       &imgFuncs, pgran);
+    }
+
+    status = kgenAddBlankLine(ctx);
+    if (status == 0) {
+        status = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+    destroyKgenContext(ctx);
+
+    if (status < 0) {
+        return -EOVERFLOW;
+    }
+    return status;
+}
+
+/*
+ * Set the variable names used by the generated kernel code: image names,
+ * size names, and for each of the matrices A and B the coordinate vector
+ * component, which is ".x" when the matrix is accessed in the column major
+ * way for TRMM and ".y" otherwise.
+ */
+static void
+initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
+{
+    const bool colMajA = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A);
+    const bool colMajB = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B);
+
+    kvars->A = "imgA";
+    kvars->B = "imgB";
+    kvars->coordA = colMajA ? "coordA.x" : "coordA.y";
+    kvars->coordB = colMajB ? "coordB.x" : "coordB.y";
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "K";
+}
+
+/*
+ * Source generator of the image based trmm computing kernel.
+ *
+ * buf, buflen - destination buffer for the source and its size
+ * subdims     - subproblem dimensions of both decomposition levels
+ * pgran       - parallelism granularity (gives the work group size)
+ * extra       - CLBLASKernExtra with the data type and the kernel flags
+ *
+ * The generated kernel declares its coordinates, zeroes the private result
+ * tile, runs the block multiplication in a loop over K (its bounds depend on
+ * the triangle of A), rotates the rows of the result tile back by the row
+ * skew and finally updates the result in the global memory.
+ *
+ * Returns the size of the generated source plus one on success, -ENOMEM if
+ * the generator context cannot be created, -EOVERFLOW if the block
+ * multiplier cannot be generated or the final blank line cannot be appended.
+ *
+ * Block sizes are converted explicitly to unsigned long before being printed
+ * with "%lu", so the argument type matches the conversion on any data model.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    char tmp[4096], tmp1[4096];
+    char *p;
+    const char *typeName;
+    DataType dtype = kextra->dtype;
+    ssize_t ret;
+    BlasGenSettings gset;
+    BlkMulOpts mulOpts;
+    unsigned int tsize;
+    unsigned int vecLen, outVecLen;
+    bool b;
+    const char *outTypeName;
+    unsigned int i;
+    unsigned int nrRegs, regPitch;
+    KernelExtraFlags kflags = kextra->flags;
+    int tra, trb;
+    char coordNames[2] = {'M', 'N'};
+    char vect[2] = {'y', 'x'};
+    // step over K in vectors, in the exact type expected by "%lu"
+    unsigned long stepK;
+
+    const char *coordConstants =
+        "const uint workItemM = startM + get_global_id(0) * %lu;\n"
+        "const uint workItemN = startN + get_global_id(1) * %lu;\n"
+        "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
+        "uint vectK = (origM + %u) / %u;\n";
+
+    /*
+     *  template for image based trmm preparation part
+     *  for two dimensional work space
+     */
+    const char *localVariables =
+        "uint k0;\n"
+        "int2 coordA = (int2)(0, workItemM - startM);\n"
+        "int2 coordB = (int2)(0, workItemN - startN);\n"
+        "%s c[%u];\n\n";
+
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+    initKernelVarNames(&gset.varNames, kflags);
+
+    tsize = dtypeSize(dtype);
+    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+    stepK = (unsigned long)(subdims[1].bwidth / vecLen);
+    if (isComplexType(dtype)) {
+        regPitch = (unsigned int)subdims[1].x;
+    }
+    else {
+        regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) *
+                                             sizeof(cl_float4) / tsize;
+    }
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    // at first, generate needed declarations and auxiliary functions
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+
+    typeName = dtypeBuiltinType(dtype);
+
+    // now, generate the kernel
+    declareMainKernel(ctx, dtype, kflags, pgran);
+    ret = kgenBeginFuncBody(ctx);
+
+    // constants
+    sprintf(tmp, coordConstants,
+            (unsigned long)subdims[1].y, (unsigned long)subdims[1].x,
+            (unsigned long)subdims[1].y,
+            vecLen - 1, vecLen);
+    kgenAddStmt(ctx, tmp);
+
+    /*
+     * Calculate local buffer pitches, and then declare local
+     * variables
+     */
+    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
+
+    sprintf(tmp, localVariables, outTypeName, nrRegs);
+    kgenAddStmt(ctx, tmp);
+
+    // check if offset exceeds matrix
+    kgenAddStmt(ctx, "if ((workItemM >= startM + M) ||"
+                         "(workItemN >= startN + N)) {\n"
+                     "    return;\n"
+                     "}\n");
+
+    // zero C block
+    sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
+                 "    c[k0] = 0;\n"
+                 "}\n\n",
+            nrRegs);
+    kgenAddStmt(ctx, tmp);
+
+    // loop over K
+    if (isMatrixUpper(kflags)) {
+        sprintf(tmp, "coordA.x = vectK - %lu;\n"
+                     "coordB.x = coordA.x;\n",
+                stepK);
+        kgenAddStmt(ctx, tmp);
+        sprintf(tmp, "for (k0 = ((workItemM/%lu)*%lu)/%u; "
+                          "k0 < vectK; k0 += %lu)",
+                (unsigned long)subdims[0].bwidth,
+                (unsigned long)subdims[0].bwidth, vecLen,
+                stepK);
+    }
+    else {
+        size_t dk;
+
+        dk = (subdims[1].y > subdims[1].bwidth) ? subdims[1].y :
+                                                  subdims[1].bwidth;
+        dk = dk / vecLen + 1;
+        sprintf(tmp, "for (k0 = 0; "
+                          "k0 < min((workItemM+%u)/%u + %lu, vectK); "
+                          "k0 += %lu)",
+                vecLen - 1, vecLen, (unsigned long)dk, stepK);
+    }
+    kgenBeginBranch(ctx, tmp);
+
+    mulOpts.aMobj = CLMEM_IMAGE;
+    mulOpts.bMobj = CLMEM_IMAGE;
+    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE |
+                    BLKMUL_AVOID_AND;
+    if (isComplexType(dtype)) {
+        mulOpts.core = BLKMUL_SEPARATE_MULADD;
+    }
+    else {
+        mulOpts.core = BLKMUL_MAD;
+    }
+    mulOpts.argNames.coordA = "coordA";
+    mulOpts.argNames.coordB = "coordB";
+    mulOpts.argNames.skewCol = "skewCol";
+    mulOpts.argNames.skewRow = "skewRow";
+    mulOpts.argNames.k = "k0";
+    mulOpts.argNames.vectBoundK = "vectK";
+    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
+    if (ret) {
+        destroyKgenContext(ctx);
+        return -EOVERFLOW;
+    }
+
+    // update image coordinates
+    if (isMatrixUpper(kflags)) {
+        // In this case loop is inverted to avoid 'random' skews
+        sprintf(tmp, "\ncoordA.x -= %lu;\n"
+                     "coordB.x -= %lu;\n",
+                stepK, stepK);
+    }
+    else {
+        sprintf(tmp, "\ncoordA.x += %lu;\n"
+                     "coordB.x += %lu;\n",
+                stepK, stepK);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    kgenEndBranch(ctx, NULL);
+    // reorder the given solution
+    outVecLen = isComplexType(dtype) ? 1 : vecLen;
+    p = tmp1;
+    // keep tmp1 a valid empty string even if the loop below does not run
+    tmp1[0] = '\0';
+    for (i = 0; i < regPitch / outVecLen; i++) {
+        unsigned int k = (unsigned int)(subdims[1].y - 1)
+                                         * regPitch / outVecLen + i;
+
+        /*
+         * The start value of 'j' is printed as a signed number: for a tile
+         * of one row it is -1, so the generated loop is just skipped. An
+         * unsigned subtraction would wrap and emit a huge literal instead.
+         */
+        sprintf(p,  "\n"
+                    "    tmp = c[%u];\n"
+                    "    for (j = %ld; j >= 0; j--) {\n"
+                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
+                    "    }\n"
+                    "    c[%u] = tmp;\n",
+                k, (long)subdims[1].y - 2, regPitch / outVecLen,
+                i, regPitch / outVecLen, i, i);
+        p += strlen(p);
+    }
+    sprintf(tmp, "\n"
+                 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
+                 "    int j;\n"
+                 "    %s tmp;\n"
+                 "%s"
+                 "}\n"
+                 "\n",
+                 outTypeName, tmp1);
+    kgenAddStmt(ctx, tmp);
+
+    // write back the tile evaluated
+    tra = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_A);
+    trb = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_B);
+    sprintf(tmp, "coordA.%c = workItemM - startM;\n"
+                 "coordB.%c = workItemN - startN;\n\n",
+            vect[tra], vect[trb]);
+    kgenAddStmt(ctx, tmp);
+    kgenBeginBranch(ctx, NULL);
+    // from here 'trb' holds the access order of the output matrix
+    trb = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_C);
+    sprintf(tmp, "__global %s *B = C + offB + start%c * ldb + start%c;\n\n",
+            typeName, coordNames[trb], coordNames[1 - trb]);
+
+    kgenAddStmt(ctx, tmp);
+    generateResultUpdateOld(ctx, CLBLAS_TRMM, &gset, NULL, NULL);
+    kgenEndBranch(ctx, NULL);
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+
+
+/*
+ * Fill the kernel argument array 'args' from the BLAS arguments 'params'
+ * for the kernel kind stored in params->kernType.
+ *
+ * The preparation kernel for A gets the size and the offset selected by the
+ * multiplication side (M for the left side, N for the right one); the
+ * preparation kernel for B gets the opposite ones. Nothing is assigned for
+ * an unknown kernel type. 'extra' is not used.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)params;
+    const int rightSide = (kargs->side == clblasRight);
+    size_t dims[2];
+    size_t starts[2];
+
+    (void)extra;
+
+    dims[0] = kargs->M;
+    dims[1] = kargs->N;
+    starts[0] = kargs->offsetM;
+    starts[1] = kargs->offsetN;
+
+    switch (kargs->kernType) {
+    case CLBLAS_COMPUTING_KERNEL:
+        // sizes, alpha, both images, output buffer and its layout
+        initSizeKarg(&args[0], kargs->M);
+        initSizeKarg(&args[1], kargs->N);
+        assignScalarKarg(&args[2], &(kargs->alpha), kargs->dtype);
+        INIT_KARG(&args[3], kargs->scimage[0]);
+        INIT_KARG(&args[4], kargs->scimage[1]);
+        initMemobjKarg(&args[5], kargs->B, NULL, 0, 0);
+        initSizeKarg(&args[6], kargs->ldb.matrix);
+        initSizeKarg(&args[7], kargs->offsetM);
+        initSizeKarg(&args[8], kargs->offsetN);
+        initSizeKarg(&args[9], kargs->K);
+        initSizeKarg(&args[10], kargs->offBX);
+        break;
+    case CLBLAS_PREP_A_KERNEL:
+        // source matrix A and the image it is packed to
+        initSizeKarg(&args[0], dims[rightSide]);
+        initMemobjKarg(&args[1], kargs->A, NULL, 0, 0);
+        initSizeKarg(&args[2], kargs->lda.matrix);
+        INIT_KARG(&args[3], kargs->scimage[0]);
+        initSizeKarg(&args[4], starts[rightSide]);
+        initSizeKarg(&args[5], kargs->K);
+        initSizeKarg(&args[6], kargs->offA);
+        break;
+    case CLBLAS_PREP_B_KERNEL:
+        // order and side first, then matrix B and the image it is packed to
+        INIT_KARG(&args[0], kargs->order);
+        INIT_KARG(&args[1], kargs->side);
+        initSizeKarg(&args[2], dims[1 - rightSide]);
+        initMemobjKarg(&args[3], kargs->B, NULL, 0, 0);
+        initSizeKarg(&args[4], kargs->ldb.matrix);
+        INIT_KARG(&args[5], kargs->scimage[1]);
+        initSizeKarg(&args[6], starts[1 - rightSide]);
+        initSizeKarg(&args[7], kargs->K);
+        initSizeKarg(&args[8], kargs->offBX);
+        break;
+    default:
+        //this should not happen
+        break;
+    }
+}
+
+/*
+ * Check whether the MATRIX_C block of the second level subproblem (dim[1]),
+ * sized for the multiplication side taken from the kernel arguments, takes
+ * not more than 'ldsSize' bytes.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
+    cl_ulong nrElems;
+    cl_ulong nrBytes;
+
+    nrElems = matrBlockSize(&dim[1], MATRIX_C, dtype, kargs->side);
+    nrBytes = nrElems * dtypeSize(dtype);
+
+    return (nrBytes <= ldsSize);
+}
+
+/*
+ * Calculate the global thread counts for the kernel selected by the kernel
+ * type stored in the arguments.
+ *
+ * For the computing kernel the job is delegated to calcGlobalThreads(). For
+ * a preparation kernel the number of work groups is the number of blocks
+ * along the prepared matrix size (rounded up) multiplied by the number of
+ * blocks along K (rounded up); threads[0] is that number multiplied by the
+ * first work group size and threads[1] is the second work group size.
+ * 'extra' is not used.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra)
+{
+    const CLBlasKargs *kargs = args;
+    const bool rightSide = (kargs->side == clblasRight);
+    // inner sizes with respect of multiplication side
+    const size_t m = rightSide ? kargs->N : kargs->M;
+    const size_t n = rightSide ? kargs->M : kargs->N;
+    // K keeps the original N (right side) or the original M (left side)
+    const size_t k = kargs->K;
+    size_t whole, part;
+    size_t groupsAlong, groupsK;
+
+    (void)extra;
+
+    if (kargs->kernType == CLBLAS_COMPUTING_KERNEL) {
+        calcGlobalThreads(threads, &subdims[0], pgran, m, n);
+        return;
+    }
+
+    // each thread gets one block
+    if (kargs->kernType == CLBLAS_PREP_A_KERNEL) {
+        whole = m;
+        part = subdims[0].itemY;
+    }
+    else {
+        whole = n;
+        part = subdims[0].itemX;
+    }
+
+    groupsAlong = whole / part + (whole % part != 0);
+    groupsK = k / subdims[0].bwidth + (k % subdims[0].bwidth != 0);
+
+    threads[0] = pgran->wgSize[0] * (groupsAlong * groupsK);
+    threads[1] = pgran->wgSize[1];
+}
+
+/* Solver flags of the image based trmm pattern: only SF_WSPACE_2D is set. */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags flags = SF_WSPACE_2D;
+
+    return flags;
+}
+
+/*
+ * Initialize 'mempat' as the image based block trmm memory pattern: two
+ * decomposition levels, the solver operations of this file, and the file
+ * local extra data describing both A and B as image objects.
+ */
+void
+initTrmmImgPattern(MemoryPattern *mempat)
+{
+    // extra data: A and B are images using the L1 and LDS levels
+    mpatExtra.mobjA = CLMEM_IMAGE;
+    mpatExtra.mobjB = CLMEM_IMAGE;
+    mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
+
+    mempat->name = "Image based block trmm";
+    mempat->sops = &imgSops;
+    mempat->extra = &mpatExtra;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+/*
+ * Performance estimation of the pattern: PPERF_POOR regardless of the
+ * kernel flags and of the arguments, both of which are ignored.
+ */
+static int
+getPerf(
+    unsigned int kflags,
+    const void *args)
+{
+    const int perf = PPERF_POOR;
+
+    DUMMY_ARG_USAGE(kflags);
+    DUMMY_ARG_USAGE(args);
+
+    return perf;
+}
diff --git a/src/library/blas/gens/legacy/trmm_lds.c b/src/library/blas/gens/legacy/trmm_lds.c
new file mode 100644
index 0000000..d7fe882
--- /dev/null
+++ b/src/library/blas/gens/legacy/trmm_lds.c
@@ -0,0 +1,514 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * LDS based generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <matrix_dims.h>
+#include <dis_warning.h>
+
+#include "../init.h"
+#include "blas_kgen_legacy.h"
+#include "gen_helper_legacy.h"
+#include "../gen_helper.h"
+#include "../trxm_common.h"
+#include "trxm_common_legacy.h"
+
+static CLBLASMpatExtra mpatExtra;
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static SolverFlags
+solverFlags(void);
+
+static int
+getPerf( unsigned int kflags,
+    const void *args);
+
+static SolverOps solverOps = {
+    generator,
+    assignKargs,
+    isFitToLDS,
+    getPerf,
+    NULL,
+    NULL,
+    NULL,
+    solverFlags,
+    NULL, //fixupKargs
+    NULL, //getDefaultDecomp
+    NULL, //getDecompList
+    NULL,
+    NULL
+};
+
+static void
+genPrepareBlockC(
+    struct KgenContext *ctx,
+    const ZeroFuncs *zeroFuncs)
+{
+    char tmp[2048];
+    /* add a statement calling zeroFuncs->names[MATRIX_C] on tempC */
+    sprintf(tmp, "%s((__local float4*)tempC);\n", zeroFuncs->names[MATRIX_C]);
+    kgenAddStmt(ctx, tmp);
+}
+
+static void
+genWriteBlockB(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024];
+    size_t pitch;
+    const char *coordName[2] = {"currM", "currN"};
+    int trb;
+    /* trb is used as a 0/1 index: 0 passes (currM, currN), 1 swaps them */
+    trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_C);
+    pitch = matrBlockPitch(dim, MATRIX_C, dtype, clblasLeft);
+    /* without M/N tails a single call of copyFuncs->write is emitted */
+    if (!(kflags & (KEXTRA_TAILS_N | KEXTRA_TAILS_M))) {
+        sprintf(tmp, "%s((GPtr)B, (LPtr)tempC, %s, %s, ldb);\n",
+                copyFuncs->write, coordName[trb], coordName[1 - trb]);
+    }
+    else {
+        sprintf(tmp,
+                "y = (currM + %lu <= M) ? %lu : M - currM;\n"
+                "x = (currN + %lu <= N) ? %lu : N - currN;\n"
+                "if ((y == %lu) && (x == %lu)) {\n"
+                     // full block: fast write
+                "    %s((GPtr)B, (LPtr)tempC, %s, %s, ldb);\n"
+                "}\n"
+                "else {\n"
+                     // tail block: slow generic write bounded by y and x
+                "    %s((GPtr)B, (LPtr)tempC, %s, %s, y, x, ldb, %lu);\n"
+                "}\n\n",
+                dim->y, dim->y, dim->x, dim->x, dim->y, dim->x,
+                copyFuncs->write, coordName[trb], coordName[1 - trb],
+                copyFuncs->writeGeneric, coordName[trb],
+                coordName[1 - trb], pitch);
+    }
+
+    kgenAddStmt(ctx, tmp);
+}
+
+static void
+genInitCurrM(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024];
+    /* emit the initial value of currM: 0 for upper, last block otherwise */
+    if (isMatrixUpper(kflags)) {
+        strcpy(tmp, "currM = 0;\n");
+    }
+    else {   /* largest multiple of dim->y that does not exceed M - 1 */
+        sprintf(tmp, "currM = (M - 1) / %lu * %lu;\n", dim->y, dim->y);
+    }
+
+    kgenAddStmt(ctx, tmp);
+    kgenAddBlankLine(ctx);
+}
+
+static void
+genInternalLoopCtl(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024];
+    /* build the header of the k0 loop and open it as a branch */
+    if (isMatrixUpper(kflags)) {   /* upper: k0 steps down by dim->bwidth */
+        if (!(kflags & KEXTRA_TAILS_M)) {   /* start at M - bwidth */
+            sprintf(tmp, "for (k0 = M - %lu; (k0 + %lu > currM) && (k0 < M); "
+                         "k0 -= %lu)",
+                    dim->bwidth, dim->bwidth, dim->bwidth);
+        }
+        else {   /* KEXTRA_TAILS_M: start at (M - 1) / bwidth * bwidth */
+            sprintf(tmp, "for (k0 = (M - 1) / %lu * %lu; k0 + %lu > currM; "
+                              "k0 -= %lu)",
+                    dim->bwidth, dim->bwidth, dim->bwidth, dim->bwidth);
+        }
+    }
+    else {   /* not upper: k0 steps up from 0 while below currM + dim->y */
+        sprintf(tmp, "for (k0 = 0; (k0 < currM + %lu) && (k0 < M); "
+                          "k0 += %lu)",
+                dim->y, dim->bwidth);
+    }
+
+    kgenBeginBranch(ctx, tmp);   /* the branch is not closed in this function */
+}
+
+static void
+initKernelVarNames(KernelVarNames *kvars,  KernelExtraFlags kflags)
+{
+    kvars->A = "A";
+    kvars->B = "B";
+    if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) {
+        kvars->coordA = "coordA.x";   /* .x when the check is true for A */
+    }
+    else {
+        kvars->coordA = "coordA.y";
+    }
+    if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B)) {
+        kvars->coordB = "coordB.x";   /* same rule applied to B */
+    }
+    else {
+        kvars->coordB = "coordB.y";
+    }
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "origM";   /* the K size is named origM, not K */
+}
+
+static ssize_t
+generator(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    char tmp[2048];
+    char blkmul[128];
+    char updateResFn[FUNC_NAME_MAXLEN];
+    char updateResGenericFn[FUNC_NAME_MAXLEN];
+    CopyBufFuncs copyFuncs;
+    ZeroFuncs zeroFuncs;
+    DataType dtype = kextra->dtype;
+    ssize_t ret;
+    BlasGenSettings gset;
+    BlkMulOpts mulOpts;
+    size_t pitchAB, pitchC;
+    bool b;
+    KernelExtraFlags kflags = kextra->flags;
+    const char *outTypeName;
+    unsigned int nrRegs;
+    bool useLocalC;
+    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+    int tra, trb;
+    unsigned int l1Pans;
+    char vect[2] = {'y', 'x'};
+    /* only a work group dimension of 1 is accepted; otherwise -EINVAL */
+    if (pgran->wgDim != 1) {
+        return -EINVAL;
+    }
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    /* Code that updates a block of matrix B using local registers or uses
+     * mad's doesn't work on some GPUs. As a workaround, use a buffer in
+     * local memory for unaligned matrix sizes */
+    useLocalC = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) != 0;
+
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.pgran = pgran;
+    gset.kextra = kextra;
+
+    initKernelVarNames(&gset.varNames, kflags);
+
+    // at first, generate needed declarations and auxiliary functions
+
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRMM, &gset,
+                         BCHF_MATRIX_A | BCHF_MATRIX_B | BCHF_WRITE_OUTPUT);
+    if (useLocalC) {
+        generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
+                ZF_MATRIX_A | ZF_MATRIX_B | ZF_MATRIX_C);
+    }
+    else {
+        generateUpresFuncs(ctx, CLBLAS_TRMM, &gset, updateResFn,
+                           updateResGenericFn);
+        generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
+                ZF_MATRIX_A | ZF_MATRIX_B);
+    }
+    kgenAddBlankLine(ctx);
+
+    // block multiplication function
+    mulOpts.aMobj = CLMEM_BUFFER;
+    mulOpts.bMobj = CLMEM_BUFFER;
+
+    if (useLocalC) {
+        mulOpts.flags = BLKMUL_SKEW_COLUMN;
+    }
+    else {
+        mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_COLUMN;
+    }
+    // BLKMUL_MAD doesn't work here on all cards so use SEPARATE_MULADD always
+    // as a workaround
+    mulOpts.core = BLKMUL_SEPARATE_MULADD;
+    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
+    if (ret) {
+        destroyKgenContext(ctx);
+        return -EOVERFLOW;
+    }
+
+    kgenAddBlankLine(ctx);
+    kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);
+
+    // now, generate the kernel
+    declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, NULL, false,
+                      false);
+    ret = kgenBeginFuncBody(ctx);
+
+    /*
+     * Calculate local buffer pitches, and then insert the
+     * preparative code
+     */
+    pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
+    pitchC = matrBlockPitch(subdims, MATRIX_C, dtype, clblasLeft);
+
+    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
+    declareLdsBasedTrxmVariables(ctx, dtype, subdims, pgran, useLocalC);
+
+    /*
+     * The B matrix is divided into panels; each work group
+     * multiplies one such panel by the whole matrix A.
+     */
+    sprintf(tmp, "currN = gid * %lu;\n", subdims->x);
+    kgenAddStmt(ctx, tmp);
+    genInitCurrM(ctx, subdims, kflags);
+    if (((kflags & (KEXTRA_SIDE_RIGHT | KEXTRA_STARTM_NOT_ZERO)) ==
+          KEXTRA_STARTM_NOT_ZERO) ||
+        ((kflags & (KEXTRA_SIDE_RIGHT | KEXTRA_STARTN_NOT_ZERO)) ==
+                   (KEXTRA_SIDE_RIGHT | KEXTRA_STARTN_NOT_ZERO))) {
+
+        kgenAddStmt(ctx, "A += lda * offsetM + offsetM;\n");
+    }
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        kgenAddStmt(ctx, "A += offA;\n");
+    }
+    genTrxmBMatrShift(ctx, kflags, false);
+
+    tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A);
+    trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B);
+    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
+
+    sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n"
+                 "coordB.%c = 0;\n\n",
+            vect[trb], l1Pans, subdims[1].x, vect[1 - trb]);
+    kgenAddStmt(ctx, tmp);
+
+    // loop over M
+    sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims->y);
+    kgenBeginBranch(ctx, tmp);
+
+    sprintf(tmp, "coordA.%c = currM + lid / %u * %lu;\n"
+                 "coordA.%c = 0;\n\n",
+            vect[tra], l1Pans, subdims[1].y, vect[1 - tra]);
+    kgenAddStmt(ctx, tmp);
+
+    if (useLocalC) {
+        genPrepareBlockC(ctx, &zeroFuncs);
+    }
+    else {
+        // zero work item C block
+        sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
+                     "    c[k0] = 0;\n"
+                     "}\n\n", nrRegs);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    /*
+     * In the first pass the part without triangle blocks is processed,
+     * and in the second one only triangle blocks are processed
+     */
+    genInternalLoopCtl(ctx, subdims, kflags);
+
+    genPrepareTrxmBlockA(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
+                         kflags, "M");
+    genPrepareTrxmBlockB(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
+                         kflags);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenAddBlankLine(ctx);
+
+    genTriangMatrBlock(ctx, subdims, dtype, kflags);
+
+    // and eventually multiply the blocks and update the matrix C block
+    if (useLocalC) {
+        sprintf(tmp, "%s(alpha, (LPtr)(tempA + (lid / %u * %lu) * %lu), \n"
+                "                    (LPtr)(tempB + (lid %% %u * %lu) * %lu),\n"
+                "                    (LPtr)(tempC + (lid / %u * %lu) * %lu + \n"
+                "                    (lid %% %u * %lu)), lid);\n",
+                blkmul, l1Pans, subdims[1].y, pitchAB,
+                l1Pans, subdims[1].x, pitchAB,
+                l1Pans, subdims[1].y, pitchC, l1Pans, subdims[1].x);
+    }
+    else {
+        sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), "
+                     "(LPtr)(tempB + (lid %% %u * %lu) * %lu), c, lid);\n",
+                   blkmul, l1Pans, subdims[1].y, pitchAB, l1Pans,
+                   subdims[1].x, pitchAB);
+    }
+    kgenAddStmt(ctx, tmp);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    genInternalLoopEnd(ctx);                             // loop over K
+    kgenAddBlankLine(ctx);
+
+    // write back the block now that it is evaluated
+    if (useLocalC) {
+        genWriteBlockB(ctx, subdims, dtype, &copyFuncs, kflags);
+        kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
+    }
+    else {   // useLocalC is false, so the two tail checks below never fire
+        if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) {
+            sprintf(tmp, "if ((coordA.%c < M) && (coordB.%c < N))",
+                    vect[tra], vect[trb]);
+            kgenBeginBranch(ctx, tmp);
+        }
+
+        generateResultUpdateOld(ctx, CLBLAS_TRMM, &gset, updateResFn,
+                             updateResGenericFn);
+        // tmp still holds the guard header here, so close the branch with NULL
+        if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) {
+            kgenEndBranch(ctx, NULL);
+        }
+    }
+
+    if (isMatrixUpper(kflags)) {
+        sprintf(tmp, "currM += %lu;\n", subdims[0].y);
+    }
+    else {
+        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    kgenEndBranch(ctx, NULL);                                 // loop over M
+
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
+    KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
+    int idx = 7;   /* first slot after the seven fixed arguments */
+    /* fixed slots 0..6: M, N, alpha, A, lda, B, ldb */
+    initSizeKarg(&args[0], blasArgs->M);
+    initSizeKarg(&args[1], blasArgs->N);
+    assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
+    initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0);
+    initSizeKarg(&args[4], blasArgs->lda.matrix);
+    initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0);
+    initSizeKarg(&args[6], blasArgs->ldb.matrix);
+    if (kflags & KEXTRA_STARTM_NOT_ZERO) {   /* optional slots, in this order */
+        initSizeKarg(&args[idx++], blasArgs->offsetM);
+    }
+    if (kflags & KEXTRA_STARTN_NOT_ZERO) {
+        initSizeKarg(&args[idx++], blasArgs->offsetN);
+    }
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        initSizeKarg(&args[idx++], blasArgs->offA);
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        initSizeKarg(&args[idx++], blasArgs->offBX);
+    }
+}
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
+    cl_ulong size;
+    /* add up the A, B and C block sizes reported by matrBlockSize() */
+    size = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
+    size += matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
+    size += matrBlockSize(dim, MATRIX_C, dtype, kargs->side);
+    /* the sum is scaled by dtypeSize(dtype) before the ldsSize comparison */
+    return (size * dtypeSize(dtype) <= ldsSize);
+}
+
+static SolverFlags
+solverFlags(void)   /* returns the SolverFlags value of this solver */
+{
+    return ((unsigned int)SF_WSPACE_1D);   /* SF_WSPACE_1D is the only flag */
+}
+
+void
+initTrmmLdsPattern(MemoryPattern *mempat)   /* fills *mempat in place */
+{
+    mempat->name = "LDS based block trmm";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &solverOps;
+    /* A and B: LDS memory level only, both held in buffer objects */
+    mpatExtra.aMset = CLMEM_LEVEL_LDS;
+    mpatExtra.bMset = CLMEM_LEVEL_LDS;
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mempat->extra = &mpatExtra;   /* points at the file-scope static object */
+}
+
+static int
+getPerf( unsigned int kflags,
+    const void *args)   /* neither argument changes the returned value */
+{
+    DUMMY_ARG_USAGE(kflags);
+    DUMMY_ARG_USAGE(args);
+    /* unconditional result: PPERF_POOR for every kflags/args combination */
+    return PPERF_POOR;
+}
diff --git a/src/library/blas/gens/legacy/trsm_cached_lds.c b/src/library/blas/gens/legacy/trsm_cached_lds.c
new file mode 100644
index 0000000..828da2a
--- /dev/null
+++ b/src/library/blas/gens/legacy/trsm_cached_lds.c
@@ -0,0 +1,1005 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * TRSM generator with support of cached reads from the global memory
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <matrix_props.h>
+#include <matrix_dims.h>
+
+#include "../blas_kgen.h"
+#include "../trxm_common.h"
+#include "trsm_kgen_legacy.h"
+#include "gen_helper_legacy.h"
+#include "../trsm_kgen.h"
+
+static const char *readSquareBlock =
+    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
+    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // just read with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"           // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
+    "}\n\n";
+
+static const char *readSquareBlockOpt =
+    // just read with an optimized function
+    "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";
+
+static const char *readSquareBlockTrans =
+    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
+    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // read and transpose with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"           // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    // read and transpose with slow function
+    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
+    "}\n\n";
+
+static const char *readSquareBlockTransOpt =
+    // read and transpose with an optimized function
+    "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";
+
+static CLBLASMpatExtra mpatExtra;
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static SolverFlags
+solverFlags(void);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+static SolverOps trsmSops = {
+    generator,
+    assignKargs,
+    isFitToLDS,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    solverFlags,
+    fixupArgs,
+    NULL, //getDefaultDecomp
+   	NULL, // getDecompList
+   	NULL,
+   	NULL
+};
+
+static TileMulFlags
+getCyclicFlags(
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags,
+    bool tailPass,
+    unsigned int vecLen)
+{
+    TileMulFlags mflags = TILEMUL_NO_FLAGS;
+    /* cyclic A: only on a tail pass when isMatrixUpper() is false */
+    if (tailPass && !isMatrixUpper(kflags)) {
+        mflags |= TILEMUL_GLOBAL_CYCLIC_A;
+    }
+    /* cyclic B: needs all three of the conditions checked below */
+    if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B) &&
+        (kflags & KEXTRA_TAILS_N) && (dim->x > vecLen)) {
+
+        mflags |= TILEMUL_GLOBAL_CYCLIC_B;
+    }
+
+    return mflags;
+}
+
+static void
+initTiles(BlasGenSettings *gset)
+{
+    unsigned int nrRows, nrCols;
+    unsigned int vecLen;
+    const SubproblemDim *dim = &gset->subdims[1];
+    const CLBLASKernExtra *kextra = gset->kextra;
+    DataType dtype = kextra->dtype;
+    bool tra;
+    /* sets up gset->tileA, tileBX and tileCY from subdims[1] and kextra */
+    // the tile A should be able to fit rectangular and square tiles
+    nrCols = (unsigned int)szmax(dim->y, dim->bwidth);
+    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_A);
+    vecLen = getVecLen(gset, CLBLAS_TRSM, MATRIX_A);
+    initTile(&gset->tileA, "a", (unsigned int)dim->y, nrCols, vecLen,
+             dtype, PRIV_STORAGE_ARRAY, tra, false);
+
+    /*
+     * tile B should be able to fit tiles of the matrix B and of the
+     * intermediate result. That result will be always transposed
+     * from the point of view of tile multiplication
+     */
+    tra = !isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B);
+    if (tra) {
+        nrRows = (unsigned int)szmax(dim->bwidth, dim->y);
+        nrCols = (unsigned int)dim->x;
+    }
+    else {
+        nrRows = (unsigned int)szmax(dim->bwidth, dim->x);
+        nrCols = (unsigned int)szmax(dim->x, dim->y);
+    }
+    vecLen = getVecLen(gset, CLBLAS_TRSM, MATRIX_B);
+    initTile(&gset->tileBX, "b", nrRows, nrCols, vecLen, dtype,
+             PRIV_STORAGE_ARRAY, tra, false);
+    /* tile C is dim->y by dim->x and reuses the vecLen obtained for B */
+    initTile(&gset->tileCY, "c", (unsigned int)dim->y, (unsigned int)dim->x,
+             vecLen, dtype, PRIV_STORAGE_ARRAY, false, false);
+}
+
+static void
+prepareTilesForMainLoop(BlasGenSettings *gset)
+{
+    const SubproblemDim *dim = &gset->subdims[1];
+    /* tile A gets bwidth columns; tile BX becomes bwidth rows by dim->x */
+    gset->tileA.nrCols = (unsigned int)dim->bwidth;
+    gset->tileBX.nrRows = (unsigned int)dim->bwidth;
+    gset->tileBX.nrCols = (unsigned int)dim->x;
+}
+
+static void
+declareLocalVariables(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset)
+{
+    char tmp[1024];
+    const char *elemType;
+    const SubproblemDim *dims = gset->subdims;
+    DataType dtype = gset->kextra->dtype;
+    size_t pitchAC, heightC;
+    /* tempA and tempC below are both sized with the MATRIX_C block pitch */
+    elemType = dtypeBuiltinType(dtype);
+    pitchAC = matrBlockPitch(dims, MATRIX_C, dtype, clblasRight);
+    heightC = szmax(dims[0].y, dims[0].x);
+    /* tile storages first, then the kernel's local variable declarations */
+    declareTileStorages(ctx, gset);
+    sprintf(tmp, "const int lid = get_local_id(0);\n"
+                 "const int gid = get_group_id(0);\n"
+                 "const uint2 skewRow = 0, skewCol = 0;\n\n"
+                 "GPtr uA, uB;\n"
+                 "uint coordA, coordB, k;\n"
+                 "uint x, y;\n"
+                 "__local %s tempA[%lu], tempC[%lu];\n"
+                 "LPtr utmpA, utmpC;\n"
+                 "uint m0 = 0, k0, currM, currN;\n",
+            elemType, pitchAC * dims[0].y, pitchAC * heightC);
+    kgenAddStmt(ctx, tmp);
+}
+
+static void
+genReadDiagBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags kflags,
+    char c)   /* c fills the temp%c placeholder of the chosen template */
+{
+    char tmp[1024];
+    size_t pitch;
+    const char *readBlock;
+    bool tra;
+    /* the template is picked by tra and by the KEXTRA_TAILS_M flag */
+    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
+    pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
+
+    if (!(kflags & KEXTRA_TAILS_M)) {   /* no M tails: single optimized read */
+        readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt;
+        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c);
+    }
+    else {   /* M tails: template with the zeroing + generic read fallback */
+        readBlock = (tra) ? readSquareBlockTrans : readSquareBlock;
+        sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth,
+                dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c,
+                zeroFuncs->names[MATRIX_A], c,
+                copyFuncs->readGeneric[MATRIX_A], c, pitch);
+    }
+    kgenAddStmt(ctx, tmp);
+}
+
+static void
+genZeroResult(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const SubproblemDim *dims,
+    unsigned int vecLen)
+{
+    unsigned int n;
+    char tmp[1024];
+    /* n is filled in by getResultGPRsInfo() and bounds the loop below */
+    getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL);
+    /* emit a loop assigning 0 to c[0] .. c[n - 1] */
+    sprintf(tmp, "for (x = 0; x < %u; x++) {\n"
+                 "    c[x] = 0;\n"
+                 "}\n\n", n);
+
+    kgenAddStmt(ctx, tmp);
+}
+
+static void
+genInternalLoopCtl(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024];
+    /* build the header of the k0 loop and open it as a branch */
+    if (isMatrixUpper(kflags)) {   /* upper: start at currM + dim[0].bwidth */
+        if (kflags & KEXTRA_TAILS_M) {   /* stop at M rounded down to bwidth */
+            sprintf(tmp, "for (k0 = currM + %lu; k0 < M / %lu * %lu; "
+                               "k0 += %lu)",
+                    dim[0].bwidth, dim[1].bwidth, dim[1].bwidth, dim[1].bwidth);
+        }
+        else {
+            sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)",
+                    dim[0].bwidth, dim[1].bwidth);
+        }
+    }
+    else {   /* not upper: from 0 up to currM in steps of dim[1].bwidth */
+        sprintf(tmp, "for (k0 = 0; k0 < currM; k0 += %lu)",
+                dim[1].bwidth);
+    }
+
+    kgenBeginBranch(ctx, tmp);   /* the branch is not closed in this function */
+}
+
+static void
+genInitCurrM(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024];
+    /* emit the initial value of currM; the direction depends on kflags */
+    if (isMatrixUpper(kflags)) {
+        /* start from the last block: (M - 1) rounded down to dim->y */
+        sprintf(tmp, "currM = ((M - 1) / %lu) * %lu;\n", dim->y, dim->y);
+        kgenAddStmt(ctx, tmp);
+    }
+    else {   /* otherwise start from offset 0 */
+        kgenAddStmt(ctx, "currM = 0;\n");
+    }
+}
+
+static void
+initKernelVarNames(KernelVarNames *kvars)   /* fills in fixed kernel names */
+{
+    kvars->A = "uA";
+    kvars->B = "uB";
+    kvars->coordA = "coordA";
+    kvars->coordB = "coordB";
+    kvars->k = "k";
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "M";   /* the K size uses the same variable as M */
+    kvars->lda = "lda";
+    kvars->ldb = "ldb";
+}
+
+/*
+ * Generate code copying a tile between LDS and a private location.
+ */
+static void
+genLdsCopy(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset)
+{
+    char pitchStr[32];   /* fits the 20 digits of a 64-bit value plus NUL */
+    char coordY[128], coordX[128];
+    size_t pitch;
+    UpresVarNames uvars;
+    UpdateResultFlags upFlags = UPRES_INLINE | UPRES_USE_LDS |
+                                UPRES_WITHOUT_ALPHA | UPRES_COLUMN_MAJOR;
+    const SubproblemDim *dims = gset->subdims;
+    unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
+
+    memset(&uvars, 0, sizeof(uvars));
+    /* the leading dimension and start coordinates are passed as text */
+    pitch = matrBlockPitch(dims, MATRIX_C, gset->kextra->dtype, clblasRight);
+    sprintf(pitchStr, "%lu", pitch);
+    sprintf(coordY, "lid / %u * %lu", l1Pans, dims[1].y);
+    sprintf(coordX, "lid %% %u * %lu", l1Pans, dims[1].x);
+    uvars.result = "tempC";
+    uvars.ld = pitchStr;
+    uvars.startRow = coordY;
+    uvars.startCol = coordX;
+    uvars.nrRows = NULL;
+    uvars.nrCols = NULL;
+    /* the generated update is wrapped in an anonymous branch */
+    kgenBeginBranch(ctx, NULL);
+
+    updateResultGen(ctx,
+        gset,
+        CLBLAS_TRSM,
+        UPRES_SET,
+        upFlags,
+        &uvars);
+
+    kgenEndBranch(ctx, NULL);
+
+    kgenAddBlankLine(ctx);
+}
+
+static void
+genZeroResultTrash(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    const CLBLASKernExtra *kextra)
+{
+    char tmp[1024];
+    unsigned int vecLen, pitch;
+    unsigned int i;
+    /* complex types use vecLen 1; pitch is dim->x rounded up to vecLen */
+    vecLen = (isComplexType(kextra->dtype)) ? 1 : kextra->vecLen;
+    pitch = (unsigned int)roundUp(dim->x, vecLen);
+    sprintf(tmp, "if (coordA + %lu > M)", dim->y);
+    kgenBeginBranch(ctx, tmp);
+    sprintf(tmp, "int i = (coordA >= M) ? %lu : (%lu - M %% %lu);\n\n",
+            dim->y, dim->y, dim->y);
+    kgenAddStmt(ctx, tmp);
+    sprintf(tmp, "for (; i > 0; i--)");
+    kgenBeginBranch(ctx, tmp);
+    /* one statement per vector of row (dim->y - i), each clearing c[...] */
+    for (i = 0; i < pitch / vecLen; i++) {
+        sprintf(tmp, "c[(%lu - i) * %u + %u] = 0;\n",
+                dim->y, pitch / vecLen, i);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    kgenEndBranch(ctx, NULL);   /* closes the generated for loop */
+    kgenEndBranch(ctx, NULL);   /* closes the generated if */
+}
+
+static void
+setupVdepUpresFlags(KernelExtraFlags kflags, UpdateResultFlags* upFlags)
+{
+    bool forceBug = false;
+    /* flag sets for the exact comparisons below; 3 and 6 are only bases */
+    unsigned int bugFlag1 = KEXTRA_NO_COPY_VEC_A
+                          | KEXTRA_TAILS_K
+                          | KEXTRA_TAILS_M;
+    unsigned int bugFlag2 = bugFlag1
+                          | KEXTRA_UPPER_TRIANG
+                          | KEXTRA_TRANS_A;
+    unsigned int bugFlag3 = bugFlag1
+                          | KEXTRA_SIDE_RIGHT
+                          | KEXTRA_COLUMN_MAJOR;
+    unsigned int bugFlag4 = bugFlag3
+                          | KEXTRA_TRANS_A;
+    unsigned int bugFlag5 = bugFlag3
+                          | KEXTRA_UPPER_TRIANG;
+    unsigned int bugFlag6 = KEXTRA_NO_COPY_VEC_A
+                          | KEXTRA_NO_COPY_VEC_B
+                          | KEXTRA_NO_COPY_VEC_C
+                          | KEXTRA_TAILS_K
+                          | KEXTRA_TAILS_M;
+    unsigned int bugFlag7 = bugFlag6
+                          | KEXTRA_COLUMN_MAJOR;
+    unsigned int bugFlag8 = bugFlag6
+                          | KEXTRA_SIDE_RIGHT
+                          | KEXTRA_UPPER_TRIANG;
+    unsigned int bugFlag9 = bugFlag6
+                          | KEXTRA_UPPER_TRIANG
+                          | KEXTRA_TRANS_A
+                          | KEXTRA_TAILS_N;
+    unsigned int bugFlag10 = bugFlag7
+                           | KEXTRA_SIDE_RIGHT
+                           | KEXTRA_TRANS_A
+                           | KEXTRA_TAILS_N;
+    unsigned int bugFlag11 = bugFlag9
+                           | KEXTRA_UNIT_DIAGONAL;
+    unsigned int bugFlag12 = bugFlag6
+                           | KEXTRA_TAILS_N
+                           | KEXTRA_SIDE_RIGHT
+                           | KEXTRA_UNIT_DIAGONAL
+                           | KEXTRA_COLUMN_MAJOR
+                           | KEXTRA_TRANS_A;
+
+    /*
+     * WORKAROUND for AMD GPU: For now, we avoid optimizing the case when
+     *                         matrix B is not divisible by the block size,
+     *                         since it leads to a hang in code that seems
+     *                         correct.
+     */
+    if (kflags & KEXTRA_VENDOR_AMD) {
+        forceBug = (kflags & KEXTRA_TAILS_N) != 0;
+    }
+    else {   /* forceBug stays false only for an exact match of a listed set */
+        forceBug = (kflags != bugFlag1
+            && kflags != bugFlag2 && kflags != bugFlag4 &&  kflags != bugFlag5
+            && kflags != bugFlag7 && kflags != bugFlag8 &&  kflags != bugFlag9
+            && kflags != bugFlag10 && kflags != bugFlag11
+            && kflags != bugFlag12);
+    }
+    /* the flag is only ever added here; *upFlags is never cleared */
+    if (!forceBug) {
+        *upFlags |= UPRES_INDEXING_WITH_CONSTANTS;
+    }
+}
+
+static void
+genSetupCoordinates(
+    struct KgenContext *ctx,
+    const SubproblemDim *dims,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024];
+    unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
+    /* emits the assignments of coordA and of k, in that order */
+    sprintf(tmp, "coordA = currM + lid / %u * %lu;\n", l1Pans, dims[1].y);
+    kgenAddStmt(ctx, tmp);
+    if (isMatrixUpper(kflags)) {   /* upper: k starts at currM + dims[0].y */
+        sprintf(tmp, "k = currM + %lu;\n", dims[0].y);
+    }
+    else {   /* otherwise k starts at 0 */
+        strcpy(tmp, "k = 0;\n");
+    }
+    kgenAddStmt(ctx, tmp);
+}
+
+static void
+genInvertDiagBlock(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const ZeroFuncs *zeroFuncs)
+{
+    char tmp[1024];
+    const CLBLASKernExtra *kextra = gset->kextra;
+    const SubproblemDim *subdims = gset->subdims;
+    size_t pitchA;
+    /* emits: zero tempA, optional unit diagonal in tempC, then invert() */
+    pitchA = matrBlockPitch(subdims, MATRIX_A, kextra->dtype, clblasLeft);
+    /* first clear tempA with the MATRIX_A zeroing function */
+    sprintf(tmp, "%s((__local float4*)tempA);\n", zeroFuncs->names[MATRIX_A]);
+    kgenAddStmt(ctx, tmp);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    /* NOTE(review): tempC is indexed with the MATRIX_A pitch here - confirm */
+    if (kextra->flags & KEXTRA_UNIT_DIAGONAL) {
+        sprintf(tmp, "if (lid < %lu) {\n"
+                     "    tempC[lid * %lu + lid] = %s;\n"
+                     "}\n",
+                subdims[0].bwidth, pitchA, strOne(kextra->dtype));
+        kgenAddStmt(ctx, tmp);
+        kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+        kgenAddBlankLine(ctx);
+    }
+    /* only work items with lid below subdims[0].y call invert() */
+    sprintf(tmp, "if (lid < %lu)", subdims[0].y);
+    kgenBeginBranch(ctx, tmp);
+    sprintf(tmp, "invert(tempC, tempA, lid, (currM + %lu > M) ? "
+                         "M - currM : %lu);\n",
+            subdims[0].y, subdims[0].y);
+    kgenAddStmt(ctx, tmp);
+    kgenEndBranch(ctx, NULL);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenAddBlankLine(ctx);
+}
+
+static void
+genMulOnDiagBlock(
+    struct KgenContext *ctx,
+    BlasGenSettings *gset,
+    const TileMulOpts *mulOpts)
+{
+    char tmp[1024];
+    const SubproblemDim *dims = gset->subdims;
+    const CLBLASKernExtra *kextra = gset->kextra;
+    unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
+    TileMulOpts optsNew;
+    size_t pitchAC;
+    const char *ptrName;
+    Tile *tile;
+    BlasGenSettings gsetNew;
+    /* works on copies (optsNew, gsetNew); *mulOpts itself is not modified */
+    pitchAC = matrBlockPitch(dims, MATRIX_C, kextra->dtype, clblasRight);
+    ptrName = dtypeUPtrField(kextra->dtype);
+    /* both operands are read from local memory, with B marked transposed */
+    memcpy(&optsNew, mulOpts, sizeof(optsNew));
+    optsNew.memA = CLMEM_LOCAL_MEMORY;
+    optsNew.memB = CLMEM_LOCAL_MEMORY;
+    optsNew.flags &= ~(TILEMUL_TRA | TILEMUL_GLOBAL_CYCLIC | TILEMUL_CONJA);
+    optsNew.flags |= TILEMUL_TRB;
+    optsNew.memA = CLMEM_LOCAL_MEMORY;   /* repeats the assignment above */
+    optsNew.memB = CLMEM_LOCAL_MEMORY;   /* repeats the assignment above */
+    gset->varNames.A = "utmpA";   /* temporary; reset to "uA" at the end */
+    gset->varNames.B = "utmpC";   /* temporary; reset to "uB" at the end */
+
+    sprintf(tmp, "utmpA.%s = tempA + lid / %u * %lu;\n"
+                 "utmpC.%s = tempC + lid %% %u * %lu;\n\n",
+            ptrName, l1Pans, pitchAC * dims[1].y,
+            ptrName, l1Pans, pitchAC * dims[1].x);
+    kgenAddStmt(ctx, tmp);
+    /* gsetNew is copied after the names above were switched to utmpA/utmpC */
+    memcpy(&gsetNew, gset, sizeof(gsetNew));
+    gsetNew.subdims[1].bwidth = dims[1].y;
+
+    // Configure the tile descriptors to deal with tile of needed sizes.
+    tile = &gsetNew.tileA;
+    tile->nrRows = (unsigned int)dims[1].y;
+    tile->nrCols = (unsigned int)dims[1].y;
+    tile->trans = false;
+    tile = &gsetNew.tileBX;
+    tile->nrRows = (unsigned int)dims[1].y;
+    tile->nrCols = (unsigned int)dims[1].x;
+    tile->trans = true;
+    tileMulGen(ctx, &gsetNew, &optsNew);
+
+    gset->varNames.A = "uA";
+    gset->varNames.B = "uB";
+}
+/* Emit one TRSM pass: the loop over full blocks of M, or the tail block. */
+static void
+genOneTrsmPass(
+    struct KgenContext *ctx,
+    BlasGenSettings *gset,              // its kextra is swapped temporarily
+    const char *updateResFnRev,         // both names are handed over to
+    const char *updateResGenericFnRev,  //   genUpdateIntermTrsmResult()
+    CopyBufFuncs *copyFuncs,
+    ZeroFuncs *zeroFuncs,
+    bool isTailPass)                    // true: no loop over M is opened
+{
+    const CLBLASKernExtra *kextra = gset->kextra;
+    CLBLASKernExtra kextraTmp;
+    KernelExtraFlags kflags = kextra->flags;
+    char tmp[1024];
+    DataType dtype = kextra->dtype;
+    unsigned int vecLen = gset->kextra->vecLen;
+    SubproblemDim *subdims = gset->subdims;
+    int tra, trb;
+    UpdateResultFlags upFlags;
+    TilePostFetchPrivate pfpriv;
+    TileMulOpts mulOpts;
+    TailFetch tf;
+    TailStatus tailStatus = 0;
+    const unsigned long blockM = (unsigned long)subdims->y;   // "%lu" operand
+
+    memset(&pfpriv, 0, sizeof(pfpriv));
+    // multiply options; zeroed first so that no field is left indeterminate
+    memset(&mulOpts, 0, sizeof(mulOpts));
+    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
+    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
+    mulOpts.core = TILEMUL_MAD;//TILEMUL_MULADD;
+    mulOpts.postFetch = NULL;
+    mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags);
+    mulOpts.flags |= TILEMUL_EXTERN_RDECL;
+    mulOpts.flags |= getCyclicFlags(subdims, kflags, isTailPass, vecLen);
+
+    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
+    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
+
+    tf = checkForTailFetches(CLBLAS_TRSM, &subdims[1], kextra, MATRIX_B,
+                             false, false);
+    if (trb) {
+        tf &= ~FETCH_TAIL_COL;
+    }
+
+    /*
+     * For a lower triangular matrix we proceed up to the diagonal, so we
+     * can't exceed the matrix bound and zeroing is not needed
+     */
+    if (isMatrixUpper(kflags)) {
+        tf |= checkForTailFetches(CLBLAS_TRSM, &subdims[1], kextra,
+                                  MATRIX_A, false, false);
+        if (tra && trb) {
+            tf &= ~FETCH_TAIL_COL;
+        }
+    }
+
+    if (tf != FETCH_NO_TAILS) {
+        pfpriv.funcID = CLBLAS_TRSM;
+        pfpriv.gset = gset;
+    }
+
+    // loop over M
+    if (!isTailPass) {
+        sprintf(tmp, "for (m0 = 0; m0 < M / %lu * %lu; m0 += %lu)",
+                blockM, blockM, blockM);
+        kgenBeginBranch(ctx, tmp);
+    }
+
+    genSetupCoordinates(ctx, subdims, kflags);
+    genZeroResult(ctx, dtype, subdims, vecLen);
+
+    if (!isMatrixUpper(kflags) && isTailPass) {
+        // skip the update loop if the matrix consists of a single block
+        sprintf(tmp, "if (M > %lu)", blockM);
+        kgenBeginBranch(ctx, tmp);
+    }
+
+    // Avoid tail adjusting along M.
+
+    memcpy(&kextraTmp, kextra, sizeof(kextraTmp));
+    kextraTmp.flags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER);
+
+    // update loop is not needed for tail of an upper triangular matrix
+    if (!(isTailPass && isMatrixUpper(kflags))) {
+        if (isTailPass || (kflags & KEXTRA_TAILS_N)) {
+            kgenBeginBranch(ctx, "if (coordB < N)");
+        }
+
+        gset->kextra = &kextraTmp;
+        tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRSM, gset, NULL);
+        gset->kextra = kextra;
+
+        genInternalLoopCtl(ctx, subdims, kflags);           // loop over K
+
+        // multiplication for the step-by-step block updating
+        subdims[0].bwidth = subdims[1].bwidth;
+        tileMulGen(ctx, gset, &mulOpts);
+        subdims[0].bwidth = subdims[0].y;
+
+        genInternalLoopEnd(ctx);                             // loop over K
+        kgenAddBlankLine(ctx);
+
+        // invoke once again, in order to process tails along K
+        if (isMatrixUpper(kflags) && (tf != FETCH_NO_TAILS)) {
+            // the post-fetch hook is installed for this invocation only
+
+            if (!(tra && trb)) {
+                mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL;
+            }
+            mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
+
+            mulOpts.postFetchPriv = &pfpriv;
+            mulOpts.postFetch = defaultTilePostFetch;
+
+            subdims[0].bwidth = subdims[1].bwidth;
+            tileMulGen(ctx, gset, &mulOpts);
+            subdims[0].bwidth = subdims[0].y;
+
+            mulOpts.postFetch = NULL;
+            mulOpts.postFetchPriv = NULL;
+        }
+
+        gset->kextra = &kextraTmp;
+        checkGenRestoreTailCoords(ctx, gset, tailStatus);
+        gset->kextra = kextra;
+
+        if (isTailPass || (kflags & KEXTRA_TAILS_N)) {
+            kgenEndBranch(ctx, NULL);
+        }
+    }
+    else if (!trb && (kflags & KEXTRA_TAILS_N)) {
+        tailStatus |= TAIL_B_RAISED;
+    }
+
+    mulOpts.flags &= ~(TILEMUL_WRAP_AROUND_TAIL | TILEMUL_GLOBAL_CYCLIC_A |
+                       TILEMUL_GLOBAL_CYCLIC_K);
+
+    if (!isMatrixUpper(kflags) && isTailPass) {
+        /*
+         * end of branch for non single block tail processing of
+         * the lower triangular matrix
+         */
+        kgenEndBranch(ctx, NULL);
+    }
+
+    /*
+     * Final phase: update the accumulated result, multiply on an inverted
+     *              block and write back the result
+     */
+    if (isMatrixUpper(kflags) || ((kflags & KEXTRA_VENDOR_AMD) != 0)) {
+        kgenAddStmt(ctx, "k0 = currM;\n");
+    }
+    else {
+        kgenAddStmt(ctx, "k0 = m0;\n");
+    }
+
+    genReadDiagBlock(ctx, subdims, dtype, copyFuncs, zeroFuncs,
+                     kflags, 'C');
+    genInvertDiagBlock(ctx, gset, zeroFuncs);
+
+    // Avoid generating a non optimal path that is never executed
+    gset->kextra = &kextraTmp;
+    if (isTailPass) {
+        kextraTmp.flags |= (KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER);
+    }
+    genUpdateIntermTrsmResult(ctx, gset, updateResFnRev,
+                              updateResGenericFnRev, true);
+    gset->kextra = kextra;
+
+    /*
+     * Heap to LDS.
+     * Zero the unused part along columns since it would have an influence
+     * on the result at multiplication on an inverted block
+     */
+    if (isTailPass) {
+        genZeroResultTrash(ctx, &subdims[1], kextra);
+    }
+    genLdsCopy(ctx, gset);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    genZeroResult(ctx, dtype, subdims, vecLen);
+
+    genMulOnDiagBlock(ctx, gset, &mulOpts);
+
+    // write back the tile evaluated
+    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
+    upFlags |= tailStatusToUpresFlags(tailStatus);
+    upFlags |= UPRES_EXCEED_PROBLEM_CONDITION;
+    setupVdepUpresFlags(kflags, &upFlags);
+
+    gset->kextra = &kextraTmp;
+
+    genResultUpdateWithFlags(ctx, CLBLAS_TRSM, gset, upFlags,
+                             NULL, NULL, NULL);
+    gset->kextra = kextra;
+
+    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
+
+    if (isMatrixUpper(kflags)) {
+        sprintf(tmp, "currM -= %lu;\n", blockM);
+    }
+    else {
+        sprintf(tmp, "currM += %lu;\n", blockM);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    if (!isTailPass) {
+        kgenEndBranch(ctx, NULL);                       // loop over M
+    }
+}
+/* Generate the "Cached" TRSM kernel source; returns its size or a -E* code. */
+static ssize_t
+generator(
+   char *buf,                               // handed to createKgenContext()
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)                             // CLBLASKernExtra; flags are updated
+{
+    char tmp[1024];
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    DataType dtype = kextra->dtype;
+    BlasGenSettings gset;
+    char updateResFnRev[FUNC_NAME_MAXLEN];
+    char updateResGenericFnRev[FUNC_NAME_MAXLEN];
+    CopyBufFuncs copyFuncs;
+    ZeroFuncs zeroFuncs;
+    UpdateResultFlags upFlags;
+    const char *ptrName;
+    bool b;
+    ssize_t ret;
+    unsigned int l1Pans = (unsigned int)(subdims[0].x / subdims[1].x);
+    bool tailMarker[2] = {false, true};
+    int triang;
+    int i;
+    // only a work group dimension of 1 is accepted
+    if (pgran->wgDim != 1) {
+        return -EINVAL;
+    }
+    // every TAILS_x flag also sets its _LOWER twin; result is stored in extra
+    if (kflags & KEXTRA_TAILS_M) {
+        kflags |= KEXTRA_TAILS_M_LOWER;
+    }
+    if (kflags & KEXTRA_TAILS_N) {
+        kflags |= KEXTRA_TAILS_N_LOWER;
+    }
+    if (kflags & KEXTRA_TAILS_K) {
+        kflags |= KEXTRA_TAILS_K_LOWER;
+    }
+    kextra->flags = kflags;
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    triang = isMatrixUpper(kflags);     // used below as an index into tailMarker
+
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+
+    initKernelVarNames(&gset.varNames);
+
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+    if (isComplexType(dtype)) {
+        genComplexMathOperators(ctx, dtype);
+    }
+
+    /*
+     * For intermediate result after blocks modification.
+     * Take into account tails adjusting
+     */
+    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
+    upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
+
+    if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B) &&
+        (kflags & KEXTRA_TAILS_N)) {
+
+        upFlags |= UPRES_TAIL_COL;
+    }
+
+    setupVdepUpresFlags(kflags, &upFlags);
+    initTiles(&gset);
+    genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
+                           updateResGenericFnRev);
+
+    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_A);
+    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
+                         ZF_MATRIX_A);
+
+    //matrix inversion function
+    genInvertingBlockFunc(ctx, subdims[0].bwidth, dtype, kflags);
+    kgenAddBlankLine(ctx);
+
+    // now, generate the kernel
+    declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false,
+                      true);
+    ret = kgenBeginFuncBody(ctx);       // this value is overwritten further down
+
+    declareLocalVariables(ctx, &gset);
+    prepareTilesForMainLoop(&gset);
+
+    sprintf(tmp, "currN = gid * %lu;\n", subdims[0].x);
+    kgenAddStmt(ctx, tmp);
+    genInitCurrM(ctx, subdims, kflags);
+
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        kgenAddStmt(ctx, "A += offA;\n");
+    }
+    genTrxmBMatrShift(ctx, kflags, false);
+
+    ptrName = dtypeUPtrField(dtype);
+    sprintf(tmp, "uA.%s = A;\n"
+                 "uB.%s = B;\n\n",
+            ptrName, ptrName);
+    kgenAddStmt(ctx, tmp);
+
+    /*
+     * The B matrix is divided into panels; each work group
+     * multiplies such a panel by the whole matrix A.
+     */
+
+    sprintf(tmp, "coordB = gid * %lu + lid %% %u * %lu;\n",
+            subdims[0].x, l1Pans, subdims[1].x);
+    kgenAddStmt(ctx, tmp);
+    // up to two passes; a tail pass is emitted only with KEXTRA_TAILS_M
+    for (i = 0; i < 2; i++) {
+        b = (i) ? tailMarker[1 - triang] : tailMarker[triang]; // upper: tail first
+        if (!b || (kflags & KEXTRA_TAILS_M)) {
+            genOneTrsmPass(ctx, &gset, updateResFnRev, updateResGenericFnRev,
+                           &copyFuncs, &zeroFuncs, b);
+        }
+    }
+
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+    // on success the reported value is the source size plus one
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+    // any negative status is reported as -EOVERFLOW
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+/* Tell whether the two blocks accounted for below fit into ldsSize. */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *args = (const CLBlasKargs*)kernelArgs;
+    const cl_ulong blockA = matrBlockSize(dim, MATRIX_A, dtype, args->side);
+    const cl_ulong blockB = matrBlockSize(dim, MATRIX_B, dtype, args->side);
+    cl_ulong largest;
+
+    /*
+     * Two buffers are counted: one block of matrix A and
+     * a second one as large as the bigger of the block of A
+     * and the block requested for MATRIX_B
+     */
+    largest = (blockA > blockB) ? blockA : blockB;
+
+    /* the sum is scaled by dtypeSize() before it is compared with ldsSize */
+
+    return ((blockA + largest) * dtypeSize(dtype) <= ldsSize);
+
+}
+/* Report the flag set of this solver: 1D work space, square top level blocks. */
+static SolverFlags
+solverFlags(void)
+{
+    return SF_TOP_INPUT_SQUARE_BLOCKS | SF_WSPACE_1D;
+}
+/* Fill the kernel argument array from the BLAS call parameters. */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)params;
+    const KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
+    KernelArg *optional = args + 7;     /* first slot after the fixed arguments */
+    /* fixed part, in order: M, N, alpha, A, lda, B, ldb */
+    initSizeKarg(args + 0, kargs->M);
+    initSizeKarg(args + 1, kargs->N);
+    assignScalarKarg(args + 2, &(kargs->alpha), kargs->dtype);
+    initMemobjKarg(args + 3, kargs->A, NULL, 0, 0);
+    initSizeKarg(args + 4, kargs->lda.matrix);
+    initMemobjKarg(args + 5, kargs->B, NULL, 0, 0);
+    initSizeKarg(args + 6, kargs->ldb.matrix);
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {   /* offsets follow only if flagged */
+        initSizeKarg(optional++, kargs->offA);
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        initSizeKarg(optional++, kargs->offBX);
+    }
+}
+/* Adjust the kernel arguments with fixupTrxmKargs(); subdims/extra are unused. */
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)args;
+    (void)subdims;
+    (void)extra;
+    fixupTrxmKargs(kargs);
+}
+/* Fill in the memory pattern descriptor of the cached block TRSM solver. */
+void
+initTrsmCachedPattern(MemoryPattern *mempat)
+{
+    /* pattern specific data; mpatExtra is defined outside of this function */
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mempat->extra = &mpatExtra;
+    mempat->sops = &trsmSops;
+    mempat->thLevel = 0;
+    mempat->cuLevel = 0;
+    mempat->nrLevels = 2;
+    mempat->name = "Cached global memory based block trsm";
+}
diff --git a/src/library/blas/gens/legacy/trsm_img.c b/src/library/blas/gens/legacy/trsm_img.c
new file mode 100644
index 0000000..54127c7
--- /dev/null
+++ b/src/library/blas/gens/legacy/trsm_img.c
@@ -0,0 +1,1165 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Image based trsm generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+
+#include <matrix_dims.h>
+
+#include "blas_kgen_legacy.h"
+#include "gen_helper_legacy.h"
+#include "trsm_kgen_legacy.h"
+#include "../gen_helper.h"
+#include "../trsm_kgen.h"
+#include <dis_warning.h>
+
+static const char *trsmImDecl =     // printf template of the trsmIm kernel header
+    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
+    "void __kernel\n"
+    "%ctrsmIm(\n"                   // %c: presumably the type prefix - confirm
+    "    uint %c,\n"                // names of the two size arguments
+    "    uint %c,\n"
+    "    %s alpha,\n"
+    "    __read_only image2d_t A,\n"
+    "    __global %s *B,\n"
+    "    uint ldb,\n"
+    "    uint startRow,\n"
+    "    uint finishRow,\n"
+    "    uint offB)\n";
+
+/*
+ *  template of the declaration part placed at the start of the image
+ *  based trsm kernel, for a one dimensional work space
+ */
+static const char *trsmImPrep1D =
+    "uint m0, k0;\n"
+    "__local %s tempC[%lu];\n"              // element type and length of tempC
+    "%s c[%u];\n"                           // type and length of the result array
+    "const int lid = get_local_id(0);\n"
+    "const int skew = lid %% %lu;\n"
+    "%s"                                    // groups per Panel variable
+    "uint blockN;\n"
+    "uint x, y, imx, imy;\n"
+    "uint2 coordA, coordB;\n"
+    "\n"
+    "const uint currN = get_global_id(0) / %u * %lu;\n"       // group ID
+    "\n";
+/* Read a block of B into temp<c>; filled in by genPrepareRectBlock(). */
+static const char *readRectBlock =
+    "y = (currN + %lu <= N) ? %lu : N - currN;\n"             // clip at N
+    "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"   // clip at finishRow
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // just read with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"           // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n"
+    "}\n\n";
+/* variant without the size check, selected when the caller passes opt */
+static const char *readRectBlockOpt =
+    // just read with an optimized function
+    "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n";
+/* Same as readRectBlock, but B is read with its coordinates swapped (trb). */
+static const char *readRectBlockTrans =
+    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
+    "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // read and transpose with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"           // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    // read and transpose with slow function
+    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n"
+    "}\n\n";
+/* variant without the size check, selected when the caller passes opt */
+static const char *readRectBlockTransOpt =
+    // read and transpose with an optimized function
+    "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n";
+/* Forward declarations of the functions defined further down in this file. */
+static ssize_t
+wrapper(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra);
+
+static ssize_t
+generator(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra);
+
+static ssize_t
+prepGenerator(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *dims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+/* image related callback */
+static void
+imgPackMode(
+    const void *extra,
+    const SubproblemDim *dims,
+    int dataID,
+    unsigned int *packRate,
+    clblasOrder *packOrder);
+
+static SolverFlags
+solverFlags(void);
+/* Positional table of the solver callbacks; NULL marks a slot left unset. */
+static SolverOps solverOps = {
+    wrapper,
+    assignKargs,
+    isFitToLDS,
+    NULL,
+    NULL,
+    calcNrThreads,
+    imgPackMode,
+    solverFlags,
+    NULL, //fixupArgs
+    NULL, //getDefaultDecomp
+   	NULL, //getDecompList
+   	NULL,
+   	NULL
+};
+/* pattern specific extra data; no use of it is visible in this part of the file */
+static CLBLASMpatExtra mpatExtra;
+
+/* Prepare A kernel begin: templates used by prepGenerator() */
+
+static const char *trsmPrepDecl =       // header of the trsmPrepare kernel
+    "void __kernel\n"
+    "%ctrsmPrepare(\n"                  // %c: prefix from dtypeToBlasPrefix()
+    "    uint %c,\n"                    // name of the size argument ('M')
+    "    __global %s *A,\n"             // builtin type name of the elements
+    "    uint lda,\n"
+    "    __write_only image2d_t imA,\n"
+    "    uint startRow,\n"
+    "    uint offA)\n";
+
+/*
+ * template of the declaration part placed at the start of the
+ * trsmPrepare kernel, for a one dimensional work space
+ */
+static const char *trsmPrep1D =
+    "__local %s tempA[%lu];\n"            // element type and buffer length
+    "__local %s tempC[%lu];\n"
+    "int lid, gid;\n"
+    "uint currM, k0;\n"
+    "uint x, y, imx, imy;\n"
+    "\n"
+    "lid = get_local_id(0);\n"
+    "gid = get_global_id(0) / %u;\n"      // group ID
+    "A += offA;\n"
+    "\n";
+/* Read a block of A into temp<c>; filled in by genPrepareSquareBlock(). */
+static const char *readSquareBlock =
+    "y = (currM + %lu <= M) ? %lu : M - currM;\n"     // both sizes are clipped at M
+    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // just read with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"          // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
+    "}\n\n";
+/* variant without the size check, selected when the caller passes opt */
+static const char *readSquareBlockOpt =
+    // just read with an optimized function
+    "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";
+/* Same as readSquareBlock, but A is read with its coordinates swapped (tra). */
+static const char *readSquareBlockTrans =
+    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
+    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // read and transpose with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"          // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    // read and transpose with slow function
+    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
+    "}\n\n";
+/* variant without the size check, selected when the caller passes opt */
+static const char *readSquareBlockTransOpt =
+    // read and transpose with an optimized function
+    "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";
+
+/* Decide whether the multiplication is done with a transposed block of B. */
+static bool
+useTransposedMul(const SubproblemDim *dims, DataType dtype, bool trb)
+{
+    const unsigned int elemsPerVec = sizeof(cl_float4) / dtypeSize(dtype);
+    /* needs: trb not set, a non complex type and dims[1].x being a
+     * multiple of the number of elements held by one float4 */
+    if (trb || isComplexType(dtype)) { return false; }
+    return ((dims[1].x % elemsPerVec) == 0);
+}
+/* Compute the pitch, in elements, used for a block of B in local memory. */
+static size_t
+calcPitchB(const SubproblemDim *dim, DataType dtype, bool transpMul)
+{
+    const size_t elemSize = dtypeSize(dtype);
+    size_t rowLen;
+
+    /* a row is x elements long for the transposed multiply, else bwidth */
+    if (transpMul) { rowLen = dim->x; } else { rowLen = dim->bwidth; }
+
+    /* fl4RowWidth() result times sizeof(float4), converted back to elements */
+    return fl4RowWidth(rowLen, elemSize) * sizeof(cl_float4) / elemSize;
+}
+/* Emit the code reading a block of matrix A into the local buffer temp<c>. */
+static void
+genPrepareSquareBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    bool tra,                       /* selects the coordinate swapping templates */
+    char c,                         /* suffix of the destination buffer name */
+    bool opt)                       /* true: emit only the unconditional read */
+{
+    char tmp[1024];
+    const char *readBlock;
+    const unsigned long rows = (unsigned long)dim->y;   /* "%lu" operands */
+    const unsigned long width = (unsigned long)dim->bwidth;
+    const unsigned long pitch =
+        (unsigned long)matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
+    if (opt) {
+        readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt;
+        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c);
+    }
+    else {
+        readBlock = (tra) ? readSquareBlockTrans : readSquareBlock;
+        sprintf(tmp, readBlock, rows, rows, width, width, rows, width,
+                copyFuncs->read[MATRIX_A], c, zeroFuncs->names[MATRIX_A], c,
+                copyFuncs->readGeneric[MATRIX_A], c, pitch);
+    }
+    kgenAddStmt(ctx, tmp);
+}
+/* Emit a call of the generated zeroing function on the local buffer tempC. */
+static void
+genPrepZeroBlockC(struct KgenContext *ctx, const ZeroFuncs *zeroFuncs)
+{
+    char stmt[1024];
+    const char *zeroFn = zeroFuncs->names[MATRIX_A];
+
+    sprintf(stmt, "%s((__local float4*)tempC);\n", zeroFn);
+    kgenAddStmt(ctx, stmt);
+}
+/* Emit the call that stores the local block tempC into the image imA. */
+static void
+genWriteBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    const CopyBufFuncs *copyFuncs)
+{
+    char tmp[1024];
+    const unsigned long blockSize = (unsigned long)dim[0].y; /* "%lu" operand */
+    sprintf(tmp, "%s(imA, imx, imy, (LPtr)tempC, %lu, %lu, %lu);\n",
+        copyFuncs->write, blockSize, blockSize, blockSize);
+    kgenAddStmt(ctx, tmp);
+}
+/* Emit the OpenCL helper getBufferPos(): block number n -> block row/column. */
+static void
+getBufferPos(struct KgenContext *ctx, bool isU) //n -> x,y buffer
+{
+    /* statements for the upper (isU) and for the lower triangle; in both
+     * cases n is first rebased so that it counts from the beginning */
+    static const char *const upperStmts[3] = {
+        "n += (2 * width - startRow + 1) * (startRow) / 2;\n",
+        "*y = trunc((2 * width + 1) - "
+                  "sqrt((2 * width + 1) *"
+                  "(2 * width + 1) - 8 * n)) / 2;\n",
+        "*x = *y + n - (2 * width - *y + 1) * (*y) / 2;\n" };
+    static const char *const lowerStmts[3] = {
+        "n += startRow * (startRow + 1) / 2;\n",
+        "*y = trunc((-0.5 + sqrt(2.0 * n + 0.25)));\n",
+        "*x = n - (*y) * (*y + 1) / 2;\n" };
+    const char *const *stmts = (isU) ? upperStmts : lowerStmts;
+    int i;
+    kgenDeclareFunction(ctx, "void\ngetBufferPos(uint n, uint startRow, "
+                                                "uint width, uint *y, "
+                                                "uint *x)\n");
+    kgenBeginFuncBody(ctx);
+    for (i = 0; i < 3; i++) { kgenAddStmt(ctx, stmts[i]); }
+    kgenEndFuncBody(ctx);
+    kgenAddBlankLine(ctx);
+}
+/* Emit the statements computing the image coordinates imx/imy of a block. */
+static void
+genGetImagePos(
+    struct KgenContext *ctx,
+    const SubproblemDim *subdims,
+    DataType dtype,
+    const char *blockName,      /* kernel expression holding the block number */
+    bool tra) //n -> x,y image
+{
+    char tmp[1024];
+    const char *parName;
+    const char *op[2] = {"/", "%"};
+    const unsigned long stepY = (unsigned long)subdims[0].y;  /* "%lu" operands */
+    const unsigned long stepX = (unsigned long)
+        (subdims[0].y * dtypeSize(dtype) / sizeof(cl_float4));
+    parName = (tra) ? "bpc" : "bpr";    /* blocks per column or per row */
+    sprintf(tmp, "imy = %s %s %s * %lu;\n"
+                 "imx = (%s %s %s) * %lu;\n",
+            blockName, op[tra], parName, stepY,
+            blockName, op[1 - tra], parName, stepX);
+    kgenAddStmt(ctx, tmp);
+}
+
+// global memory to image converter: emits the <prefix>trsmPrepare kernel
+static ssize_t
+prepGenerator(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    char tmp[1024];
+    const char *typeName;
+    CopyBufFuncs copyFuncs;
+    ZeroFuncs zeroFuncs;
+    char fpref;
+    DataType dtype = kextra->dtype;
+    KernelExtraFlags kflags = kextra->flags;
+    ssize_t ret;
+    unsigned long pitchAB;
+    bool b, tra, trb, isU, transpMul;
+    BlasGenSettings gset;
+    const unsigned long blockY = (unsigned long)subdims[0].y;  // "%lu" operands
+    const unsigned long blockW = (unsigned long)subdims[0].bwidth;
+
+    if (pgran->wgDim != 1) {
+        return -EINVAL;
+    }
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
+    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
+    isU = isMatrixUpper(kflags);
+
+    // at first, generate needed declarations and auxiliary functions
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+
+    if (isComplexType(dtype)) {
+        genComplexMathOperators(ctx, dtype);
+    }
+
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+
+    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset,
+                         BCHF_MATRIX_A | BCHF_WRITE_OUTPUT | BCHF_IMAGE_WRITE);
+    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
+                         ZF_MATRIX_A);
+
+    //matrix inversion function
+    genInvertingBlockFunc(ctx, (unsigned int)subdims[0].bwidth, dtype, isU);
+
+    //coordinates calculation
+    getBufferPos(ctx, isU);
+
+    typeName = dtypeBuiltinType(dtype);
+    fpref = dtypeToBlasPrefix(dtype);
+
+    // now, generate the kernel; the template has exactly three placeholders
+    sprintf(tmp, trsmPrepDecl, fpref, 'M', typeName);
+
+    kgenDeclareFunction(ctx, tmp);
+    ret = kgenBeginFuncBody(ctx);
+
+    transpMul = useTransposedMul(subdims, dtype, trb);
+    if (!transpMul) {
+        sprintf(tmp, "const int bpr = get_image_width(imA) / %lu;\n",
+                (unsigned long)
+                (subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype))));
+    }
+    else {
+        sprintf(tmp, "const int bpc = get_image_height(imA) / %lu;\n",
+                blockY);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    /*
+     * Calculate local buffer pitches, and then insert the
+     * preparative code
+     */
+    pitchAB = (unsigned long)matrBlockPitch(subdims, MATRIX_A, dtype,
+                                            clblasLeft);
+    sprintf(tmp, trsmPrep1D, typeName, pitchAB * blockY,
+            typeName, pitchAB * blockY, pgran->wgSize[0]);
+    ret = kgenAddStmt(ctx, tmp);
+
+    sprintf(tmp, "getBufferPos(gid, startRow / %lu, (M + %lu) / %lu, &currM, &k0);\n",
+            blockY, blockY - 1, blockY);
+    kgenAddStmt(ctx, tmp);
+    sprintf(tmp, "currM *= %lu;\n"
+            "k0 *= %lu;\n", blockY, blockY);
+    kgenAddStmt(ctx, tmp);
+
+    genGetImagePos(ctx, subdims, dtype, "gid", transpMul);
+
+    kgenBeginBranch(ctx, "if (currM == k0)");   // diagonal block: gets inverted
+    genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
+                          tra, 'A', !(kextra->flags & KEXTRA_TAILS_M));
+    genPrepZeroBlockC(ctx, &zeroFuncs);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    if (kextra->flags & KEXTRA_UNIT_DIAGONAL) {
+        sprintf(tmp, "if (lid < %lu) {\n"
+                     "    tempA[lid * %lu + lid] = %s;\n"
+                     "}\n",
+                blockW, pitchAB, strOne(dtype));
+        kgenAddStmt(ctx, tmp);
+        kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+        kgenAddBlankLine(ctx);
+    }
+
+    sprintf(tmp, "if (lid < %lu)", blockW);
+    kgenBeginBranch(ctx, tmp);
+    sprintf(tmp, "invert(tempA, tempC, lid, (currM + %lu > M) ? "
+                                            "M - currM : %lu);\n",
+            blockY, blockY);
+    kgenAddStmt(ctx, tmp);
+    kgenEndBranch(ctx, NULL);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenEndBranch(ctx, NULL);
+
+    kgenBeginBranch(ctx, "else");               // other blocks: copied as they are
+    genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs, tra,
+                          'C', !(kextra->flags & KEXTRA_TAILS_M));
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenEndBranch(ctx, NULL);
+
+    genWriteBlock(ctx, subdims, &copyFuncs);
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+/* Emit a loop that sets every element of the result array c[] to zero. */
+static void
+genZeroResult(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const SubproblemDim *dims)
+{
+    char loopCode[1024];
+    unsigned int nrRegs;
+    const unsigned int elemsPerVec = sizeof(cl_float4) / dtypeSize(dtype);
+
+    /* the number of loop iterations is reported by getResultGPRsInfo() */
+    getResultGPRsInfo(dtype, dims + 1, elemsPerVec, &nrRegs, NULL);
+    sprintf(loopCode, "for (x = 0; x < %u; x++) {\n"
+                      "    c[x] = 0;\n"
+                      "}\n\n",
+            nrRegs);
+    kgenAddStmt(ctx, loopCode);
+}
+/* Emit the code reading a block of matrix B into the local buffer temp<c>. */
+static void
+genPrepareRectBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    bool trb,                       /* selects the coordinate swapping templates */
+    char c,                         /* suffix of the destination buffer name */
+    bool opt)                       /* true: emit only the unconditional read */
+{
+    char tmp[1024];
+    const char *readBlock;
+    unsigned long pitch, bsizes[2];     /* unsigned long: printed with "%lu" */
+
+    bsizes[0] = (unsigned long)dim->bwidth;
+    bsizes[1] = (unsigned long)dim->x;
+    /*
+     * NOTE: when B is accessed in the non transposed way, the block
+     *       multiplication is done with the transposed block of B
+     */
+    pitch = (unsigned long)calcPitchB(dim, dtype, !trb);
+    if (opt) {
+        readBlock = (trb) ? readRectBlockTransOpt : readRectBlockOpt;
+        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_B], c);
+    }
+    else {
+        readBlock = (trb) ? readRectBlockTrans : readRectBlock;
+        sprintf(tmp, readBlock, bsizes[trb], bsizes[trb], bsizes[1 - trb],
+                bsizes[1 - trb], bsizes[trb], bsizes[1 - trb],
+                copyFuncs->read[MATRIX_B], c, zeroFuncs->names[MATRIX_B], c,
+                copyFuncs->readGeneric[MATRIX_B], c, pitch);
+    }
+    kgenAddStmt(ctx, tmp);
+}
+/* Emit the OpenCL helper getNBlock(): block row/column -> block number n. */
+static void
+getNblock(struct KgenContext *ctx, bool isU) //x, y -> n
+{
+    /* block numbering formula for the upper (isU) or the lower triangle */
+    const char *formula = (isU) ?
+        "*n = ((2 * width - y + 1) * y - "
+            "(2 * width - startRow + 1) * startRow) / 2 + x - y;\n" :
+        "*n = (y * (y + 1) - startRow * (startRow + 1)) / 2 + x;\n";
+
+    kgenDeclareFunction(ctx, "void\ngetNBlock(uint y, uint x, uint startRow, "
+        "uint width, uint *n)\n");
+    kgenBeginFuncBody(ctx);
+    kgenAddStmt(ctx, formula);
+    kgenEndFuncBody(ctx);
+    kgenAddBlankLine(ctx);
+}
+/* Emit the image block lookup followed by the call of the block multiplier. */
+static void
+genMultiplication(
+    struct KgenContext *ctx,
+    const SubproblemDim *dims,
+    DataType dtype,
+    const char *blkmulName,         /* name of the multiplier to call */
+    BlkMulFlags mulFlags)
+{
+    char tmp[1024];
+    unsigned long u;                /* unsigned long: printed with "%lu" */
+    const unsigned long blockY = (unsigned long)dims[0].y;
+    const unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
+
+    if (mulFlags & BLKMUL_TRANSPOSED_B) {
+        u = 1;
+    }
+    else {
+        u = (unsigned long)matrBlockPitch(dims, MATRIX_B, dtype, clblasLeft);
+    }
+    // find image position and invoke the multiplier
+    sprintf(tmp, "getNBlock(m0 / %lu, k0 / %lu, startRow / %lu, "
+                           "(M + %lu) / %lu, &blockN);\n",
+            blockY, blockY, blockY, blockY - 1, blockY);
+    kgenAddStmt(ctx, tmp);
+    genGetImagePos(ctx, dims, dtype, "blockN", (mulFlags & BLKMUL_TRANSPOSED_B) != 0);
+    sprintf(tmp, "%s(A, (int2)(imx, imy + lid / %u * %lu), \n"
+                  "   (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
+                  "   c, skew);\n",
+            blkmulName, l1Pans, (unsigned long)dims[1].y,
+            l1Pans, (unsigned long)dims[1].x, u);
+    kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Emit the kernel helper "reorderResult(<outTypeName> *c, int skew)".
+ *
+ * The generated function repeats 'skew' times a rotation of every register
+ * column of the private result tile: the element of the last row is saved,
+ * all other rows are moved one row down, and the saved element is stored
+ * to the first row.
+ *
+ * ctx         - generator context
+ * subdims     - subproblem dimensions; subdims[1].y is the tile row count
+ * outTypeName - type name of the result registers
+ * colRegs     - number of registers per tile row
+ *
+ * The loop is emitted as a generator branch with one bounded statement per
+ * register column. Accumulating all the columns in a fixed size buffer and
+ * formatting them once more into another buffer of the same size overruns
+ * the stack buffers when 'colRegs' is large enough, and reads an
+ * uninitialized buffer when 'colRegs' is zero.
+ */
+static void
+genReorderSolution(
+    struct KgenContext *ctx,
+    const SubproblemDim *subdims,
+    const char *outTypeName,
+    unsigned int colRegs)
+{
+    char tmp[1024];
+    unsigned int i;
+
+    sprintf(tmp, "void\n"
+                 "reorderResult(%s *c, int skew)",
+            outTypeName);
+    kgenDeclareFunction(ctx, tmp);
+    kgenBeginFuncBody(ctx);
+
+    sprintf(tmp, "%s tmp;\n"
+                 "int i, j;\n",
+           outTypeName);
+    kgenAddStmt(ctx, tmp);
+    kgenAddBlankLine(ctx);
+
+    kgenBeginBranch(ctx, "for (i = 0; i < skew; i++)");
+    for (i = 0; i < colRegs; i++) {
+        /* index of the element in the last row of the column 'i' */
+        unsigned int k = (unsigned int)(subdims[1].y - 1) * colRegs + i;
+
+        /*
+         * 'j' is declared as a signed int in the generated code, so the
+         * descending loop terminates after the row 0 has been moved
+         */
+        sprintf(tmp, "tmp = c[%u];\n"
+                     "for (j = %lu; j >= 0; j--) {\n"
+                     "    c[(j+1) * %u + %u] = c[j * %u + %u];\n"
+                     "}\n"
+                     "c[%u] = tmp;\n",
+                k, (unsigned long)(subdims[1].y - 2), colRegs, i,
+                colRegs, i, i);
+        kgenAddStmt(ctx, tmp);
+    }
+    kgenEndBranch(ctx, NULL);
+
+    kgenEndFuncBody(ctx);
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Fill the table of kernel variable names used by the shared code
+ * generators: matrix names, the components of the coordinate vectors
+ * addressing A and B (selected by the access order of each matrix),
+ * and the size variables.
+ */
+static void
+initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
+{
+    kvars->A = "imgA";
+    kvars->B = "B";
+
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "origM";
+
+    kvars->coordA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A) ?
+        "coordA.x" : "coordA.y";
+    kvars->coordB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B) ?
+        "coordB.x" : "coordB.y";
+}
+
+/*
+ * Image based kernel generator.
+ *
+ * Generates the computing TRSM kernel reading matrix A from an image and
+ * matrix B from a buffer, together with all auxiliary functions it calls.
+ *
+ * buf, buflen - destination buffer and its size, passed to the generator
+ *               context
+ * subdims     - subproblem dimensions for both the decomposition levels
+ * pgran       - work group granularity; only the one dimensional work
+ *               space is supported
+ * extra       - pointer to a CLBLASKernExtra structure
+ *
+ * Returns the source size including the terminating byte on success,
+ * -EINVAL if the work space is not one dimensional, -ENOMEM if the
+ * generator context cannot be created and -EOVERFLOW if the code
+ * generation fails.
+ */
+static ssize_t
+generator(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    CLBLASKernExtra kextraTmp = *kextra;
+    char tmp[1024], tmp1[1024];
+    char blkmul[FUNC_NAME_MAXLEN];
+    char updateResFn[FUNC_NAME_MAXLEN];
+    char updateResGenericFn[FUNC_NAME_MAXLEN];
+    char updateResFnRev[FUNC_NAME_MAXLEN];
+    char updateResGenericFnRev[FUNC_NAME_MAXLEN];
+    char copyPLFn[FUNC_NAME_MAXLEN];
+    char *s1 = "";
+    const char *typeName;
+    CopyBufFuncs copyFuncs;
+    ZeroFuncs zeroFuncs;
+    char fpref;
+    DataType dtype = kextra->dtype;
+    ssize_t ret;
+    BlasGenSettings gset;
+    BlkMulOpts mulOpts;
+    BlkMulFlags mulFlags;
+    size_t pitchAB;
+    size_t u;
+    bool b;
+    bool isU;
+    bool areTails;
+    const char *outTypeName;
+    unsigned int nrRegs, colRegs;
+    KernelExtraFlags kflags = kextra->flags;
+    size_t tsize;
+    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+    UpdateResultFlags upFlags;
+    int tra, trb;
+    unsigned int l1Pans;
+    char vect[2] = {'y', 'x'};
+
+    if (pgran->wgDim != 1) {
+        return -EINVAL;
+    }
+
+    /*
+     * The context is created exactly once; creating it a second time
+     * would overwrite the pointer and leak the first context
+     */
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    tsize = dtypeSize(dtype);
+    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) != 0;
+    isU = isMatrixUpper(kflags);
+
+    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
+    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
+    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
+
+    /*
+     * Force generation of the transposed version of the block
+     * reading function, followed by multiplication with the transposed
+     * block B, to decrease LDS bank conflicts without using column skew.
+     * The column-major flag is temporarily reversed for that
+     */
+    if (useTransposedMul(subdims, dtype, trb)) {
+        if (kflags & KEXTRA_COLUMN_MAJOR) {
+            kflags &= ~KEXTRA_COLUMN_MAJOR;
+        }
+        else {
+            kflags |= KEXTRA_COLUMN_MAJOR;
+        }
+        mulFlags = BLKMUL_SKEW_ROW | BLKMUL_TRANSPOSED_B;
+        u = subdims[1].y;
+    }
+    else {
+        mulFlags = BLKMUL_SKEW_COLUMN;
+        u = subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype));
+    }
+
+    // at first, generate needed declarations and auxiliary functions
+
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+
+    kextraTmp.flags = kflags;
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.kextra = &kextraTmp;
+    gset.pgran = pgran;
+    initKernelVarNames(&gset.varNames, kextra->flags);
+
+    if (isComplexType(dtype)) {
+        genComplexMathOperators(ctx, dtype);
+    }
+
+    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_B);
+    /*
+     * Temporary kernel extra has been needed to produce inverted block B read.
+     * Restore the original one, and restore kflags as well
+     */
+    gset.kextra = kextra;
+    kflags = kextra->flags;
+
+    // functions updating result
+    // for the final result
+    generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn,
+                       updateResGenericFn);
+    // for intermediate result after blocks modification
+    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
+    upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
+    genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
+                           updateResGenericFnRev);
+    // for heaping before multiplying on inverted block
+    upFlags = UPRES_USE_LDS;
+    if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
+        upFlags |= UPRES_COLUMN_MAJOR;
+    }
+    updateResultGenOld(ctx, &gset, UPRES_SET, upFlags, NULL);
+    kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx);
+    kgenAddBlankLine(ctx);
+
+    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
+                         ZF_MATRIX_B | ZF_MATRIX_C);
+
+    // block multiplication function
+    mulOpts.aMobj = CLMEM_IMAGE;
+    mulOpts.bMobj = CLMEM_BUFFER;
+    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | mulFlags;
+    if (isComplexType(dtype)) {
+        mulOpts.core = BLKMUL_SEPARATE_MULADD;
+    }
+    else {
+        mulOpts.core = BLKMUL_MAD;
+    }
+    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
+    if (ret) {
+        destroyKgenContext(ctx);
+
+        return -EOVERFLOW;
+    }
+
+    kgenAddBlankLine(ctx);
+    kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);
+
+    typeName = dtypeBuiltinType(dtype);
+    fpref = dtypeToBlasPrefix(dtype);
+
+    // block number calculation
+    getNblock(ctx, isU);
+
+    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
+    if (isComplexType(dtype)) {
+        colRegs = (unsigned int)subdims[1].x;
+    }
+    else {
+        colRegs = (unsigned int)fl4RowWidth(subdims[1].x, tsize);
+    }
+
+    if (mulFlags & BLKMUL_SKEW_ROW) {
+        genReorderSolution(ctx, subdims, outTypeName, colRegs);
+    }
+
+    // now, generate the kernel
+
+    if (kflags & KEXTRA_SIDE_RIGHT) {
+        sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
+            fpref, 'N', 'M', typeName, typeName, typeName, typeName);
+    }
+    else {
+        sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
+            fpref, 'M', 'N', typeName, typeName, typeName, typeName);
+    }
+
+    kgenDeclareFunction(ctx, tmp);
+    ret = kgenBeginFuncBody(ctx);
+
+    if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
+        sprintf(tmp, "const int bpr = get_image_width(A) / %lu;\n",
+                subdims[0].y / (sizeof(cl_float4) / tsize));
+    }
+    else {
+        sprintf(tmp, "const int bpc = get_image_height(A) / %lu;\n",
+                subdims[0].y);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    /*
+     * Calculate local buffer pitches, and then insert the
+     * preparative code
+     */
+    pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
+
+    sprintf(tmp, trsmImPrep1D, typeName, pitchAB * subdims[0].x,
+        outTypeName, nrRegs, u, s1, pgran->wgSize[0], subdims[0].itemX);
+    kgenAddStmt(ctx, tmp);
+    kgenAddBlankLine(ctx);
+
+    kgenAddStmt(ctx, "B += offB;\n");
+    sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n"
+                 "coordB.%c = 0;\n\n",
+            vect[trb], l1Pans, subdims[1].x, vect[1 - trb]);
+    kgenAddStmt(ctx, tmp);
+
+   /*
+    * Matrix B is divided into panels; each work group
+    * multiplies such a panel by the whole matrix A.
+    */
+
+    // top level loop over M
+    if (isU) {
+        sprintf(tmp1, "(((finishRow - 1) / %lu) * %lu)", subdims[0].y,
+                subdims[0].y); //last block start
+        sprintf(tmp, "for (m0 = %s; m0 + %lu != startRow; m0 -= %lu)",
+                tmp1, subdims[0].y, subdims[0].y);
+        ret = kgenBeginBranch(ctx, tmp);
+    }
+    else {
+        sprintf(tmp, "for (m0 = startRow; m0 < finishRow; m0 += %lu)",
+                subdims[0].y);
+        ret = kgenBeginBranch(ctx, tmp);
+    }
+
+    sprintf(tmp, "coordA.%c = m0 + lid / %u * %lu;\n"
+                 "coordA.%c = 0;\n\n",
+            vect[tra], l1Pans, subdims[1].y, vect[1 - tra]);
+    kgenAddStmt(ctx, tmp);
+
+    genZeroResult(ctx, dtype, subdims);
+
+    // loop over K
+    if (isU) {
+        sprintf(tmp, "for (k0 = m0 + %lu; k0 < M; k0 += %lu)",
+            subdims[0].bwidth, subdims[0].bwidth);
+    }
+    else {
+        sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)",
+            subdims[0].bwidth);
+    }
+    ret = kgenBeginBranch(ctx, tmp);
+
+    genPrepareRectBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
+                        trb, 'C', !areTails);
+
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    // multiplication in the adjusting loop
+    genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
+
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenEndBranch(ctx, NULL); // loop over K
+    kgenAddBlankLine(ctx);
+
+    if (mulFlags & BLKMUL_SKEW_ROW) {
+        kgenAddStmt(ctx, "reorderResult(c, skew);\n");
+    }
+    kgenAddStmt(ctx, "k0 = m0;\n");
+
+    genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev,
+                                  updateResGenericFnRev, true);
+
+    genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC");
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    genZeroResult(ctx, dtype, subdims);
+
+    // multiplication on the inverted block
+    genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
+    if (mulFlags & BLKMUL_SKEW_ROW) {
+        kgenAddStmt(ctx, "reorderResult(c, skew);\n");
+    }
+
+    // write back the tile evaluated
+    upFlags = UPRES_EXCEED_PROBLEM_CONDITION;
+    if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_C)) {
+        upFlags |= UPRES_COLUMN_MAJOR;
+    }
+    genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn,
+                                updateResGenericFn, NULL);
+
+    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
+
+    // end external loops over panels of matrix A
+    kgenEndBranch(ctx, NULL);
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+ * Dispatch the generation request by the kernel type stored in the kernel
+ * extra information: the computing kernel is produced by generator(),
+ * any other kernel type by prepGenerator(). All arguments are forwarded
+ * unchanged and the result of the chosen generator is returned.
+ */
+static ssize_t
+wrapper(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra)
+{
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)extra;
+
+    if (kextra->kernType != CLBLAS_COMPUTING_KERNEL) {
+        return prepGenerator(buf, buflen, subdims, pgran, extra);
+    }
+
+    return generator(buf, buflen, subdims, pgran, extra);
+}
+
+/*
+ * Fill the kernel argument array from the BLAS level arguments.
+ *
+ * The layout depends on the kernel type. The computing kernel takes two
+ * sizes, alpha, the scratch image, matrix B with its leading dimension,
+ * the start and finish rows and the offset of B. Any other kernel type
+ * takes one size, matrix A with its leading dimension, the scratch image,
+ * the start row and the offset of A. Which problem sizes and offsets are
+ * passed is selected by the side of the triangular matrix.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
+    const bool leftSide = (blasArgs->side == clblasLeft);
+
+    (void)extra;
+
+    if (blasArgs->kernType != CLBLAS_COMPUTING_KERNEL) {
+        initSizeKarg(&args[0], (leftSide) ? blasArgs->M : blasArgs->N);
+        initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0);
+        initSizeKarg(&args[2], blasArgs->lda.matrix);
+        initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
+        initSizeKarg(&args[4],
+                     (leftSide) ? blasArgs->offsetM : blasArgs->offsetN);
+        initSizeKarg(&args[5], blasArgs->offA);
+
+        return;
+    }
+
+    initSizeKarg(&args[0], (leftSide) ? blasArgs->K : blasArgs->M);
+    initSizeKarg(&args[1], (leftSide) ? blasArgs->N : blasArgs->K);
+    assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
+    initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
+    initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0);
+    initSizeKarg(&args[5], blasArgs->ldb.matrix);
+    /* start row, then the row following the last processed one */
+    initSizeKarg(&args[6],
+                 (leftSide) ? blasArgs->offsetM : blasArgs->offsetN);
+    initSizeKarg(&args[7],
+                 (leftSide) ? (blasArgs->M + blasArgs->offsetM) :
+                              (blasArgs->N + blasArgs->offsetN));
+    initSizeKarg(&args[8], blasArgs->offBX);
+}
+
+/*
+ * Check whether the local memory demand of the decomposition fits the
+ * available local memory size.
+ *
+ * The prepare kernel requires two square local blocks and the main kernel
+ * two rectangular ones, so twice the larger of the block sizes of A and B
+ * (in bytes) is compared against 'ldsSize'.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
+    cl_ulong blockA, blockB, largest;
+
+    blockA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
+    blockB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
+
+    largest = blockB;
+    if (blockA > blockB) {
+        largest = blockA;
+    }
+
+    return (2 * largest * dtypeSize(dtype) <= ldsSize);
+}
+
+/*
+ * Calculate the global work size.
+ *
+ * For the kernel preparing matrix A one work group is launched per block
+ * of the processed part of the triangle: the number of such blocks
+ * between the start and the finish block rows is multiplied by the work
+ * group size and threads[1] is set to 0. For the other kernel type the
+ * calculation is delegated to calcGlobalThreads().
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *dims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)args;
+    SubproblemDim globDim, offDim;
+    size_t width, startBlock, finishBlock, nrBlocks;
+    /* effective triangle kind after applying transposition and side */
+    bool isU = (kargs->uplo == clblasUpper) ^
+        (kargs->transA != clblasNoTrans) ^ (kargs->side == clblasRight);
+
+    (void)extra;
+
+    /* triangle size in blocks, rounded up */
+    width = kargs->K;
+    width = (width + dims[0].bwidth - 1) / dims[0].bwidth;
+    kargsToProbDims(&globDim, CLBLAS_TRSM, kargs, false);
+    kargsToProbDims(&offDim, CLBLAS_TRSM, kargs, true);
+
+    startBlock = offDim.y / dims[0].bwidth;
+    finishBlock = (globDim.y + offDim.y + dims[0].bwidth - 1) / dims[0].bwidth;
+
+    if (kargs->kernType != CLBLAS_PREP_A_KERNEL) {
+        calcGlobalThreads(threads, dims, pgran, globDim.y, globDim.x);
+        return;
+    }
+
+    if (isU) {
+        nrBlocks = (2 * width - startBlock - finishBlock + 1) *
+            (finishBlock - startBlock) / 2;
+    }
+    else {
+        nrBlocks = (1 + finishBlock + startBlock) *
+            (finishBlock - startBlock) / 2;
+    }
+
+    threads[0] = nrBlocks * pgran->wgSize[0];
+    threads[1] = 0;
+}
+
+/*
+ * Report how data is packed into the image.
+ *
+ * The pack rate is always the top level block height dims[0].y. The pack
+ * order is row major when matrix B is accessed in the column major way or
+ * the data type is complex, and column major otherwise. 'dataID' is not
+ * used.
+ */
+static void
+imgPackMode(
+    const void *extra,
+    const SubproblemDim *dims,
+    int dataID,
+    unsigned int *packRate,
+    clblasOrder *packOrder)
+{
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)extra;
+    bool rowPacked;
+
+    (void)dataID;
+
+    rowPacked = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B) ||
+                isComplexType(kextra->dtype);
+
+    *packOrder = (rowPacked) ? clblasRowMajor : clblasColumnMajor;
+    *packRate = (unsigned int)dims[0].y;
+}
+
+/*
+ * Solver properties of this pattern: one dimensional work space and
+ * square blocks of the top level input.
+ */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags flags = (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
+
+    return flags;
+}
+
+/*
+ * Initialize the memory pattern descriptor of the image based block trsm:
+ * a two level pattern using the solver operations and the extra
+ * information defined in this file, with matrix A kept in an image and
+ * matrix B in a buffer.
+ */
+void
+initTrsmImgPattern(MemoryPattern *mempat)
+{
+    /* pattern specific extra information */
+    mpatExtra.mobjA = CLMEM_IMAGE;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
+    mpatExtra.bMset = CLMEM_LEVEL_LDS;
+
+    mempat->name = "Image based block trsm";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &solverOps;
+    mempat->extra = &mpatExtra;
+}
diff --git a/src/library/blas/gens/legacy/trsm_kgen_legacy.c b/src/library/blas/gens/legacy/trsm_kgen_legacy.c
new file mode 100644
index 0000000..2aa91cc
--- /dev/null
+++ b/src/library/blas/gens/legacy/trsm_kgen_legacy.c
@@ -0,0 +1,190 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+
+#include "../blas_kgen.h"
+#include "trsm_kgen_legacy.h"
+
+/*
+ * Emit the code updating the intermediate TRSM result held in "c" with
+ * the matching tile of matrix B.
+ *
+ * ctx             - generator context
+ * gset            - generator settings; data type, flags, the second
+ *                   level subproblem dimensions and the kernel variable
+ *                   names are taken from it
+ * optFuncName     - name of the update function for full tiles
+ * genericFuncName - name of the update function for partial tiles
+ * withMhitCond    - if true and the kernel handles tails, the update is
+ *                   guarded by a check that the tile coordinates are
+ *                   within the matrix
+ *
+ * Without tails a single call of the optimized function is emitted.
+ * Otherwise the actual tile sizes "y" and "x" are evaluated within their
+ * own scope and the generic function is called for a partial tile.
+ */
+void
+genUpdateIntermTrsmResult(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const char *optFuncName,
+    const char *genericFuncName,
+    bool withMhitCond)
+{
+    char stmt[1024];
+    const char *negInvAlpha, *one;
+    const DataType dtype = gset->kextra->dtype;
+    const KernelExtraFlags kflags = gset->kextra->flags;
+    const SubproblemDim *tile = &gset->subdims[1];
+    const KernelVarNames *names = &gset->varNames;
+    const char *rowCoord = names->coordA;
+    const char *colCoord = names->coordB;
+
+    /* pick the scalar expressions matching the data type */
+    if (!isComplexType(dtype)) {
+        negInvAlpha = "-1. / alpha";
+        one = "1.";
+    }
+    else if (dtype == TYPE_COMPLEX_FLOAT) {
+        negInvAlpha = "div((float2)(-1.f, 0), alpha)";
+        one = "(float2)(1.f, 0)";
+    }
+    else {
+        negInvAlpha = "div((double2)(-1., 0), alpha)";
+        one = "(double2)(1., 0)";
+    }
+
+    if (!(kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N))) {
+        sprintf(stmt, "%s(B, c, %s, %s, %s, ldb, %s);\n",
+                optFuncName, one, rowCoord, colCoord, negInvAlpha);
+        kgenAddStmt(ctx, stmt);
+
+        return;
+    }
+
+    if (withMhitCond) {
+        sprintf(stmt, "if ((%s < %s) && (%s < %s))",
+                rowCoord, names->sizeM, colCoord, names->sizeN);
+        kgenBeginBranch(ctx, stmt);
+    }
+    else {
+        /* for x, y variables scope */
+        kgenBeginBranch(ctx, NULL);
+    }
+
+    sprintf(stmt, "uint y = min(%luu, %s - (uint)%s);\n"
+                  "uint x = min(%luu, %s - (uint)%s);\n"
+                  "if ((y == %luu) && (x == %luu)) {\n"
+                  "    %s(B, c, %s, %s, %s, ldb, %s);\n"
+                  "}\n"
+                  "else {\n"
+                  "    %s(B, c, %s, %s, %s, ldb, %s, y, x);\n"
+                  "}\n",
+            tile->y, names->sizeM, rowCoord,
+            tile->x, names->sizeN, colCoord,
+            tile->y, tile->x,
+            optFuncName, one, rowCoord, colCoord, negInvAlpha,
+            genericFuncName, one, rowCoord, colCoord, negInvAlpha);
+    kgenAddStmt(ctx, stmt);
+
+    kgenEndBranch(ctx, NULL);
+}
+
+/*
+ * Emit the call of 'funcName' storing the private result "c" into the
+ * local buffer 'dstName'.
+ *
+ * The row and column passed to the emitted call are derived from the
+ * local ID and the second level tile sizes; the last emitted argument is
+ * the top level block width. The scalar argument is the unit value of
+ * the data type taken from 'gset'.
+ */
+void
+genHeapTrsmResultToLDS(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const char *funcName,
+    const char *dstName)
+{
+    char stmt[1024];
+    const char *one;
+    const SubproblemDim *dims = gset->subdims;
+    const DataType dtype = gset->kextra->dtype;
+    /* number of second level tiles along X in the top level block */
+    const unsigned int l1Pans =
+        (unsigned int)dims[0].x / (unsigned int)dims[1].x;
+
+    if (!isComplexType(dtype)) {
+        one = "1.";
+    }
+    else if (dtype == TYPE_COMPLEX_FLOAT) {
+        one = "(float2)(1.f, 0)";
+    }
+    else {
+        one = "(double2)(1., 0)";
+    }
+
+    sprintf(stmt, "%s(%s, c, %s, (lid / %u * %lu), (lid %% %u * %lu), %lu);\n",
+            funcName, dstName, one, l1Pans, dims[1].y, l1Pans, dims[1].x,
+            dims[0].bwidth);
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Emit the kernel function
+ * "invert(__local T *src, __local T *dst, int lid, int lastRow)".
+ *
+ * In the generated code every work item handles the column 'lid' of
+ * 'dst': it puts 1 on the diagonal, then walks over the rows (backward
+ * for an upper triangular matrix, forward otherwise), dividing the
+ * current element by the diagonal element of 'src' and eliminating it
+ * from the remaining rows.
+ *
+ * ctx    - generator context
+ * pitch  - row pitch of both local blocks, in elements
+ * dtype  - element data type; complex types use div()/mul()
+ * kflags - kernel flags telling whether the matrix is upper triangular
+ */
+void
+genInvertingBlockFunc(
+    struct KgenContext *ctx,
+    size_t pitch,
+    DataType dtype,
+    KernelExtraFlags kflags)
+{
+    char stmt[1024];
+    const char *elemType = dtypeBuiltinType(dtype);
+    const bool cplx = isComplexType(dtype);
+    const bool upper = isMatrixUpper(kflags);
+
+    sprintf(stmt, "void\ninvert(__local %s *src, __local %s *dst, int lid, "
+                               "int lastRow)\n", elemType, elemType);
+    kgenDeclareFunction(ctx, stmt);
+    kgenBeginFuncBody(ctx);
+    kgenAddStmt(ctx, "int i, k;\n");
+
+    /* unit diagonal element of the processed column */
+    sprintf(stmt, (cplx) ? "dst[lid * %lu + lid].x = 1.f;\n" :
+                           "dst[lid * %lu + lid] = 1.f;\n",
+            pitch);
+    kgenAddStmt(ctx, stmt);
+
+    /* loop over the rows */
+    kgenBeginBranch(ctx, (upper) ? "for (i = lastRow - 1; i >= 0; i--)" :
+                                   "for (i = 0; i < lastRow; i++)");
+
+    if (cplx) {
+        sprintf(stmt, "dst[i * %lu + lid] = div(dst[i * %lu + lid], "
+                      "src[i * %lu + i]);\n", pitch, pitch, pitch);
+    }
+    else {
+        sprintf(stmt, "dst[i * %lu + lid] = dst[i * %lu + lid] / "
+                      "src[i * %lu + i];\n", pitch, pitch, pitch);
+    }
+    kgenAddStmt(ctx, stmt);
+
+    /* elimination loop over the remaining rows */
+    if (upper) {
+        kgenBeginBranch(ctx, "for (k = 0; k < i; k++)");
+    }
+    else {
+        sprintf(stmt, "for (k = i + 1; k < %lu; k++)", pitch);
+        kgenBeginBranch(ctx, stmt);
+    }
+
+    if (cplx) {
+        sprintf(stmt, "dst[k * %lu + lid] = dst[k * %lu + lid] - "
+                      "mul(src[k * %lu + i], dst[i * %lu + lid]);\n",
+                pitch, pitch, pitch, pitch);
+    }
+    else {
+        sprintf(stmt, "dst[k * %lu + lid] = dst[k * %lu + lid] - "
+                       "dst[i * %lu + lid] * src[k * %lu + i];\n",
+                pitch, pitch, pitch, pitch);
+    }
+    kgenAddStmt(ctx, stmt);
+
+    kgenEndBranch(ctx, NULL);
+    kgenEndBranch(ctx, NULL);
+    kgenEndFuncBody(ctx);
+}
+
diff --git a/src/library/blas/gens/legacy/trsm_kgen_legacy.h b/src/library/blas/gens/legacy/trsm_kgen_legacy.h
new file mode 100644
index 0000000..272150e
--- /dev/null
+++ b/src/library/blas/gens/legacy/trsm_kgen_legacy.h
@@ -0,0 +1,43 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TRSM_KGEN_LEGACY_H_
+#define TRSM_KGEN_LEGACY_H_
+
+/*
+ * Generate the code updating the intermediate TRSM result with a tile of
+ * matrix B.
+ *
+ * ctx             - generator context
+ * gset            - generator settings
+ * optFuncName     - name of the update function used for full tiles
+ * genericFuncName - name of the update function used for partial tiles
+ * withMhitCond    - guard the update with a check that the tile
+ *                   coordinates are within the matrix; it takes effect
+ *                   only for kernels handling tails
+ */
+void
+genUpdateIntermTrsmResult(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const char *optFuncName,
+    const char *genericFuncName,
+    bool withMhitCond);
+
+/*
+ * Generate the call of 'funcName' storing the private result "c" into
+ * the local buffer named 'dstName'.
+ */
+void
+genHeapTrsmResultToLDS(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const char *funcName,
+    const char *dstName);
+
+/*
+ * Generate the kernel function "invert" working on two local memory
+ * blocks "src" and "dst".
+ *
+ * pitch  - row pitch of the blocks, in elements
+ * dtype  - element data type
+ * kflags - kernel flags; define whether the matrix is upper triangular
+ */
+void
+genInvertingBlockFunc(
+    struct KgenContext *ctx,
+    size_t pitch,
+    DataType dtype,
+    KernelExtraFlags kflags);
+
+#endif /* TRSM_KGEN_LEGACY_H_ */
diff --git a/src/library/blas/gens/legacy/trsm_lds.c b/src/library/blas/gens/legacy/trsm_lds.c
new file mode 100644
index 0000000..c2dd87a
--- /dev/null
+++ b/src/library/blas/gens/legacy/trsm_lds.c
@@ -0,0 +1,649 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * LDS based trsm generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <matrix_dims.h>
+
+#include "../init.h"
+#include "blas_kgen_legacy.h"
+#include "gen_helper_legacy.h"
+#include "trsm_kgen_legacy.h"
+#include "../trxm_common.h"
+#include "../trsm_kgen.h"
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ *  Template of the preparation part of the memory object based trsm
+ *  kernel for a one dimensional work space.
+ *
+ *  Format arguments, in order: element type and size of tempA, element
+ *  type and size of tempC, type and number of the result registers "c",
+ *  an optional extra declaration, and the divisor turning the global ID
+ *  into the group ID.
+ */
+static const char *trsmPrep1D =
+    "uint m0, k0;\n"
+    "__local %s tempA[%lu];\n"
+    "__local %s tempC[%lu];\n"
+    "%s c[%u];\n"
+    "int lid, gid;\n"
+    "%s"                                    // groups per Panel variable
+    "uint currM, currN;\n"
+    "uint x, y;\n"
+    "uint2 coordA, coordB;\n"
+    "\n"
+    "lid = get_local_id(0);\n"
+    "gid = get_global_id(0) / %u;\n"       // group ID
+    "\n";
+
+/*
+ * Templates reading a square block of matrix A into a local buffer.
+ *
+ * The full templates evaluate the actual block sizes "y" and "x" first;
+ * a complete block is read with the optimized function, an incomplete
+ * one is zeroed and then read with the generic function.
+ * Format arguments, in order: the block height twice, the block width
+ * twice, the block height, the block width, the optimized read function
+ * name, the buffer name suffix, the zeroing function name, the buffer
+ * name suffix, the generic read function name, the buffer name suffix,
+ * and the local buffer pitch.
+ */
+static const char *readSquareBlock =
+    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
+    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // just read with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"           // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
+    "}\n\n";
+
+/*
+ * Short form used when no size check is needed.
+ * Format arguments: the read function name and the buffer name suffix.
+ */
+static const char *readSquareBlockOpt =
+    // just read with an optimized function
+    "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";
+
+/*
+ * Transposing counterpart of readSquareBlock; takes the same format
+ * arguments. The start coordinates and the sizes are passed to the read
+ * functions in the swapped order.
+ */
+static const char *readSquareBlockTrans =
+    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
+    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // read and transpose with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"           // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    // read and transpose with slow function
+    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
+    "}\n\n";
+
+/*
+ * Transposing counterpart of readSquareBlockOpt; takes the same format
+ * arguments.
+ */
+static const char *readSquareBlockTransOpt =
+    // read and transpose with an optimized function
+    "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";
+
+/*
+ * Templates reading a rectangular block of matrix B into a local buffer.
+ *
+ * The full templates evaluate the actual block sizes "y" and "x" first;
+ * a complete block is read with the optimized function, an incomplete
+ * one is zeroed and then read with the generic function.
+ * Format arguments, in order: the block size along N twice, the block
+ * width twice, the block size along N, the block width, the optimized
+ * read function name, the buffer name suffix, the zeroing function name,
+ * the buffer name suffix, the generic read function name, the buffer
+ * name suffix, and the local buffer pitch.
+ */
+static const char *readRectBlock =
+    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
+    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // just read with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"           // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n"
+    "}\n\n";
+
+/*
+ * Short form used when no size check is needed.
+ * Format arguments: the read function name and the buffer name suffix.
+ */
+static const char *readRectBlockOpt =
+    // just read with an optimized function
+    "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n";
+
+/*
+ * Transposing counterpart of readRectBlock; takes the same format
+ * arguments. The start coordinates and the sizes are passed to the read
+ * functions in the swapped order.
+ */
+static const char *readRectBlockTrans =
+    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
+    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+    "if ((y == %lu) && (x == %lu)) {\n"
+    // read and transpose with an optimized function
+    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"
+    "}\n"
+    "else {\n"
+    "    %s((__local float4*)temp%c);\n"           // zeroing
+    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+    // read and transpose with slow function
+    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n"
+    "}\n\n";
+
+/*
+ * Transposing counterpart of readRectBlockOpt; takes the same format
+ * arguments.
+ */
+static const char *readRectBlockTransOpt =
+    // read and transpose with an optimized function
+    "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n";
+
+/*
+ * Forward declarations of the callbacks referenced by the solver
+ * operations table below.
+ */
+static ssize_t
+generator(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static SolverFlags
+solverFlags(void);
+
+/*
+ * Solver operations of the LDS based trsm pattern. Only the kernel
+ * generator, the argument assignment, the local memory fit check and the
+ * solver flags callbacks are provided; all the other slots are left
+ * empty.
+ */
+static SolverOps solverOps = {
+    generator,
+    assignKargs,
+    isFitToLDS,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    solverFlags,
+    NULL, //fixupArgs
+    NULL, //getDefaultDecomp
+   	NULL, //getDecompList
+   	NULL,
+   	NULL
+};
+
+/*
+ * Emit the loop zeroing the private result registers "c". The number of
+ * the registers is obtained from getResultGPRsInfo() for the second
+ * level tile.
+ */
+static void
+genZeroResult(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const SubproblemDim *dims)
+{
+    char stmt[1024];
+    unsigned int nrRegs;
+    const unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+
+    getResultGPRsInfo(dtype, &dims[1], vecLen, &nrRegs, NULL);
+
+    sprintf(stmt, "for (x = 0; x < %u; x++) {\n"
+                  "    c[x] = 0;\n"
+                  "}\n\n", nrRegs);
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Emit the statement loading a square block of matrix A into the local
+ * buffer "temp<c>".
+ *
+ * The transposing template is used when A is accessed in the column
+ * major way. If the kernel has no tails along M only the optimized read
+ * is emitted; otherwise the template checking the block sizes is used.
+ */
+static void
+genPrepareSquareBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags kflags,
+    char c)
+{
+    char stmt[1024];
+    const char *fmt;
+    const bool colMajA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
+    const size_t ldsPitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
+
+    if (kflags & KEXTRA_TAILS_M) {
+        fmt = (colMajA) ? readSquareBlockTrans : readSquareBlock;
+        sprintf(stmt, fmt, dim->y, dim->y, dim->bwidth, dim->bwidth,
+                dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c,
+                zeroFuncs->names[MATRIX_A], c,
+                copyFuncs->readGeneric[MATRIX_A], c, ldsPitch);
+    }
+    else {
+        fmt = (colMajA) ? readSquareBlockTransOpt : readSquareBlockOpt;
+        sprintf(stmt, fmt, copyFuncs->read[MATRIX_A], c);
+    }
+
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Emit the code that reads a rectangular block of matrix B into the
+ * local buffer whose name suffix is 'c'.
+ *
+ * When either KEXTRA_TAILS_N or KEXTRA_TAILS_M is set, the template with
+ * size checks is used and it additionally receives the block sizes, the
+ * zeroing function, the generic read function and the local buffer pitch.
+ * Otherwise the "Opt" template is used. The transposed variant of either
+ * template is picked when matrix B is accessed in column major order.
+ */
+static void
+genPrepareRectBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags kflags,
+    char c)
+{
+    const bool colMajB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
+    const size_t ldsPitch = matrBlockPitch(dim, MATRIX_B, dtype, clblasLeft);
+    const char *fmt;
+    char stmt[1024];
+
+    if (kflags & (KEXTRA_TAILS_N | KEXTRA_TAILS_M)) {
+        /* the problem may end inside the block: read with size checks */
+        fmt = colMajB ? readRectBlockTrans : readRectBlock;
+        sprintf(stmt, fmt,
+                dim->x, dim->x, dim->bwidth, dim->bwidth,
+                dim->x, dim->bwidth,
+                copyFuncs->read[MATRIX_B], c,
+                zeroFuncs->names[MATRIX_B], c,
+                copyFuncs->readGeneric[MATRIX_B], c, ldsPitch);
+    }
+    else {
+        /* whole blocks only: plain fast read */
+        fmt = colMajB ? readRectBlockTransOpt : readRectBlockOpt;
+        sprintf(stmt, fmt, copyFuncs->read[MATRIX_B], c);
+    }
+
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Emit a call of the matrix A zeroing function on the 'tempA' buffer.
+ */
+static void
+genZeroBlockA(
+    struct KgenContext *ctx,
+    const ZeroFuncs *zeroFuncs)
+{
+    char stmt[1024];
+    const char *zeroFn = zeroFuncs->names[MATRIX_A];
+
+    sprintf(stmt, "%s((__local float4*)tempA);\n", zeroFn);
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Generate the control statement of the loop over K and open its branch.
+ *
+ * For an upper triangular matrix the loop runs over the blocks following
+ * the current one (from currM + bwidth up to M); otherwise it runs over
+ * the blocks preceding it (from 0 up to currM). In both cases the step
+ * is the block width. The 'triangPart' argument is currently ignored.
+ *
+ * The block width is converted to unsigned long explicitly so that the
+ * argument always matches the "%lu" conversion, including on platforms
+ * where size_t and unsigned long differ in width (e.g. 64-bit Windows).
+ */
+static void
+genInternalLoopCtl(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags,
+    bool triangPart)
+{
+    char tmp[1024];
+    const unsigned long step = (unsigned long)dim->bwidth;
+
+    (void)triangPart;
+
+    if (isMatrixUpper(kflags)) {
+        sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)",
+                step, step);
+    }
+    else {
+        sprintf(tmp, "for (k0 = 0; k0 < currM; k0 += %lu)",
+                step);
+    }
+
+    kgenBeginBranch(ctx, tmp);
+}
+
+/*
+ * Emit the initialization of 'currM', the row coordinate of the block
+ * processed first.
+ *
+ * For an upper triangular matrix processing starts from the last block,
+ * so currM is set to the start of the block containing row M - 1; for a
+ * lower triangular matrix it starts from 0.
+ *
+ * The block height is converted to unsigned long explicitly so that the
+ * argument always matches the "%lu" conversion, including on platforms
+ * where size_t and unsigned long differ in width (e.g. 64-bit Windows).
+ */
+static void
+genInitCurrM(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024];
+
+    if (isMatrixUpper(kflags)) {
+        const unsigned long blockY = (unsigned long)dim->y;
+
+        /* start from the last block */
+        sprintf(tmp, "currM = ((M - 1) / %lu) * %lu;\n", blockY, blockY);
+        kgenAddStmt(ctx, tmp);
+    }
+    else {
+        kgenAddStmt(ctx, "currM = 0;\n");
+    }
+}
+
+/*
+ * Fill in the names of the kernel variables used by the shared
+ * generator code: the matrices, the problem sizes, and the coordinate
+ * component for A and B, which is '.x' for column major access and '.y'
+ * otherwise.
+ */
+static void
+initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
+{
+    kvars->A = "A";
+    kvars->B = "B";
+
+    kvars->coordA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A) ?
+                    "coordA.x" : "coordA.y";
+    kvars->coordB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B) ?
+                    "coordB.x" : "coordB.y";
+
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "origM";
+}
+
+/*
+ * Generate the OpenCL source of the LDS based TRSM kernel.
+ *
+ * buf, buflen - destination buffer handed over to createKgenContext()
+ * subdims     - two-level subproblem decomposition
+ * pgran       - work group granularity; only wgDim == 1 is accepted
+ * extra       - CLBLASKernExtra providing the data type and kernel flags
+ *
+ * Returns the size of the generated source plus one on success,
+ * -EINVAL if the work group is not one-dimensional, -ENOMEM if the
+ * generator context cannot be created, and -EOVERFLOW if generating the
+ * block multiplication function fails or the last generator call
+ * reports an error.
+ *
+ * NOTE(review): the results of most kgen* calls are either not checked
+ * or overwritten in 'ret'; only the status of the final
+ * kgenAddBlankLine() call influences the returned value.
+ */
+static ssize_t
+generator(
+    char *buf,
+    size_t buflen,
+    const struct SubproblemDim *subdims,
+    const struct PGranularity *pgran,
+    void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    char tmp[1024];
+    char blkmul[FUNC_NAME_MAXLEN];
+    char updateResFn[FUNC_NAME_MAXLEN];
+    char updateResGenericFn[FUNC_NAME_MAXLEN];
+    char updateResFnRev[FUNC_NAME_MAXLEN];
+    char updateResGenericFnRev[FUNC_NAME_MAXLEN];
+    char copyPLFn[FUNC_NAME_MAXLEN];
+    char *s1 = "";
+    const char *typeName;
+    CopyBufFuncs copyFuncs;
+    ZeroFuncs zeroFuncs;
+    DataType dtype = kextra->dtype;
+    ssize_t ret;
+    BlasGenSettings gset;
+    BlkMulOpts mulOpts;
+    size_t pitchAB, pitchC;
+    bool b;
+    const char *outTypeName;
+    unsigned int nrRegs;
+    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+    int tra, trb;
+    unsigned int l1Pans;
+    // coordinate component names, indexed by the column major access flag
+    char vect[2] = {'y', 'x'};
+    UpdateResultFlags upFlags;
+
+    if (pgran->wgDim != 1) {
+        return -EINVAL;
+    }
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    // at first, generate needed declarations and auxiliary functions
+
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+
+    initKernelVarNames(&gset.varNames, kflags);
+
+    if (isComplexType(dtype)) {
+        genComplexMathOperators(ctx, dtype);
+    }
+
+    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset,
+                         BCHF_MATRIX_A | BCHF_MATRIX_B | BCHF_WRITE_OUTPUT);
+
+    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
+                         ZF_MATRIX_A | ZF_MATRIX_B);
+    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
+
+    // functions updating result
+    // for the final result
+    generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn,
+                       updateResGenericFn);
+    // for intermediate result after blocks modification
+    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
+    upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
+    genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
+                           updateResGenericFnRev);
+    // for heaping the result to LDS before multiplying by the inverted block
+    updateResultGenOld(ctx, &gset, UPRES_SET,
+                    UPRES_COLUMN_MAJOR | UPRES_USE_LDS, NULL);
+    kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx);
+    kgenAddBlankLine(ctx);
+
+    // block multiplication function
+    mulOpts.aMobj = CLMEM_BUFFER;
+    mulOpts.bMobj = CLMEM_BUFFER;
+    mulOpts.flags = BLKMUL_SKEW_COLUMN | BLKMUL_OUTPUT_PRIVATE;
+    mulOpts.core = BLKMUL_SEPARATE_MULADD;
+    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
+    if (ret) {
+        destroyKgenContext(ctx);
+
+        return -EOVERFLOW;
+    }
+
+    kgenAddBlankLine(ctx);
+    kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);
+
+    //matrix inversion function
+    genInvertingBlockFunc(ctx, subdims[0].bwidth, dtype, kflags);
+
+    typeName = dtypeBuiltinType(dtype);
+
+    // now, generate the kernel
+    declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, NULL, false,
+                      false);
+    ret = kgenBeginFuncBody(ctx);
+
+    /*
+     * Calculate local buffer pitches, and then insert the
+     * preparative code
+     */
+    pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
+    pitchC = matrBlockPitch(subdims, MATRIX_C, dtype, clblasLeft);
+    sprintf(tmp, trsmPrep1D, typeName, pitchAB * subdims[0].y,
+            typeName,
+            ((pitchC > pitchAB) ? pitchC : pitchAB) * subdims[0].y,
+            outTypeName, nrRegs, s1, pgran->wgSize[0]);
+    ret = kgenAddStmt(ctx, tmp);
+
+   /*
+    * Matrix B is divided into panels; each work group
+    * multiplies such a panel by the whole matrix A.
+    */
+
+    sprintf(tmp, "currN = gid * %lu;\n", subdims[0].x);
+    kgenAddStmt(ctx, tmp);
+
+    genInitCurrM(ctx, subdims, kflags);
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        kgenAddStmt(ctx, "A += offA;\n");
+    }
+    genTrxmBMatrShift(ctx, kflags, false);
+
+    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
+    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
+
+    // number of second level panels along X within a first level block
+    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
+
+    sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n"
+                 "coordB.%c = 0;\n\n",
+            vect[trb], l1Pans, subdims[1].x, vect[1 - trb]);
+    kgenAddStmt(ctx, tmp);
+
+    // loop over M
+    sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims->y);
+    kgenBeginBranch(ctx, tmp);
+
+    sprintf(tmp, "coordA.%c = currM + lid / %u * %lu;\n"
+                 "coordA.%c = 0;\n\n",
+            vect[tra], l1Pans, subdims[1].y, vect[1 - tra]);
+    kgenAddStmt(ctx, tmp);
+
+    genZeroResult(ctx, dtype, subdims);
+
+    genInternalLoopCtl(ctx, subdims, kflags, false);   // loop over K
+
+    genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
+                          kflags, 'A');
+    genPrepareRectBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
+                        kflags, 'C');
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    // multiplication for the step-by-step block updating
+    sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), \n"
+                 "    (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
+                 "    (%s*)c, lid %% %lu);\n",
+                blkmul, l1Pans, subdims[1].y, pitchAB,
+                l1Pans, subdims[1].x, pitchAB, outTypeName, subdims[1].y);
+    ret = kgenAddStmt(ctx, tmp);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    genInternalLoopEnd(ctx);                             // loop over K
+    kgenAddBlankLine(ctx);
+
+    // read the diagonal block at currM into tempC and clear tempA
+    kgenAddStmt(ctx, "k0 = currM;\n");
+    genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
+                          kflags, 'C');
+    genZeroBlockA(ctx, &zeroFuncs);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    if (kflags & KEXTRA_UNIT_DIAGONAL) {
+        // unit diagonal: overwrite the diagonal of tempC with ones
+        sprintf(tmp, "if (lid < %lu) {\n"
+                     "    tempC[lid * %lu + lid] = %s;\n"
+                     "}\n",
+                subdims[0].bwidth, pitchAB, strOne(dtype));
+        kgenAddStmt(ctx, tmp);
+        kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+        kgenAddBlankLine(ctx);
+    }
+
+    // call the generated 'invert' function with tempC and tempA
+    sprintf(tmp, "if (lid < %lu)", subdims[0].bwidth);
+    kgenBeginBranch(ctx, tmp);
+    sprintf(tmp, "invert(tempC, tempA, lid, (currM + %lu > M) ? "
+                         "M - currM : %lu);\n",
+            subdims[0].y, subdims[0].y);
+    kgenAddStmt(ctx, tmp);
+    kgenEndBranch(ctx, NULL);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenAddBlankLine(ctx);
+
+    genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev,
+                              updateResGenericFnRev, true);
+
+    genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC");
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    genZeroResult(ctx, dtype, subdims);
+
+    // multiplying by the inverted block
+    sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), \n"
+                 "    (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
+                 "    (%s*)c, lid %% %lu);\n\n",
+            blkmul, l1Pans, subdims[1].y, pitchAB,
+            l1Pans, subdims[1].x, pitchAB, outTypeName, subdims[1].y);
+    ret = kgenAddStmt(ctx, tmp);
+
+    // write back the tile evaluated
+    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
+    upFlags |= UPRES_EXCEED_PROBLEM_CONDITION;
+    genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn,
+                                updateResGenericFn, NULL);
+
+    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
+
+    // advance to the next block: backwards for an upper triangular matrix
+    if (isMatrixUpper(kflags)) {
+        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
+    }
+    else {
+        sprintf(tmp, "currM += %lu;\n", subdims[0].y);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    kgenEndBranch(ctx, NULL);                       // loop over M
+
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        // report the source size including one extra byte
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+ * Fill the kernel argument array.
+ *
+ * The first seven arguments are always set: M, N, alpha, A, lda, B, ldb.
+ * They are followed, in this order, by offsetM, offsetN, offA and offBX,
+ * each of which is present only if the matching KEXTRA_*_NOT_ZERO flag
+ * is set in the kernel extra information.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)params;
+    const KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags;
+    /* next free slot for the optional arguments */
+    KernelArg *opt = &args[7];
+
+    /* mandatory arguments */
+    initSizeKarg(&args[0], kargs->M);
+    initSizeKarg(&args[1], kargs->N);
+    assignScalarKarg(&args[2], &(kargs->alpha), kargs->dtype);
+    initMemobjKarg(&args[3], kargs->A, NULL, 0, 0);
+    initSizeKarg(&args[4], kargs->lda.matrix);
+    initMemobjKarg(&args[5], kargs->B, NULL, 0, 0);
+    initSizeKarg(&args[6], kargs->ldb.matrix);
+
+    /* optional offsets */
+    if (kflags & KEXTRA_STARTM_NOT_ZERO) {
+        initSizeKarg(opt++, kargs->offsetM);
+    }
+    if (kflags & KEXTRA_STARTN_NOT_ZERO) {
+        initSizeKarg(opt++, kargs->offsetN);
+    }
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        initSizeKarg(opt++, kargs->offA);
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        initSizeKarg(opt++, kargs->offBX);
+    }
+}
+
+/*
+ * Check whether the blocks required by the kernel fit into the local
+ * memory of 'ldsSize' bytes.
+ *
+ * The requirement is one block of matrix A, one block of matrix B and
+ * one more block as large as the bigger of the B and C blocks. Block
+ * sizes are counted in elements and converted to bytes with
+ * dtypeSize().
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
+    const cl_ulong blkA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
+    const cl_ulong blkB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
+    const cl_ulong blkC = matrBlockSize(dim, MATRIX_C, dtype, kargs->side);
+    /* buffer shared between a B block and a C block */
+    const cl_ulong shared = (blkB > blkC) ? blkB : blkC;
+    const cl_ulong total = shared + blkA + blkB;
+
+    return (total * dtypeSize(dtype) <= ldsSize);
+}
+
+/*
+ * Return the solver flags of this pattern:
+ * SF_WSPACE_1D combined with SF_TOP_INPUT_SQUARE_BLOCKS.
+ */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags flags = SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS;
+
+    return flags;
+}
+
+/*
+ * Initialize the memory pattern descriptor of the LDS based block TRSM:
+ * two levels, the solver callbacks from 'solverOps', and the extra
+ * information stating that both A and B are buffer objects placed at
+ * the LDS level.
+ */
+void
+initTrsmLdsPattern(MemoryPattern *mempat)
+{
+    /* pattern specific extra information, shared file-level object */
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_LDS;
+    mpatExtra.bMset = CLMEM_LEVEL_LDS;
+
+    mempat->name = "LDS based block trsm";
+    mempat->sops = &solverOps;
+    mempat->extra = &mpatExtra;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
diff --git a/src/library/blas/gens/legacy/trxm_common_legacy.c b/src/library/blas/gens/legacy/trxm_common_legacy.c
new file mode 100644
index 0000000..448dbf1
--- /dev/null
+++ b/src/library/blas/gens/legacy/trxm_common_legacy.c
@@ -0,0 +1,250 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+
+#include <matrix_props.h>
+#include <matrix_dims.h>
+
+#include "trxm_common_legacy.h"
+
+/*
+ * Emit the declarations of the local variables used by the LDS based
+ * TRXM kernels: the 'tempA' and 'tempB' local buffers, the loop counters
+ * and coordinates, either the local 'tempC' buffer (if 'useLocalC' is
+ * set) or the private 'c' array, and the 'lid' / 'gid' identifiers.
+ *
+ * Buffer sizes are computed in size_t and converted to unsigned long
+ * explicitly so that the arguments always match the "%lu" conversion,
+ * including on platforms where size_t and unsigned long differ in width
+ * (e.g. 64-bit Windows).
+ */
+void
+declareLdsBasedTrxmVariables(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const SubproblemDim *dims,
+    const PGranularity *pgran,
+    bool useLocalC)
+{
+    char tmp[1024];
+    size_t pitchAB, pitchC;
+    const char *inTypeName, *outTypeName;
+    unsigned int nrRegs;
+    unsigned int vecLen;
+
+    inTypeName = dtypeBuiltinType(dtype);
+    pitchAB = matrBlockPitch(dims, MATRIX_A, dtype, clblasLeft);
+    pitchC = matrBlockPitch(dims, MATRIX_C, dtype, clblasLeft);
+    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
+
+    sprintf(tmp, "__local %s tempA[%lu];\n"
+                 "__local %s tempB[%lu];\n"
+                 "uint m0, k0;\n"
+                 "uint currM, currN;\n"
+                 "uint2 coordA, coordB;\n"
+                 "uint x, y;\n",
+            inTypeName, (unsigned long)(pitchAB * dims->y), inTypeName,
+            (unsigned long)(pitchAB * dims->x));
+    kgenAddStmt(ctx, tmp);
+
+    getResultGPRsInfo(dtype, &dims[1], vecLen, &nrRegs, &outTypeName);
+    if (useLocalC) {
+        /* result block kept in local memory */
+        sprintf(tmp, "__local %s tempC[%lu];\n", inTypeName,
+                (unsigned long)(pitchC * dims->y));
+    }
+    else {
+        /* result tile kept in a private array */
+        sprintf(tmp, "%s c[%u];\n", outTypeName, nrRegs);
+    }
+
+    kgenAddStmt(ctx, tmp);
+    kgenDeclareLocalID(ctx, "lid", pgran);
+    kgenDeclareGroupID(ctx, "gid", pgran);
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Emit the code reading the current block of matrix A into 'tempA'.
+ *
+ * ctx       - generator context the statement is added to
+ * dim       - subproblem dimensions providing the block sizes
+ * dtype     - data type, used to evaluate the local buffer pitch
+ * copyFuncs - names of the generated read functions
+ * zeroFuncs - names of the generated zeroing functions
+ * flags     - kernel flags; KEXTRA_TAILS_M selects the checked read
+ * nameM     - name of the kernel variable bounding the 'k0' coordinate
+ *
+ * Block sizes and the pitch are converted to unsigned long explicitly so
+ * that the arguments always match the "%lu" conversions, including on
+ * platforms where size_t and unsigned long differ in width (e.g. 64-bit
+ * Windows).
+ */
+void
+genPrepareTrxmBlockA(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags flags,
+    const char *nameM)
+{
+    char tmp[1024];
+    size_t pitch;
+    const char *coordName[2] = {"currM", "k0"};
+    const char *sizeName[2] = {"y", "x"};
+    int tra;
+    unsigned long blockY, blockK;
+
+    pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
+    tra = isMatrixAccessColMaj(CLBLAS_TRMM, flags, MATRIX_A);
+    blockY = (unsigned long)dim->y;
+    blockK = (unsigned long)dim->bwidth;
+
+    /*
+     * If the (sub)problem is integrally divisible,
+     * skip any checks, and just read with optimal blocks,
+     * otherwise check for tails and then read with a
+     * fast function in the case of optimal blocks, and with
+     * the slow one in the case of tails respectively
+     */
+
+    if (!(flags & KEXTRA_TAILS_M)) {
+        sprintf(tmp, "%s((LPtr)tempA, (GPtr)A, %s, %s, lda);\n",
+                copyFuncs->read[MATRIX_A], coordName[tra], coordName[1 - tra]);
+    }
+    else {
+        sprintf(tmp,
+                "y = (currM + %lu <= M) ? %lu : M - currM;\n"
+                "x = (k0 + %lu <= %s) ? %lu : %s - k0;\n"
+                "if ((y == %lu) && (x == %lu)) {\n"
+                     // fast read
+                "    %s((LPtr)tempA, (GPtr)A, %s, %s, lda);\n"
+                "}\n"
+                "else {\n"
+                "    %s((__local float4*)tempA);\n"           // zeroing
+                "    barrier(CLK_LOCAL_MEM_FENCE);\n"
+                     // slow read
+                "    %s((LPtr)tempA, (GPtr)A, %s, %s, %s, %s, %lu, lda);\n"
+                "}\n\n",
+                blockY, blockY, blockK, nameM, blockK, nameM, blockY,
+                blockK, copyFuncs->read[MATRIX_A], coordName[tra],
+                coordName[1 - tra], zeroFuncs->names[MATRIX_A],
+                copyFuncs->readGeneric[MATRIX_A], coordName[tra],
+                coordName[1 - tra], sizeName[tra], sizeName[1 - tra],
+                (unsigned long)pitch);
+    }
+
+    kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Emit the code reading the current block of matrix B into 'tempB'.
+ *
+ * ctx       - generator context the statement is added to
+ * dim       - subproblem dimensions providing the block sizes
+ * dtype     - data type, used to evaluate the local buffer pitch
+ * copyFuncs - names of the generated read functions
+ * zeroFuncs - names of the generated zeroing functions
+ * flags     - kernel flags; the checked read is generated if either
+ *             KEXTRA_TAILS_N or KEXTRA_TAILS_K is set
+ *
+ * Block sizes and the pitch are converted to unsigned long explicitly so
+ * that the arguments always match the "%lu" conversions, including on
+ * platforms where size_t and unsigned long differ in width (e.g. 64-bit
+ * Windows).
+ */
+void
+genPrepareTrxmBlockB(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags flags)
+{
+    char tmp[1024];
+    size_t pitch;
+    const char *coordName[2] = {"currN", "k0"};
+    const char *sizeName[2] = {"y", "x"};
+    int trb;
+    unsigned long blockX, blockK;
+
+    trb = isMatrixAccessColMaj(CLBLAS_TRMM, flags, MATRIX_B);
+    pitch = matrBlockPitch(dim, MATRIX_B, dtype, clblasLeft);
+    blockX = (unsigned long)dim->x;
+    blockK = (unsigned long)dim->bwidth;
+
+    if (!(flags & (KEXTRA_TAILS_N | KEXTRA_TAILS_K))) {
+        /* no tails: read with the fast function unconditionally */
+        sprintf(tmp, "%s((LPtr)tempB, (GPtr)B, %s, %s, ldb);\n",
+                copyFuncs->read[MATRIX_B], coordName[trb],
+                coordName[1 - trb]);
+    }
+    else {
+        sprintf(tmp,
+                "y = (currN + %lu <= N) ? %lu : N - currN;\n"
+                "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
+                "if ((y == %lu) && (x == %lu)) {\n"
+                     // fast read
+                "    %s((LPtr)tempB, (GPtr)B, %s, %s, ldb);\n"
+                "}\n"
+                "else {\n"
+                "    %s((__local float4*)tempB);\n"           // zeroing
+                "    barrier(CLK_LOCAL_MEM_FENCE);\n"    // barrier if it's needed
+                             // slow read
+                "    %s((LPtr)tempB, (GPtr)B, %s, %s, %s, %s, %lu, ldb);\n"
+                "}\n\n",
+                blockX, blockX, blockK, blockK, blockX, blockK,
+                copyFuncs->read[MATRIX_B], coordName[trb], coordName[1 - trb],
+                zeroFuncs->names[MATRIX_B],
+                copyFuncs->readGeneric[MATRIX_B], coordName[trb],
+                coordName[1 - trb], sizeName[trb], sizeName[1 - trb],
+                (unsigned long)pitch);
+    }
+
+    kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Emit the code that turns the block of matrix A held in 'tempA' into a
+ * triangular one, followed by a local memory barrier.
+ *
+ * The generated code first evaluates the starting diagonal coordinates
+ * from 'k0' and 'currM'. For an upper triangular matrix everything below
+ * the diagonal is zeroed, for a lower triangular one everything above
+ * it. With KEXTRA_UNIT_DIAGONAL the diagonal elements are additionally
+ * set to one.
+ *
+ * Block sizes and the pitch are converted to unsigned long explicitly so
+ * that the arguments always match the "%lu" conversions, including on
+ * platforms where size_t and unsigned long differ in width (e.g. 64-bit
+ * Windows).
+ */
+void
+genTriangMatrBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024], tmp1[512];
+    const char *one;
+    size_t pitch;
+    unsigned long blockY, blockK, ldsPitch;
+
+    pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
+    one = strOne(dtype);
+    blockY = (unsigned long)dim->y;
+    blockK = (unsigned long)dim->bwidth;
+    ldsPitch = (unsigned long)pitch;
+
+    /* no unit diagonal code by default; needs no <string.h> function */
+    tmp1[0] = '\0';
+    // starting diagonal coordinates
+    kgenAddStmt(ctx, "y = (k0 < currM) ? 0 : (k0 - currM);\n"
+                     "x = (k0 < currM) ? (currM - k0) : 0;\n\n");
+
+    if (isMatrixUpper(kflags)) {
+        /*
+         * resulting block is upper diagonal, zeroing everything
+         * below the diagonal and set "1" on the diagonal for the
+         * unit diagonal matrix
+         */
+        if (kflags & KEXTRA_UNIT_DIAGONAL) {
+            sprintf(tmp1, "\n"
+                          "    if (x < %lu) {\n"
+                          "        tempA[lid * %lu + x] = %s;\n"
+                          "    }\n",
+                    blockK, ldsPitch, one);
+        }
+
+        sprintf(tmp, "if (lid >= y && lid < %lu) {\n"
+                     "    uint i;\n"
+                     "\n"
+                     "    x = x + lid - y;\n"
+                     "    x = (x > %lu) ? %lu : x;\n"
+                     "\n"
+                     "    for (i = 0; i < x; i++) {\n"
+                     "        tempA[lid * %lu + i] = 0;\n"
+                     "    }\n"
+                     "%s"
+                     "}\n",
+                blockY, blockK, blockK, ldsPitch, tmp1);
+    }
+    else {
+        /*
+         * resulting block is lower diagonal, zeroing everything
+         * above the diagonal and set "1" on the diagonal for the
+         * unit diagonal matrix
+         */
+        if (kflags & KEXTRA_UNIT_DIAGONAL) {
+            sprintf(tmp1, "\n"
+                          "    if (y < %lu) {\n"
+                          "        tempA[y * %lu + lid] = %s;\n"
+                          "    }\n",
+                    blockY, ldsPitch, one);
+        }
+
+        sprintf(tmp, "if (lid >= x && lid < %lu) {\n"
+                     "    uint i;\n"
+                     "\n"
+                     "    y = y + lid - x;\n"
+                     "    y = (y > %lu) ? %lu : y;\n"
+                     "\n"
+                     "    for (i = 0; i < y; i++) {\n"
+                     "        tempA[i * %lu + lid] = 0;\n"
+                     "    }\n"
+                     "%s"
+                     "}\n",
+                blockK, blockY, blockY, ldsPitch, tmp1);
+    }
+
+    kgenAddStmt(ctx, tmp);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    kgenAddBlankLine(ctx);
+}
diff --git a/src/library/blas/gens/legacy/trxm_common_legacy.h b/src/library/blas/gens/legacy/trxm_common_legacy.h
new file mode 100644
index 0000000..ead3831
--- /dev/null
+++ b/src/library/blas/gens/legacy/trxm_common_legacy.h
@@ -0,0 +1,88 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TRXM_COMMON_LEGACY_H_
+#define TRXM_COMMON_LEGACY_H_
+
+#include "../gen_helper.h"
+
+/*
+ * Declare local variables for the LDS based version
+ * of TRXM kernels.
+ *
+ * It provides the names typical for other generators as well:
+ *
+ * lid, gid - local ID and group ID
+ * m0, k0 - top level counters over M and K
+ * currM, currN - current block coordinates over M and N at the top level
+ * coordA, coordB - two-component coordinates in matrices A and B
+ * tempA, tempB - blocks of matrix A and B located in the local memory
+ * tempC - block of matrix C located in the local memory; declared if
+ *      the 'useLocalC' argument is set
+ * c - matrix C tile located in registers; declared if the 'useLocalC'
+ *      argument is not set
+ * x, y - auxiliary variables to evaluate size of read/write blocks
+ *
+ * TRXM specific variables:
+ *
+ * startM, endM - starting and end coordinate over rows a kernel can access
+ *
+ * NOTE(review): the implementation in trxm_common_legacy.c does not emit
+ * declarations of startM and endM; presumably they come from elsewhere
+ * (e.g. the kernel declaration) -- confirm.
+ */
+void
+declareLdsBasedTrxmVariables(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const SubproblemDim *dims,
+    const PGranularity *pgran,
+    bool useLocalC);
+
+/*
+ * NOTE: all of the following functions generate code
+ *       that uses the local variables declared with the
+ *       'declareLdsBasedTrxmVariables' function
+ */
+
+/*
+ * Generate the code reading the current block of matrix A into the
+ * 'tempA' local buffer. If the KEXTRA_TAILS_M flag is set, the generated
+ * code checks the block sizes and falls back to zeroing the buffer and
+ * reading with the generic function for incomplete blocks.
+ * 'nameM' is the name of the kernel variable bounding the 'k0' coordinate.
+ */
+void
+genPrepareTrxmBlockA(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags flags,
+    const char *nameM);
+
+/*
+ * Generate the code reading the current block of matrix B into the
+ * 'tempB' local buffer. If either the KEXTRA_TAILS_N or the
+ * KEXTRA_TAILS_K flag is set, the generated code checks the block sizes
+ * and falls back to zeroing the buffer and reading with the generic
+ * function for incomplete blocks.
+ */
+void
+genPrepareTrxmBlockB(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags flags);
+
+/*
+ * Triangulate the matrix block held in 'tempA'. The generated code
+ * derives the position of the diagonal from the current coordinates
+ * ('k0' and 'currM'), zeroes the elements on the other side of it and,
+ * for a unit diagonal matrix, sets the diagonal elements to one.
+ * A local memory barrier is generated after the triangulation.
+ */
+void
+genTriangMatrBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    KernelExtraFlags kflags);
+
+
+#endif /* TRXM_COMMON_LEGACY_H_ */
diff --git a/src/library/blas/gens/nrm2.cpp b/src/library/blas/gens/nrm2.cpp
new file mode 100644
index 0000000..832f5e4
--- /dev/null
+++ b/src/library/blas/gens/nrm2.cpp
@@ -0,0 +1,295 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * nrm2 generator
+ */
+//#define DEBUG_NRM2
+
+#define WORKGROUPS_PER_CU  32
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <nrm2.clT>
+#include <solution_seq.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Return the solver flags of the nrm2 pattern: only SF_WSPACE_1D.
+// With DEBUG_NRM2 defined, the call is traced on stdout.
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_NRM2
+    printf("solverFlags called...\n");
+#endif
+
+    return SF_WSPACE_1D;
+}
+
+//
+// Forward declarations of the solver callbacks defined later in this
+// file and referenced from the 'nrm2Ops' table.
+//
+
+// Evaluate the global work size for the kernel
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+// Produce the kernel source from the nrm2 template
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+// Store the work group size in the subproblem dimensions
+static void
+    fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+// Fill the kernel argument array
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+// Pattern initializer, exported with C linkage
+extern "C"
+void initNrm2RegisterPattern(MemoryPattern *mempat);
+
+// Choose the vectorization related kernel flags
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+// Append the kernel specific options to the build option string
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+// Callback table of the nrm2 pattern; it is attached to the memory
+// pattern in initNrm2RegisterPattern(). The initializer is positional,
+// a NULL entry means the pattern does not provide that callback.
+//
+// NOTE(review): slot names for the NULL entries are taken from the
+// legacy TRSM table, which has isFitToLDS in the third slot and labels
+// the slots after fixupArgs as getDefaultDecomp / getDecompList --
+// confirm against the SolverOps declaration.
+static SolverOps nrm2Ops = {
+    generator,
+    assignKargs,
+    NULL,   // presumably isFitToLDS
+    NULL,
+    NULL,
+    calcNrThreads,
+    NULL,
+    solverFlags,
+	fixupArgs,
+	NULL,   // presumably getDefaultDecomp
+	NULL,   // presumably getDecompList
+	setBuildOpts,
+	selectVectorization
+};
+
+// Choose the vectorization related kernel flags.
+//
+// Returns KEXTRA_NO_COPY_VEC_A when the vector offset (offBX) is not a
+// multiple of the vector length 'vlen', and KEXTRA_NO_FLAGS otherwise.
+static  KernelExtraFlags
+selectVectorization(
+	void *args,
+	unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (CLBlasKargs *)args;
+    const bool offsetAligned = ((blasArgs->offBX) % vlen) == 0;
+
+    if (offsetAligned) {
+        return KEXTRA_NO_FLAGS;
+    }
+    return KEXTRA_NO_COPY_VEC_A;
+}
+
+// Append the preprocessor definitions required by the nrm2 kernel
+// template to the build option string.
+//
+// buildOptStr - NUL-terminated option string, extended in place; the
+//               caller has to provide enough room for the appended text
+// args        - the SolutionStep whose kernel arguments (data type,
+//               reduction type, vector increment) select the options
+//
+// Every option is appended with a leading space so that it is always
+// separated from whatever already ends the string.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+	const SolutionStep *step = (const SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
+	{
+		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+	}
+    if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
+        strcat( buildOptStr, " -DCOMPLEX ");
+    }
+    if(kargs->redctnType == REDUCE_BY_HYPOT) {
+            // the leading space keeps this option from fusing with the
+            // preceding text when no option above has been appended
+            strcat( buildOptStr, " -DUSE_HYPOT ");
+    } else if(kargs->redctnType == REDUCE_BY_SSQ) {
+            strcat( buildOptStr, " -DUSE_SSQ ");
+    }
+
+    // a non-unit increment selects the strided access path
+    if( (kargs->ldb.vector) != 1) {
+        strcat( buildOptStr, " -DINCX_NONUNITY ");
+    }
+    // an increment below one is treated as invalid by the kernel
+    if( (kargs->ldb.vector) < 1) {
+        strcat( buildOptStr, " -DRETURN_ON_INVALID");
+    }
+	return;
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+// Initialize the memory pattern descriptor of the register accumulation
+// based nrm2: two levels, the callbacks from 'nrm2Ops', and extra
+// information placing both A and B in global memory at the L2 level.
+// The kernel name prefix table indexed by the data type is filled in
+// here as well. stdout is flushed on every call.
+extern "C"
+void initNrm2RegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_NRM2
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+
+    fflush(stdout);
+
+    // kernel name prefix per data type
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    // pattern specific extra information
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+
+    mempat->name = "Register accumulation based Nrm2";
+    mempat->sops = &nrm2Ops;
+    mempat->extra = &mpatExtra;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+// Evaluate the global work size of the nrm2 kernel.
+//
+// Enough work groups are spawned to cover N elements, with every work
+// item handling 'vecLenA' of them, but no more than WORKGROUPS_PER_CU
+// groups per compute unit of the target device. If the number of compute
+// units cannot be queried, one unit is assumed.
+//
+// threads[0] receives the number of work items, threads[1] is set to 1.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBLASKernExtra *kextra = ( CLBLASKernExtra *)_extra;
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    SolutionStep *step = container_of(kargs, args, SolutionStep);
+
+    cl_int status;
+    unsigned int nrCUs = deviceComputeUnits( (step->device.id), &status );
+    if (status != CL_SUCCESS) {
+        nrCUs = 1;
+    }
+
+    const unsigned int vecLen = kextra->vecLenA;
+    const unsigned int wgItems = pgran->wgSize[0] * pgran->wgSize[1];
+    // elements covered by a single work group
+    const unsigned int elemsPerWg = wgItems * vecLen;
+    const unsigned int wgLimit = nrCUs * WORKGROUPS_PER_CU;
+
+    unsigned int wgToSpawn = ((kargs->N + elemsPerWg - 1) / elemsPerWg);
+    if (wgToSpawn >= wgLimit) {
+        wgToSpawn = wgLimit;
+    }
+
+    threads[0] = wgToSpawn * wgItems;
+    threads[1] = 1;
+}
+
+//
+// Produce the nrm2 kernel source from the kprintf template.
+//
+// When 'buf' is NULL only the required buffer size (32 KB) is returned.
+// Otherwise the template matching the reduction type of the enclosing
+// solution step is expanded into 'buf' and the same fixed size is
+// returned. An unknown reduction type is reported on stdout and 0 is
+// returned.
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    size_t wgSize = pgran->wgSize[0];
+    char tmplCopy[32*1024];
+    SolutionStep *step = container_of(subdims, subdims, SolutionStep);
+
+    if (buf == NULL) {
+        // size query only
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    unsigned int vecLenA = kextra->vecLenA;
+    // the same flag is passed to kprintf twice below
+    bool useVload = (kextra->flags & KEXTRA_NO_COPY_VEC_A) ? true : false;
+
+    const char *kernTemplate;
+    switch (step->args.redctnType) {
+    case REDUCE_BY_HYPOT:
+        kernTemplate = nrm2_hypot_kernel;
+        break;
+    case REDUCE_BY_SSQ:
+        kernTemplate = nrm2_ssq_kernel;
+        break;
+    default:
+        printf(" Error in selecting kernel!\n");
+        return 0;
+    }
+
+    // kprintf works on a private, writable copy of the template
+    strcpy( tmplCopy, kernTemplate );
+    kprintf kobj( Prefix[kextra->dtype], vecLenA, useVload, useVload, wgSize);
+    kobj.spit((char*)buf, tmplCopy);
+
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+ * Fill the kernel argument array. The arguments are set in this order:
+ *
+ *   0 - memory object B
+ *   1 - memory object D
+ *   2 - N
+ *   3 - offBX
+ *   4 - incx, taken from ldb.vector and passed as cl_int
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    cl_int incx = kargs->ldb.vector;
+
+    // memory objects
+    INIT_KARG(&args[0], kargs->B);
+    INIT_KARG(&args[1], kargs->D);
+
+    // size, offset and increment
+    initSizeKarg(&args[2], kargs->N);
+    initSizeKarg(&args[3], kargs->offBX);
+    INIT_KARG(&args[4], incx);
+}
+
+/** Record the work-group size in the subproblem dimensions.
+
+    The reduction loop is unrolled in kprintf based on the work-group size,
+    so a different kernel has to be generated whenever that size changes.
+    The 'bwidth' member of SubproblemDim is used to carry the work-group
+    size of the current kernel; since SubproblemDim is a member of the
+    kernel key, the kernel cache then distinguishes kernels by it.
+**/
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)args;
+    SolutionStep *owner = container_of(blasArgs, args, SolutionStep);
+
+    DUMMY_ARG_USAGE(extra);
+
+    subdims->bwidth = (owner->pgran.wgSize[0]) * (owner->pgran.wgSize[1]);
+}
+
diff --git a/src/library/blas/gens/reduction.cpp b/src/library/blas/gens/reduction.cpp
new file mode 100644
index 0000000..1c81c0b
--- /dev/null
+++ b/src/library/blas/gens/reduction.cpp
@@ -0,0 +1,311 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * reduction generator
+ */
+//#define DEBUG_REDUCTION
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <reduction.clT>
+#include <solution_seq.h>
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Reports the solver flags for this pattern: always SF_WSPACE_1D.
+static SolverFlags solverFlags(void)
+{
+#ifdef DEBUG_REDUCTION
+    printf("solverFlags called...\n");
+#endif
+    const SolverFlags flags = SF_WSPACE_1D;
+    return flags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static void
+    fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initReductionRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static SolverOps reductionOps = {
+    generator,            // emits the reduction kernel source
+    assignKargs,          // fills the kernel argument array
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,        // computes the global work size
+    NULL,
+    solverFlags,          // returns SF_WSPACE_1D
+	fixupArgs,            // stores the work-group size in subdims->bwidth
+	NULL,
+	NULL,
+	setBuildOpts,         // appends -D build options
+	selectVectorization   // KEXTRA_NO_COPY_VEC_A when offsets/N are not multiples of vlen
+};
+
+// Returns KEXTRA_NO_COPY_VEC_A when offBX or offCY is not a multiple of vlen,
+// or when an SSQ reduction has N not a multiple of vlen; else KEXTRA_NO_FLAGS.
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+    const bool offsetsMisaligned =
+        ((blasArgs->offBX % vlen) != 0) || ((blasArgs->offCY % vlen) != 0);
+
+    // Since ssq will be vector-loaded from scratchBuff[N], N itself must be
+    // a multiple of vlen as well; otherwise vload has to be used.
+    const bool ssqMisaligned =
+        (blasArgs->redctnType == REDUCE_BY_SSQ) && ((blasArgs->N % vlen) != 0);
+
+    if (offsetsMisaligned || ssqMisaligned) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    return KEXTRA_NO_FLAGS;
+}
+
+/*
+ * Appends the -D options needed by the reduction kernel to buildOptStr:
+ * -DDOUBLE_PRECISION for double / complex-double data, plus one -DREDUCE_*
+ * option chosen by kargs->redctnType. 'args' is treated as a SolutionStep.
+ * Every option starts with a space so it cannot fuse with text already in
+ * buildOptStr (all sibling setBuildOpts implementations do the same).
+ * NOTE(review): strcat is unbounded; buildOptStr's capacity is not visible
+ * here and is assumed to be large enough - confirm against the caller.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *step = (const SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+    const char *reduceOpt = NULL;
+
+    if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+    }
+
+    // Map the reduction type onto its preprocessor switch.
+    switch(kargs->redctnType)
+    {
+        case REDUCE_BY_SUM:                 reduceOpt = " -DREDUCE_BY_SUM ";                 break;
+        case REDUCE_BY_MAX:                 reduceOpt = " -DREDUCE_BY_MAX ";                 break;
+        case REDUCE_BY_MIN:                 reduceOpt = " -DREDUCE_BY_MIN ";                 break;
+        case REDUCE_MAX_WITH_INDEX:         reduceOpt = " -DREDUCE_MAX_WITH_INDEX ";         break;
+        case REDUCE_BY_HYPOT:               reduceOpt = " -DREDUCE_BY_HYPOT ";               break;
+        case REDUCE_BY_SSQ:                 reduceOpt = " -DREDUCE_BY_SSQ ";                 break;
+        case REDUCE_MAX_WITH_INDEX_ATOMICS: reduceOpt = " -DREDUCE_MAX_WITH_INDEX_ATOMICS "; break;
+        default:                            printf("Invalid reduction type!!\n");            break;
+    }
+
+    // Unknown types were reported above and add nothing.
+    if (reduceOpt != NULL) {
+        strcat( buildOptStr, reduceOpt);
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+extern "C"
+void initReductionRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_REDUCTION
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+    // One prefix letter per data type; generator() reads this table.
+    Prefix[TYPE_FLOAT]          = 'S';
+    Prefix[TYPE_DOUBLE]         = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT]  = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    // NOTE(review): name mentions "swap" though this is the reduction pattern.
+    mempat->name     = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel  = 0;
+    mempat->thLevel  = 1;
+    mempat->sops     = &reductionOps;
+    mempat->extra    = &mpatExtra;
+}
+
+// Global work size for the reduction kernel: a single work-group of
+// wgSize[0] * wgSize[1] work-items in dimension 0 and 1 in dimension 1.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARGS_USAGE_3(subdims, args, _extra);
+    const size_t blockSize = (size_t)pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    const size_t blocks = 1;    // Reduction will use only 1 block
+    threads[0] = blocks * blockSize;
+    threads[1] = 1;
+#ifdef DEBUG_REDUCTION
+    // Cast to unsigned long so the arguments match the %lu conversions.
+    printf("blocks : %lu\n", (unsigned long)blocks);
+    printf("pgran-wgSize[0] : %lu, globalthreads[0]  : %lu\n",
+           (unsigned long)pgran->wgSize[0], (unsigned long)threads[0]);
+#endif
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Emits the reduction kernel source selected by the step's redctnType.
+ * buf == NULL is a size query. Returns the fixed 32 KiB size, or 0 when
+ * redctnType has no kernel template here (mirrors the nrm2 generator).
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    size_t BLOCKSIZE  = pgran->wgSize[0];
+    char tempTemplate[32*1024];
+
+    if ( buf == NULL) // return buffer size
+    {
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+    SolutionStep *step = container_of( pgran , pgran, SolutionStep);
+    CLBlasKargs* kargs = (CLBlasKargs*) &(step->args);
+    char const *kernName = NULL;
+
+    switch (kargs->redctnType) {
+        case REDUCE_BY_SUM:         kernName = red_sum_kernel;        break;
+        case REDUCE_BY_MAX:         kernName = red_max_kernel;        break;
+        case REDUCE_BY_MIN:         kernName = red_min_kernel;        break;
+        case REDUCE_MAX_WITH_INDEX: kernName = red_with_index_kernel; break;
+        case REDUCE_BY_HYPOT:       kernName = red_hypot_kernel;      break;
+        case REDUCE_BY_SSQ:         kernName = red_ssq_kernel;        break;
+        default:
+            // Types not mapped to a template here must not reach strcpy().
+            printf(" Error in selecting kernel!\n");
+            return 0;
+    }
+
+    unsigned int vecLenA = extraFlags->vecLenA;  // declared before the debug block uses it
+    bool doVLOAD = ((extraFlags->flags & KEXTRA_NO_COPY_VEC_A) != 0);
+
+#ifdef DEBUG_REDUCTION
+    printf("REDUCTION GENERATOR called....\n");
+    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+    printf("Vector length used : %u\n\n", vecLenA);
+#endif
+    strcpy( tempTemplate, kernName );
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE);
+    kobj.spit((char*)buf, tempTemplate);
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+ __kernel void %PREFIXred_sum_kernel( __global %TYPE *_X, __global %TYPE *_res,
+                                                      uint N, uint offx, uint offRes )
+*/
+// Binds args[0..4] in order: D, A, N, a zero offset, then offA.
+static void
+assignKargs(KernelArg *args, const void *params, const void* _extra)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    size_t zeroOffset = 0;      // args[3] is always passed as 0
+
+    DUMMY_ARG_USAGE(_extra);
+
+    INIT_KARG(&args[0], kargs->D);
+    INIT_KARG(&args[1], kargs->A);
+    initSizeKarg(&args[2], kargs->N);
+    initSizeKarg(&args[3], zeroOffset);
+    initSizeKarg(&args[4], kargs->offA);
+}
+
+/** The purpose of this function is to add an work-group size indicator in
+    kernelKey, so that a different kernel is generated when work-group size is changed.
+    Reduction loop is unrolled in kprintf based on work-group size.
+
+    Member of SubproblemDim- bwidth, will be used to store work-group size of the current kernel
+    this will become a kernelKey, and kernel cache will be accordingly managed.
+    Note -- SubproblemDim is a member of kernelKey
+**/
+static void fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    DUMMY_ARG_USAGE(extra);
+    // Walk back from the kernel args to the SolutionStep that embeds them.
+    CLBlasKargs *embedded = (CLBlasKargs*)args;
+    SolutionStep *owner = container_of(embedded, args, SolutionStep);
+    const PGranularity *gran = &(owner->pgran);
+    subdims->bwidth = gran->wgSize[0] * gran->wgSize[1];
+}
diff --git a/src/library/blas/gens/rotg_reg.cpp b/src/library/blas/gens/rotg_reg.cpp
new file mode 100644
index 0000000..0ec1eb0
--- /dev/null
+++ b/src/library/blas/gens/rotg_reg.cpp
@@ -0,0 +1,216 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * rotg generator
+ */
+//#define DEBUG_ROTG
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <rotg.clT>
+#include <solution_seq.h>
+#include "blas_subgroup.h"
+#include "gen_helper.h"
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Solver flags for the rotg pattern: always SF_WSPACE_1D.
+static SolverFlags solverFlags(void)
+{
+    return SF_WSPACE_1D;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initRotgRegisterPattern(MemoryPattern *mempat);
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static SolverOps rotgOps = {
+    generator,          // emits the rotg kernel source
+    assignKargs,        // fills the kernel argument array
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,      // computes the global work size
+    NULL,
+    solverFlags,        // returns SF_WSPACE_1D
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,       // appends -D build options
+	NULL
+};
+
+// Adds -DDOUBLE_PRECISION (double-based dtype) and -DCOMPLEX (complex dtype).
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&((const SolutionStep *)args)->args);
+    const bool isDouble  = (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE);
+    const bool isComplex = (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE);
+    if (isDouble) {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+    }
+    if (isComplex) {
+        strcat( buildOptStr, " -DCOMPLEX ");
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+extern "C"
+void initRotgRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_ROTG
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+    // One prefix letter per data type; generator() reads this table.
+    Prefix[TYPE_FLOAT]          = 'S';
+    Prefix[TYPE_DOUBLE]         = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT]  = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    // NOTE(review): name mentions "swap" though this is the rotg pattern.
+    mempat->name     = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel  = 0;
+    mempat->thLevel  = 1;
+    mempat->sops     = &rotgOps;
+    mempat->extra    = &mpatExtra;
+}
+
+// Global work size for rotg: one work-group of wgSize[0] * wgSize[1]
+// work-items in dimension 0, and a single item in dimension 1.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARGS_USAGE_3(subdims, _extra, args);
+    const size_t blockSize = (size_t)pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    const size_t blocks = 1;  // Only 1 work-group is enough
+    threads[0] = blocks * blockSize;
+    threads[1] = 1;
+#ifdef DEBUG_ROTG
+    // Cast to unsigned long so the arguments match the %lu conversions.
+    printf("blocks : %lu\n", (unsigned long)blocks);
+    printf("pgran-wgSize[0] : %lu, globalthreads[0]  : %lu\n",
+           (unsigned long)pgran->wgSize[0], (unsigned long)threads[0]);
+#endif
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+// Writes the rotg kernel source, expanded by kprintf for the data type in
+// 'extra', into buf. A NULL buf is a size query. Always reports 32 KiB.
+// NOTE(review): buflen is never compared against the generated size.
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARGS_USAGE_2(subdims, pgran);
+
+    if (NULL == buf) {              // size query only
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    char typePrefix = Prefix[kextra->dtype];
+#ifdef DEBUG_ROTG
+    printf("dataType : %c\n", typePrefix);
+#endif
+    // Fixed-size stack copy of the template that is handed to spit().
+    char tempTemplate[32*1024];
+    strcpy( tempTemplate, (char*)rotg_kernel );
+    kprintf kobj( typePrefix, 1, false, false);
+    kobj.spit((char*)buf, tempTemplate);
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+__kernel void %PREFIXrotg_kernel( __global %TYPE *_A, __global %TYPE *_B, __global %PTYPE *_C,
+                                __global %TYPE *_S, uint offa, uint offb, uint offc, uint offs )
+
+*/
+// Kernel argument order: A, B, C, D followed by offa, offb, offc, offd.
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    // args[0..3]: the four members passed through INIT_KARG
+    INIT_KARG(&args[0], kargs->A);
+    INIT_KARG(&args[1], kargs->B);
+    INIT_KARG(&args[2], kargs->C);
+    INIT_KARG(&args[3], kargs->D);
+    // args[4..7]: the matching offsets, passed through initSizeKarg
+    initSizeKarg(&args[4], kargs->offa);
+    initSizeKarg(&args[5], kargs->offb);
+    initSizeKarg(&args[6], kargs->offc);
+    initSizeKarg(&args[7], kargs->offd);
+}
diff --git a/src/library/blas/gens/rotm_reg.cpp b/src/library/blas/gens/rotm_reg.cpp
new file mode 100644
index 0000000..2b04419
--- /dev/null
+++ b/src/library/blas/gens/rotm_reg.cpp
@@ -0,0 +1,291 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * rotm generator
+ */
+//#define DEBUG_ROTM
+
+#define WORKGROUPS_PER_CU  32
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <rotm.clT>
+#include <solution_seq.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Solver flags for the rotm pattern: always SF_WSPACE_1D.
+static SolverFlags solverFlags(void)
+{
+    return SF_WSPACE_1D;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initRotmRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static SolverOps rotmOps = {
+    generator,          // emits the rotm kernel source
+    assignKargs,        // fills the kernel argument array
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,      // computes the global work size
+    NULL,
+    solverFlags,        // returns SF_WSPACE_1D
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,       // appends -D build options
+	selectVectorization // KEXTRA_NO_COPY_VEC_A when offBX/offCY are not multiples of vlen
+};
+
+// KEXTRA_NO_COPY_VEC_A if offBX or offCY is not a multiple of vlen,
+// KEXTRA_NO_FLAGS otherwise.
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+    const bool xAligned = ((blasArgs->offBX % vlen) == 0);
+
+    // offCY is only examined when offBX passed, as with a short-circuit ||.
+    const bool bothAligned = xAligned && ((blasArgs->offCY % vlen) == 0);
+    return bothAligned ? KEXTRA_NO_FLAGS : KEXTRA_NO_COPY_VEC_A;
+}
+
+// Appends to buildOptStr, in order and each only when it applies:
+// -DDOUBLE_PRECISION (double / complex-double dtype), -DDO_ROT (pigFuncID
+// is CLBLAS_ROT), -DINCX_NONUNITY (ldb.vector != 1) and -DINCY_NONUNITY
+// (ldc.vector != 1). 'args' is interpreted as a SolutionStep.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&((const SolutionStep *)args)->args);
+    const bool wantDouble = (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE);
+    if (wantDouble) {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+    }
+    if (kargs->pigFuncID == CLBLAS_ROT) {
+        strcat( buildOptStr, " -DDO_ROT ");
+    }
+    if (kargs->ldb.vector != 1) {
+        strcat( buildOptStr, " -DINCX_NONUNITY ");
+    }
+    if (kargs->ldc.vector != 1) {
+        strcat( buildOptStr, " -DINCY_NONUNITY ");
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+extern "C"
+void initRotmRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_ROTM
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+    // One prefix letter per data type; generator() reads this table.
+    Prefix[TYPE_FLOAT]          = 'S';
+    Prefix[TYPE_DOUBLE]         = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT]  = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    // NOTE(review): name mentions "swap" though this is the rotm pattern.
+    mempat->name     = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel  = 0;
+    mempat->thLevel  = 1;
+    mempat->sops     = &rotmOps;
+    mempat->extra    = &mpatExtra;
+}
+
+/*
+ * Global work size: enough work-groups to cover N elements at blockSize *
+ * vecLenA elements each, capped at WORKGROUPS_PER_CU per compute unit.
+ * The count stays in size_t until after the cap so a large (or zero) N
+ * cannot wrap or truncate into an empty launch.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra;
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    SolutionStep *step = container_of(kargs, args, SolutionStep);
+    cl_int err;
+    unsigned int numComputeUnits = deviceComputeUnits( (step->device.id), &err );
+    if(err != CL_SUCCESS) { numComputeUnits = 1; }  // query failed: assume one CU
+
+    const size_t blockSize = (size_t)pgran->wgSize[0] * pgran->wgSize[1];
+    const size_t perGroup = blockSize * extra->vecLenA;
+    const size_t wgNeeded = (kargs->N == 0) ? 1 : ((kargs->N - 1) / perGroup) + 1;
+    const size_t wgLimit = (size_t)numComputeUnits * WORKGROUPS_PER_CU;
+    threads[0] = min( wgNeeded, wgLimit ) * blockSize;
+    threads[1] = 1;
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Produces the rotm kernel source. With buf == NULL only the (fixed)
+ * buffer size is reported; otherwise the rotm_kernel template is expanded
+ * by kprintf using the data-type prefix, vector length and load mode
+ * taken from 'extra', and the result is written to buf.
+ * The return value is always 32 KiB.
+ * NOTE(review): buflen is never compared against the generated size.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARGS_USAGE_2(pgran, subdims);
+
+    if (NULL == buf)                // size query only
+    {
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    char typePrefix = Prefix[kextra->dtype];
+    unsigned int vecLenA = kextra->vecLenA;
+    // KEXTRA_NO_COPY_VEC_A in the flags sets both doVLOAD arguments of kprintf.
+    bool doVLOAD = ((kextra->flags & KEXTRA_NO_COPY_VEC_A) != 0);
+
+#ifdef DEBUG_ROTM
+    printf("dataType : %c\n", typePrefix);
+    printf("Vector length used : %d\n\n", vecLenA);
+    if (doVLOAD) {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    } else {
+        printf("Using Aligned Data Pointer .......\n");
+    }
+#endif
+
+    // Fixed-size stack copy of the template that is handed to spit().
+    char tempTemplate[32*1024];
+    strcpy( tempTemplate, (char*)rotm_kernel );
+    kprintf kobj( typePrefix, vecLenA, doVLOAD, doVLOAD);
+    kobj.spit((char*)buf, tempTemplate);
+
+    // A fixed size is reported, not the generated length (see FIXME above).
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+__kernel void %PREFIXrotm_kernel( __global %TYPE *_X, __global %TYPE *_Y, uint N,
+                                uint offx, int incx, uint offy, int incy
+#ifndef DO_ROT
+                                , __global %TYPE *_param, uint offParam             // Rotm parameters
+#else
+                                , %PTYPE C,  %PTYPE S                               // Rot parameters
+#endif
+*/
+// Sets args[0..6] (A, B, N, offBX, incx, offCY, incy) and then args[7..8]:
+// for CLBLAS_ROT the alpha/beta scalars using the real type matching dtype,
+// for CLBLAS_ROTM the D member and offd. Other function IDs get no args[7..8].
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    cl_int xStride = kargs->ldb.vector;
+    cl_int yStride = kargs->ldc.vector;
+
+    INIT_KARG(&args[0], kargs->A);
+    INIT_KARG(&args[1], kargs->B);
+    initSizeKarg(&args[2], kargs->N);
+    initSizeKarg(&args[3], kargs->offBX);
+    INIT_KARG(&args[4], xStride);
+    initSizeKarg(&args[5], kargs->offCY);
+    INIT_KARG(&args[6], yStride);
+
+    if (kargs->pigFuncID == CLBLAS_ROT) {
+        // Complex data types are mapped to their real counterparts.
+        DataType scalarType = kargs->dtype;
+        if (scalarType == TYPE_COMPLEX_FLOAT)       { scalarType = TYPE_FLOAT; }
+        else if (scalarType == TYPE_COMPLEX_DOUBLE) { scalarType = TYPE_DOUBLE; }
+        assignScalarKarg(&args[7], &(kargs->alpha), scalarType);
+        assignScalarKarg(&args[8], &(kargs->beta), scalarType);
+    }
+    else if (kargs->pigFuncID == CLBLAS_ROTM) {
+        INIT_KARG(&args[7], kargs->D);
+        initSizeKarg(&args[8], kargs->offd);
+    }
+}
diff --git a/src/library/blas/gens/rotmg_reg.cpp b/src/library/blas/gens/rotmg_reg.cpp
new file mode 100644
index 0000000..b256ac6
--- /dev/null
+++ b/src/library/blas/gens/rotmg_reg.cpp
@@ -0,0 +1,215 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * rotmg generator
+ */
+//#define DEBUG_ROTMG
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <rotmg.clT>
+#include <solution_seq.h>
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Solver flags for the rotmg pattern: always SF_WSPACE_1D.
+static SolverFlags solverFlags(void)
+{
+    return SF_WSPACE_1D;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initRotmgRegisterPattern(MemoryPattern *mempat);
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static SolverOps rotmgOps = {
+    generator,          // emits the rotmg kernel source
+    assignKargs,        // fills the kernel argument array
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,      // computes the global work size
+    NULL,
+    solverFlags,        // returns SF_WSPACE_1D
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,       // appends -D build options
+	NULL
+};
+
+// Appends " -DDOUBLE_PRECISION " to buildOptStr for double / complex-double
+// data; 'args' is interpreted as a SolutionStep.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&((const SolutionStep *)args)->args);
+    const bool notDouble =
+        (kargs->dtype != TYPE_DOUBLE) && (kargs->dtype != TYPE_COMPLEX_DOUBLE);
+
+    if (notDouble) { return; }          // nothing to add
+    strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+extern "C"
+void initRotmgRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_ROTMG
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+    // One prefix letter per data type; generator() reads this table.
+    Prefix[TYPE_FLOAT]          = 'S';
+    Prefix[TYPE_DOUBLE]         = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT]  = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+    // NOTE(review): name mentions "swap" though this is the rotmg pattern.
+    mempat->name     = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel  = 0;
+    mempat->thLevel  = 1;
+    mempat->sops     = &rotmgOps;
+    mempat->extra    = &mpatExtra;
+}
+
+// Global work size for rotmg: one work-group of wgSize[0] * wgSize[1]
+// work-items in dimension 0, and a single item in dimension 1.
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARGS_USAGE_3(subdims, _extra, args);
+    const size_t blockSize = (size_t)pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    const size_t blocks = 1;  // Only 1 work-group is enough
+    threads[0] = blocks * blockSize;
+    threads[1] = 1;
+#ifdef DEBUG_ROTMG
+    // Cast to unsigned long so the arguments match the %lu conversions.
+    printf("blocks : %lu\n", (unsigned long)blocks);
+    printf("pgran-wgSize[0] : %lu, globalthreads[0]  : %lu\n",
+           (unsigned long)pgran->wgSize[0], (unsigned long)threads[0]);
+#endif
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+// Writes the rotmg kernel source, expanded by kprintf for the data type in
+// 'extra', into buf. A NULL buf is a size query. Always reports 32 KiB.
+// NOTE(review): buflen is never compared against the generated size.
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARGS_USAGE_2(subdims, pgran);
+
+    if (NULL == buf) {              // size query only
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    char typePrefix = Prefix[kextra->dtype];
+#ifdef DEBUG_ROTMG
+    printf("dataType : %c\n", typePrefix);
+#endif
+    // Fixed-size stack copy of the template that is handed to spit().
+    char tempTemplate[32*1024];
+    strcpy( tempTemplate, (char*)rotmg_kernel );
+    kprintf kobj( typePrefix, 1, false, false);
+    kobj.spit((char*)buf, tempTemplate);
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+__kernel void %PREFIXrotmg_kernel( __global %TYPE *_D1, __global %TYPE *_D2, __global %TYPE *_X1,
+                                __global %TYPE *_Y1, __global %TYPE *_param,
+                                uint offD1, uint offD2, uint offX1, uint offY1, uint offParam )
+
+*/
+// Kernel argument order: A, B, C, D, E followed by offa, offb, offc, offd, offe.
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    // args[0..4]: the five members passed through INIT_KARG
+    INIT_KARG(&args[0], kargs->A);
+    INIT_KARG(&args[1], kargs->B);
+    INIT_KARG(&args[2], kargs->C);
+    INIT_KARG(&args[3], kargs->D);
+    INIT_KARG(&args[4], kargs->E);
+    // args[5..9]: the matching offsets, passed through initSizeKarg
+    initSizeKarg(&args[5], kargs->offa);
+    initSizeKarg(&args[6], kargs->offb);
+    initSizeKarg(&args[7], kargs->offc);
+    initSizeKarg(&args[8], kargs->offd);
+    initSizeKarg(&args[9], kargs->offe);
+}
diff --git a/src/library/blas/gens/scal_reg.cpp b/src/library/blas/gens/scal_reg.cpp
new file mode 100644
index 0000000..d82362b
--- /dev/null
+++ b/src/library/blas/gens/scal_reg.cpp
@@ -0,0 +1,268 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * SCAL generator
+ */
+//#define DEBUG_SCAL
+
+#define WORKGROUPS_PER_CU  32
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <scal.clT>
+#include <solution_seq.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Reports the solver flags for the SCAL pattern: always SF_WSPACE_1D.
+static SolverFlags solverFlags(void)
+{
+#ifdef DEBUG_SCAL
+    printf("solverFlags called......\n");
+#endif
+    const SolverFlags flags = SF_WSPACE_1D;
+    return flags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initScalRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static SolverOps SCALOps = {
+    generator,          // emits the scal kernel source
+    assignKargs,        // fills the kernel argument array
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,      // computes the global work size
+    NULL,
+    solverFlags,        // returns SF_WSPACE_1D
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,       // appends -D build options
+	selectVectorization // KEXTRA_NO_COPY_VEC_A when offBX is not a multiple of vlen
+};
+
+// KEXTRA_NO_COPY_VEC_A when offBX is not a multiple of vlen, otherwise
+// KEXTRA_NO_FLAGS. vlen must be non-zero (it is used as a divisor).
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+
+    if ((blasArgs->offBX % vlen) == 0) {
+        return KEXTRA_NO_FLAGS;         // offset is a multiple of vlen
+    }
+    return KEXTRA_NO_COPY_VEC_A;
+}
+
+// Appends " -DDOUBLE_PRECISION " for double / complex-double data and
+// " -DINCX_NONUNITY " when ldb.vector differs from 1, in that order.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&((const SolutionStep *)args)->args);
+    const bool isDouble = (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE);
+    const bool unitStride = (kargs->ldb.vector == 1);
+    if (isDouble) {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_SCAL
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+    if (!unitStride) {
+        strcat( buildOptStr, " -DINCX_NONUNITY ");
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+extern "C"
+void initScalRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_SCAL
+    printf("initRegPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+    // One prefix letter per data type; generator() reads this table.
+    Prefix[TYPE_FLOAT]          = 'S';
+    Prefix[TYPE_DOUBLE]         = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT]  = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    // mobjB is not assigned by this function.
+    mempat->name     = "Register accumulation based SCAL";
+    mempat->nrLevels = 2;
+    mempat->cuLevel  = 0;
+    mempat->thLevel  = 1;
+    mempat->sops     = &SCALOps;
+    mempat->extra    = &mpatExtra;
+}
+
+/*
+ * Global work size: enough work-groups to cover N elements at blockSize *
+ * vecLenA elements each, capped at WORKGROUPS_PER_CU per compute unit.
+ * The count stays in size_t until after the cap so a large (or zero) N
+ * cannot wrap or truncate into an empty launch.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra;
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    SolutionStep *step = container_of(kargs, args, SolutionStep);
+    cl_int err;
+    unsigned int numComputeUnits = deviceComputeUnits( (step->device.id), &err );
+    if(err != CL_SUCCESS) { numComputeUnits = 1; }  // query failed: assume one CU
+
+    const size_t blockSize = (size_t)pgran->wgSize[0] * pgran->wgSize[1];
+    const size_t perGroup = blockSize * extra->vecLenA;
+    const size_t wgNeeded = (kargs->N == 0) ? 1 : ((kargs->N - 1) / perGroup) + 1;
+    const size_t wgLimit = (size_t)numComputeUnits * WORKGROUPS_PER_CU;
+    threads[0] = min( wgNeeded, wgLimit ) * blockSize;
+    threads[1] = 1;
+}
+
+/*
+ * Produces the scal kernel source. With buf == NULL only the (fixed)
+ * buffer size is reported; otherwise the scal_kernel template is expanded
+ * by kprintf using the data-type prefix, vector length and load mode
+ * taken from 'extra', and the result is written to buf.
+ * The return value is always 32 KiB.
+ * NOTE(review): buflen is never compared against the generated size.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARGS_USAGE_2(pgran, subdims);
+
+    if (NULL == buf)                // size query only
+    {
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra;
+    char typePrefix = Prefix[kextra->dtype];
+    unsigned int vecLenA = kextra->vecLenA;
+    // KEXTRA_NO_COPY_VEC_A in the flags sets both doVLOAD arguments of kprintf.
+    bool doVLOAD = ((kextra->flags & KEXTRA_NO_COPY_VEC_A) != 0);
+
+#ifdef DEBUG_SCAL
+    printf("SCAL GENERATOR called....\n");
+    printf("dataType : %c\n", typePrefix);
+    printf("Vector length used : %d\n\n", vecLenA);
+    if (doVLOAD) {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    } else {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+#endif
+
+    // Fixed-size stack copy of the template that is handed to spit().
+    char tempTemplate[32*1024];
+    strcpy( tempTemplate, (char*)scal_kernel );
+    kprintf kobj( typePrefix, vecLenA, doVLOAD, doVLOAD);
+    kobj.spit((char*)buf, tempTemplate);
+
+    // A fixed size is reported, not the length actually generated.
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+__kernel void %PREFIXSCAL_kernel( __global %TYPE *_alpha, __global %TYPE *_X,
+                                        uint N, uint offx, int incx )
+
+*/
+// Binds args[0..4]: alpha (as a scalar of type dtype), A, N, offBX, incx.
+// NOTE(review): the prototype quoted above shows alpha as a __global
+// pointer, while a scalar is assigned here - confirm against scal.clT.
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    cl_int xStride = kargs->ldb.vector;
+
+    assignScalarKarg(&args[0], &(kargs->alpha), kargs->dtype);
+    INIT_KARG(&args[1], kargs->A);
+    initSizeKarg(&args[2], kargs->N);
+    initSizeKarg(&args[3], kargs->offBX);
+    INIT_KARG(&args[4], xStride);
+}
diff --git a/src/library/blas/gens/swap_reg.cpp b/src/library/blas/gens/swap_reg.cpp
new file mode 100644
index 0000000..5b44ceb
--- /dev/null
+++ b/src/library/blas/gens/swap_reg.cpp
@@ -0,0 +1,275 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * swap generator
+ */
+//#define DEBUG_SWAP
+
+#define WORKGROUPS_PER_CU  32
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <swap.clT>
+#include <solution_seq.h>
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+/* Reports the solver flags of the swap pattern: SF_WSPACE_1D only. */
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_SWAP
+    printf("solverFlags called......\n");
+#endif
+    return SF_WSPACE_1D;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initSwapRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+// Solver operation table of the swap register pattern. Entries are
+// positional; a NULL entry means this pattern does not supply that operation.
+// NOTE(review): the meaning of the uncommented NULL slots is defined by the
+// SolverOps declaration, which is not visible here - confirm before editing.
+static SolverOps swapOps = {
+    generator,      // kernel source generator
+    assignKargs,    // kernel argument setup
+    NULL,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,  // global work size computation
+    NULL,
+    solverFlags,
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,   // appends kernel build options
+	selectVectorization
+};
+
+/*
+ * Chooses the extra kernel flags for vector length vlen.
+ *
+ * KEXTRA_NO_COPY_VEC_A is returned when offBX or offCY of the kernel
+ * arguments is not a multiple of vlen; otherwise KEXTRA_NO_FLAGS.
+ */
+static  KernelExtraFlags
+selectVectorization(
+	void *args,
+	unsigned int vlen )
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+
+    if (((kargs->offBX) % vlen) != 0) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    if (((kargs->offCY) % vlen) != 0) {
+        return KEXTRA_NO_COPY_VEC_A;
+    }
+    return KEXTRA_NO_FLAGS;
+}
+
+/*
+ * Appends the build options needed by the swap kernel to buildOptStr.
+ *
+ * args points to the SolutionStep whose kernel arguments are inspected:
+ *   -DDOUBLE_PRECISION  for TYPE_DOUBLE and TYPE_COMPLEX_DOUBLE
+ *   -DINCX_NONUNITY     when ldb.vector differs from 1
+ *   -DINCY_NONUNITY     when ldc.vector differs from 1
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *step = (const SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+    const bool isDouble = (kargs->dtype == TYPE_DOUBLE) ||
+                          (kargs->dtype == TYPE_COMPLEX_DOUBLE);
+
+    if (isDouble) {
+        strcat(buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_SWAP
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if (kargs->ldb.vector != 1) {
+        strcat(buildOptStr, " -DINCX_NONUNITY ");
+    }
+
+    if (kargs->ldc.vector != 1) {
+        strcat(buildOptStr, " -DINCY_NONUNITY ");
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Fills in the memory pattern descriptor of the register based swap solver:
+ * name, level layout, solver operation table and the pattern specific extra
+ * data. The data type -> prefix letter table used by the generator is
+ * initialized here as well.
+ */
+extern "C"
+void initSwapRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_SWAP
+    printf("initREgPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+
+    // prefix letters indexed by data type
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    // pattern specific extra information
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L2;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+
+    // pattern descriptor
+    mempat->name = "Register accumulation based swap";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &swapOps;
+    mempat->extra = &mpatExtra;
+}
+
+/*
+ * Computes the 1D global work size for the swap kernel.
+ *
+ * One work group handles wgSize[0] * wgSize[1] * vecLenA elements; the
+ * number of groups needed for N elements is capped at
+ * WORKGROUPS_PER_CU groups per compute unit of the target device. If the
+ * compute unit count cannot be queried, a single unit is assumed.
+ * threads[0] receives groups * work-group size, threads[1] is set to 1.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    DUMMY_ARG_USAGE(subdims);
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra *)_extra;
+    CLBlasKargs *kargs = (CLBlasKargs *)args;
+    // the kernel arguments are embedded in their SolutionStep
+    SolutionStep *step = container_of(kargs, args, SolutionStep);
+
+    cl_int status;
+    unsigned int computeUnits = deviceComputeUnits(step->device.id, &status);
+    if (status != CL_SUCCESS) {
+        computeUnits = 1;
+    }
+
+    const unsigned int itemsPerGroup = pgran->wgSize[0] * pgran->wgSize[1];
+    const unsigned int elemsPerGroup = itemsPerGroup * kextra->vecLenA;
+    const unsigned int maxGroups = computeUnits * WORKGROUPS_PER_CU;
+
+    unsigned int groups = ((kargs->N - 1) / elemsPerGroup) + 1;
+    if (!(groups < maxGroups)) {
+        groups = maxGroups;
+    }
+
+    threads[0] = groups * itemsPerGroup;
+    threads[1] = 1;
+}
+
+/*
+ * Kernel source generator of the swap register pattern.
+ *
+ * A NULL buf is a size query and yields 32 * 1024. With a real buffer the
+ * swap_kernel template is expanded through a kprintf object (configured with
+ * the data type prefix, vecLenA and the VLOAD choice taken from `extra`)
+ * into buf, and 32 * 1024 is returned. buflen is not consulted.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    DUMMY_ARGS_USAGE_2(pgran, subdims);
+    char templateCopy[32*1024];
+
+    if (buf == NULL) {
+        // caller only asks for the buffer size
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *kextra = (CLBLASKernExtra *)extra;
+    const unsigned int vecLenA = kextra->vecLenA;
+    // VLOAD is requested whenever KEXTRA_NO_COPY_VEC_A is set
+    const bool useVload = (kextra->flags & KEXTRA_NO_COPY_VEC_A) ? true : false;
+
+    #ifdef DEBUG_SWAP
+    printf("SWAP GENERATOR called....\n");
+    printf("dataType : %c\n", Prefix[kextra->dtype]);
+    printf("Vector length used : %d\n\n", vecLenA);
+    if (useVload) {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+    #endif
+
+    strcpy(templateCopy, (char*)swap_kernel);
+    kprintf kobj(Prefix[kextra->dtype], vecLenA, useVload, useVload);
+    kobj.spit((char*)buf, templateCopy);
+
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+ * Sets the kernel arguments in the order of the kernel prototype:
+ *
+ * __kernel void %PREFIXswap_kernel( __global %TYPE *_X, __global %TYPE *_Y,
+ *                                        uint N, uint offx, int incx, uint offy, int incy )
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    // both increments are handed to the kernel as cl_int
+    cl_int incx = kargs->ldb.vector;
+    cl_int incy = kargs->ldc.vector;
+
+    INIT_KARG(&args[0], kargs->A);           // X
+    INIT_KARG(&args[1], kargs->B);           // Y
+    initSizeKarg(&args[2], kargs->N);        // N
+    initSizeKarg(&args[3], kargs->offBX);    // offx
+    INIT_KARG(&args[4], incx);               // incx
+    initSizeKarg(&args[5], kargs->offCY);    // offy
+    INIT_KARG(&args[6], incy);               // incy
+}
diff --git a/src/library/blas/gens/symm_cached.cpp b/src/library/blas/gens/symm_cached.cpp
new file mode 100644
index 0000000..cc8c035
--- /dev/null
+++ b/src/library/blas/gens/symm_cached.cpp
@@ -0,0 +1,279 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Cached global buffers based symm generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <kprintf.hpp>
+#include <symm.clT>
+#include <solution_seq.h>
+
+//#define DEBUG_SYMM
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+static char Prefix[4];
+
+static CLBLASMpatExtra mpatExtra;
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void*);
+
+/*
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+	*/
+
+static SolverFlags
+solverFlags(void);
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+	const void *kArgs);
+
+
+// Solver operation table of the cached global memory SYMM pattern. Entries
+// are positional; NULL means the operation is not supplied by this pattern.
+// The calcNrThreads slot is left NULL on purpose: the matching prototype
+// earlier in this file is commented out.
+static SolverOps symmSops = {
+    generator,          // kernel source generator
+    assignKargs,        // kernel argument setup
+    NULL, 				// isFitLDS?
+    NULL,				// prepareTranslateDims?
+    NULL,				// DecomAxis
+    NULL, 				// calcNrThreads (not provided)
+    NULL,				// ImagePackMode
+    solverFlags, 		// SolverFlags
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts, 		// Set Build Options
+	NULL
+};
+
+/*
+ * Appends the build options needed by the SYMM kernel to buildOptStr.
+ *
+ * args points to the SolutionStep whose kernel arguments are inspected:
+ *   -DDOUBLE_PRECISION                       for double based data types
+ *   -D__SYMM_LEFT__  / -D__SYMM_RIGHT__      from kargs->side
+ *   -D__SYMM_UPPER__ / -D__SYMM_LOWER__      from kargs->uplo
+ *   -D__SYMM_COLMAJOR__ / -D__SYMM_ROWMAJOR__ from kargs->order
+ *   -cl-mad-enable                           always
+ *
+ * buildOptStr must already hold a NUL terminated string with enough room
+ * for the appended options; no bound is checked here.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+	const SolutionStep *step = (const SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+
+    if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
+    {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        // guarded by this file's own debug switch (was DEBUG_TRMV)
+        #ifdef DEBUG_SYMM
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+        #endif
+    }
+
+	if (kargs->side == clblasLeft)
+	{
+		strcat(buildOptStr, " -D__SYMM_LEFT__ ");
+	} else {
+		strcat(buildOptStr, " -D__SYMM_RIGHT__ ");
+	}
+
+	if (kargs->uplo == clblasUpper)
+	{
+		strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+	} else {
+		strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+	}
+
+	if (kargs->order == clblasColumnMajor)
+	{
+		strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+	} else {
+		strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+	}
+
+	strcat(buildOptStr, " -cl-mad-enable ");
+	#ifdef DEBUG_SYMM
+	printf("setBuildOptions: Setting to %s\n", buildOptStr);
+	#endif
+    return;
+}
+
+/* Reports the solver flags of the SYMM pattern: SF_WSPACE_1D only. */
+static SolverFlags
+solverFlags(void)
+{
+    return SF_WSPACE_1D;
+}
+
+/*
+ * Kernel source generator of the cached global memory SYMM pattern.
+ *
+ * buf     - destination of the generated source; NULL only queries the
+ *           required buffer size
+ * buflen  - not consulted; buf is treated as a 32 * 1024 byte buffer
+ * subdims - subproblem dimensions, x and y of the first entry are used
+ * pgran   - work-group granularity, wgSize[0] is taken as the block size
+ * extra   - CLBLASKernExtra providing flags, dtype and vecLenA
+ *
+ * Returns 32 * 1024 for a size query and after a successful generation, and
+ * 0 when KEXTRA_COLUMN_MAJOR is not set (row-major is implemented in terms
+ * of the column-major routines, so nothing is generated for it).
+ *
+ * The template placeholders %WIDTH, %ITEMX, %ITEMY and %ITEMY_BY_V are
+ * derived from the block size and the subproblem dimensions; a warning is
+ * printed when these do not divide evenly.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+	char tempTemplate[32*1024];
+	// large enough for any 64-bit decimal value plus the terminator
+	char itemx[24], itemy[24], width[24], itemy_by_width[24];
+	size_t Y, X, BLOCKSIZE, ITEMX, ITEMY;
+
+	if (buf == NULL)
+	{
+		buflen = 32*1024*sizeof(char);
+		return (ssize_t)buflen;
+	}
+
+	// extra is only dereferenced once a real buffer was supplied; a pure
+	// size query does not need it
+	CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+	KernelExtraFlags kflags = kextra->flags;
+	DataType dtype = kextra->dtype;
+
+	//
+	// Row-major is implemented in terms of column major routines
+	//
+	if ((kflags & KEXTRA_COLUMN_MAJOR) == 0)
+	{
+		return 0;
+	}
+	kprintf kobj(Prefix[dtype], kextra->vecLenA, true, true);
+
+	BLOCKSIZE = pgran->wgSize[0];
+	#ifdef DEBUG_SYMM
+	printf("SYMM- generator(): Blocksize passed = %lu, subdimy = %lu, subdimx = %lu, veclen = %lu \n",
+			(unsigned long)BLOCKSIZE, (unsigned long)subdims->y,
+			(unsigned long)subdims->x, (unsigned long)kextra->vecLenA);
+	#endif
+
+	// Halve the width, starting at 16, until width * vecLenA fits into
+	// subdims->y. The width is never allowed to drop to 0, which would
+	// make the divisions below fault.
+	Y = 16;
+	while ((Y > 1) && (Y*(kextra->vecLenA) > subdims->y))
+	{
+		Y /= 2;
+	}
+
+	X = BLOCKSIZE/Y;
+	ITEMY = (subdims->y) / Y;
+	// X is 0 when the block size is smaller than the width
+	ITEMX = (X != 0) ? (subdims->x) / X : 0;
+	if (ITEMX == 0)
+	{
+		ITEMX = 1;
+	}
+
+	if ((X == 0) || (BLOCKSIZE % Y) || ((subdims->y) % Y) || ((subdims->x)%X) || (ITEMY % kextra->vecLenA))
+	{
+		printf("WARNING: SYMM- generator: subdim and blocksize in-compatible.\n");
+	}
+
+	// size_t values are printed through an explicit unsigned long cast so
+	// the conversion specifier always matches the argument
+	sprintf(width, "%lu", (unsigned long)Y);
+	sprintf(itemy, "%lu", (unsigned long)ITEMY);
+	sprintf(itemx, "%lu", (unsigned long)ITEMX);
+	sprintf(itemy_by_width, "%lu", (unsigned long)(ITEMY/kextra->vecLenA));
+
+	kobj.put("%WIDTH", width);
+	kobj.put("%ITEMX", itemx);
+	kobj.put("%ITEMY", itemy);
+	kobj.put("%ITEMY_BY_V", itemy_by_width);
+	#ifdef DEBUG_SYMM
+	printf("ColMajor SYMM - WIDTH = %s, ITEMX = %s, ITEMY = %s\n", width, itemx, itemy);
+	#endif
+
+	strcpy(tempTemplate, SYMM_C_KERNEL);
+	kobj.spit(buf, tempTemplate);
+	#ifdef DEBUG_SYMM
+	printf("Kernel = \n%s\n", buf);
+	#endif
+
+	// zero the remainder of the 32 * 1024 byte buffer behind the source
+	size_t tail = strlen(buf) + 1;
+	if (tail < 32*1024)
+	{
+		memset(buf + tail, 0, 32*1024 - tail);
+	}
+	return 32*1024*sizeof(char);
+}
+
+/*
+ * Kernel prototype as quoted by the original author:
+ *
+ * __kernel void symm_C_kernel( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *C,
+ *							uint M, uint N, uint _lda, uint _ldb, int ldc, %TYPE alpha, %TYPE beta)
+ *
+ * NOTE(review): the prototype above lists 10 parameters while 13 arguments
+ * (including three offsets) are set below - confirm against the kernel
+ * template.
+ */
+
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+
+	#ifdef DEBUG_SYMM
+	printf("SAlpha=%f, DAlpha=%f, CAlpha =<%f, %f>, DAlpha=<%f, %f>\n",
+			kargs->alpha.argFloat, kargs->alpha.argDouble, CREAL(kargs->alpha.argFloatComplex), CIMAG(kargs->alpha.argFloatComplex),
+			CREAL(kargs->alpha.argDoubleComplex) , CIMAG(kargs->alpha.argDoubleComplex));
+	printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n",
+			kargs->beta.argFloat, kargs->beta.argDouble, CREAL(kargs->beta.argFloatComplex), CIMAG(kargs->beta.argFloatComplex),
+			CREAL(kargs->beta.argDoubleComplex) , CIMAG(kargs->beta.argDoubleComplex));
+	#endif
+
+    // matrices
+    INIT_KARG(&args[0], kargs->A);
+    INIT_KARG(&args[1], kargs->B);
+    INIT_KARG(&args[2], kargs->C);
+
+    // problem sizes
+    initSizeKarg(&args[3], kargs->M);
+    initSizeKarg(&args[4], kargs->N);
+
+    // leading dimensions
+    initSizeKarg(&args[5], kargs->lda.matrix);
+    initSizeKarg(&args[6], kargs->ldb.matrix);
+    initSizeKarg(&args[7], kargs->ldc.matrix);
+
+    // offsets (PENDING from the original author: offA or offa ??)
+    initSizeKarg(&args[8], kargs->offa);
+    initSizeKarg(&args[9], kargs->offBX);
+    initSizeKarg(&args[10], kargs->offCY);
+
+    // scalars
+    assignScalarKarg(&args[11], &(kargs->alpha), kargs->dtype);
+    assignScalarKarg(&args[12], &(kargs->beta), kargs->dtype);
+}
+
+/*
+ * Fills in the memory pattern descriptor of the cached global memory SYMM
+ * solver: name, level layout, solver operation table and pattern specific
+ * extra data. Also initializes the data type -> prefix letter table used by
+ * the generator.
+ */
+extern "C"
+void
+initSymmDefaultPattern(MemoryPattern *mempat)
+{
+    // prefix letters indexed by data type
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    // pattern specific extra information
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+
+    // pattern descriptor
+    mempat->name = "Cached global memory based block Symm";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &symmSops;
+    mempat->extra = &mpatExtra;
+}
+
diff --git a/src/library/blas/gens/symv.c b/src/library/blas/gens/symv.c
new file mode 100644
index 0000000..4944892
--- /dev/null
+++ b/src/library/blas/gens/symv.c
@@ -0,0 +1,1141 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * symv generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+
+#include "blas_kgen.h"
+#include "xxmv_common.h"
+
+/*
+ * printf-style template of the symv kernel declaration. The conversions are
+ * filled by declareSymvKernel() in this order: the two work group sizes, the
+ * BLAS prefix character, the element type name for alpha, A and X, the
+ * optional beta declaration, the element type name for Y, the optional
+ * offset declarations, and the optional incx and incy declarations.
+ */
+static const char *symvDecl =
+    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
+    "void __kernel\n"
+    "%csymv(\n"
+    "    uint N,\n"
+    "    const %s alpha,\n"
+    "    const __global %s *restrict A,\n"
+    "    const __global %s *restrict X,\n"
+    "%s"    // optional beta declaration
+    "    __global %s *Y,\n"
+    "    uint lda,\n"
+    "%s"    // offset A, X and Y
+    "%s"    // optional incx declaration
+    "%s"    // optional incy declaration
+    "    const uint startN,\n"
+    "    uint actualN)\n";
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Private data handed to the genPostFetch* callbacks of this file through
+ * their `priv` argument.
+ */
+struct symvPrivate {
+    TilePostFetchPrivate *pfPriv;   // post-fetch state; gset is read and fetchNumA incremented through it
+    TileMulOpts *mulOpts;           // tile multiplication options whose flags the callbacks consult
+    Tile tilea;                     // tile written by genPostFetchMirror and then copied over gset->tileA
+    bool diag;                      // selects ">=" instead of ">" in the diagonal callbacks
+    bool coord;                     // genPostFetchDiag: compare against coordA (true) or k (false)
+};
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static SolverFlags
+solverFlags(void);
+
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static int
+symvSubgGetDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void * pArgs);
+
+static bool
+subgCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check);
+
+/*
+ * Solver operation table of the symv pattern. Entries are positional; NULL
+ * means the operation is not supplied by this pattern.
+ * NOTE(review): the meaning of the NULL slots is defined by the SolverOps
+ * declaration, which is not visible here - confirm before editing.
+ */
+static SolverOps symvSops = {
+    generator,                // kernel source generator
+    assignKargs,              // kernel argument setup
+    isFitToLDS,
+    NULL,
+    NULL,
+    calcNrThreads,            // global work size computation
+    NULL,
+    solverFlags,
+    fixupArgs,
+    symvSubgGetDefaultDecomp, //getDefaultDecomposition
+    subgCheckCalcDecomp, // get Decomp. List
+    NULL,
+    NULL
+};
+
+/*
+ * Formats the symv kernel declaration from the symvDecl template and hands
+ * it to kgenDeclareFunction().
+ *
+ * ctx    - generator context receiving the declaration
+ * dtype  - data type selecting the element type name and the BLAS prefix
+ * pgran  - work-group granularity; wgSize[0] and wgSize[1] go into the
+ *          reqd_work_group_size attribute
+ * kflags - extra flags deciding which optional parameters are declared:
+ *          beta unless KEXTRA_BETA_ZERO, incx/incy unless KEXTRA_INCX_ONE /
+ *          KEXTRA_INCY_ONE, and offA/offX/offY for the matching
+ *          *_OFF_NOT_ZERO flags
+ */
+static void
+declareSymvKernel(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const PGranularity *pgran,
+    KernelExtraFlags kflags)
+{
+    bool incxOne = ((kflags & KEXTRA_INCX_ONE) != 0);
+    bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0);
+    bool beta0 = ((kflags & KEXTRA_BETA_ZERO) != 0);
+    const char *incxDecl = incxOne ? "" : "    const int incx,\n";
+    const char *incyDecl = incyOne ? "" : "    const int incy,\n";
+    char betaDecl[128];
+    char offDecl[128];
+    char tmp[512];
+    char fpref;
+    const char *typeName;
+
+    typeName = dtypeBuiltinType(dtype);
+    fpref = dtypeToBlasPrefix(dtype);
+    if (beta0) {
+        betaDecl[0] = '\0';
+    }
+    else {
+        sprintf(betaDecl, "    const %s beta,\n", typeName);
+    }
+
+    /* offsets are declared only for the operands that actually have one */
+    offDecl[0] = '\0';
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        strcat(offDecl, "    const uint offA,\n");
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        strcat(offDecl, "    const uint offX,\n");
+    }
+    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
+        strcat(offDecl, "    const uint offY,\n");
+    }
+
+    /*
+     * The template prints the work group sizes with %lu; the explicit casts
+     * keep the arguments in line with that conversion whatever integer type
+     * wgSize is declared with.
+     */
+    sprintf(tmp, symvDecl, (unsigned long)pgran->wgSize[0],
+            (unsigned long)pgran->wgSize[1], fpref, typeName,
+            typeName, typeName, betaDecl, typeName, offDecl, incxDecl,
+            incyDecl);
+    kgenDeclareFunction(ctx, tmp);
+}
+
+/*
+ * Writes the textual addend " + <val>" to buf, or an empty string when val
+ * is 0, so that generated code avoids " + 0" statements.
+ *
+ * buf must have room for the formatted text. val is printed through an
+ * explicit unsigned long cast so the argument always matches the %lu
+ * conversion, also on platforms where size_t is not unsigned long.
+ */
+static void
+genAdd(char *buf, size_t val)
+{
+    if (val == 0) {
+        buf[0] = '\0'; //zero length string
+    }
+    else {
+        sprintf(buf, " + %lu", (unsigned long)val);
+    }
+}
+
+/*
+ * Post-fetch callback that emits, for every element of the block, a
+ * statement of the form
+ *
+ *     <tilea elem> = k[ + x] > n[ + y] ? <tileb elem> : <tileb mirrored elem>;
+ *
+ * where tileb is the tile stored in gset->tileA and tilea is the tile kept
+ * in the private data. Afterwards gset->tileA is overwritten with tilea.
+ * priv must point to a struct symvPrivate; mrole is unused. Always returns 0.
+ */
+static int
+genPostFetchMirror(
+    struct KgenContext *ctx,
+    MatrixRole mrole,
+    void *priv)
+{
+    TilePostFetchPrivate *pfPriv = ((struct symvPrivate *)priv)->pfPriv;
+    TileMulOpts *mulOpts = ((struct symvPrivate *)priv)->mulOpts;
+    Tile *tileb = (Tile *)&pfPriv->gset->tileA;
+    Tile *tilea = &((struct symvPrivate *)priv)->tilea;
+    bool tra = ((mulOpts->flags & TILEMUL_TRA) != 0);
+    char tmp[1024];
+    char stmtStr[2][128];
+    size_t blockx, blocky;
+    unsigned int x, y;
+    const struct SubproblemDim *dims = &pfPriv->gset->subdims[1];
+    (void)mrole;
+
+
+    blockx = blocky = 0;
+    // block extents: y and bwidth of subdims[1] swap roles when
+    // TILEMUL_TRA is set
+
+    if (tra) {
+        blocky = dims->bwidth;
+        blockx = dims->y;
+    }
+    else {
+        blocky = dims->y;
+        blockx = dims->bwidth;
+    }
+
+    // loop through block rows
+    for(y = 0; y < blocky; y++) {
+        // loop through all elements of block row
+        for(x = 0; x < blockx; x++) {
+            Kstring kstr[3];
+            const char *cmp = ">";
+            // kstr[0]/kstr[1]: source element and its mirrored counterpart,
+            // kstr[2]: destination element in tilea
+            sprintfTileElement(&kstr[0], tileb, x, y, 1);
+            sprintfTileElement(&kstr[1], tileb, y, x, 1);
+            sprintfTileElement(&kstr[2], tilea, y, x, 1);
+            // textual " + x" / " + y" offsets, empty for 0
+            genAdd(stmtStr[0], x);
+            genAdd(stmtStr[1], y);
+            sprintf(tmp, "%s = k%s %s n%s ? %s : %s;\n",
+                    kstr[2].buf, stmtStr[0], cmp, stmtStr[1],
+                    kstr[0].buf, kstr[1].buf);
+            kgenAddStmt(ctx, tmp);
+        }
+        // one more fetch accounted for per block row
+        pfPriv->fetchNumA++;
+    }
+
+    // from here on the generator settings refer to the mirrored tile
+    *tileb = *tilea;
+
+    return 0;
+}
+
+/*
+ * Post-fetch callback that emits, for every element of the block, a
+ * statement of the form
+ *
+ *     <elem> = Ktail <= y || <coord>[ + x] <cmp> n[ + y] ? 0 : <elem>;
+ *
+ * i.e. the element of gset->tileA is replaced by 0 when the condition holds.
+ * <cmp> is ">=" when priv->diag is set and ">" otherwise; <coord> is the
+ * coordA variable name when priv->coord is set and the k variable name
+ * otherwise. priv must point to a struct symvPrivate; mrole is unused.
+ * Always returns 0.
+ */
+static int
+genPostFetchDiag(
+    struct KgenContext *ctx,
+    MatrixRole mrole,
+    void *priv)
+{
+    TilePostFetchPrivate *pfPriv = ((struct symvPrivate *)priv)->pfPriv;
+    Tile *tile = (Tile *)&pfPriv->gset->tileA;
+    bool diag = ((struct symvPrivate *)priv)->diag;
+    bool tra = ((struct symvPrivate *)priv)->coord;
+    char tmp[1024];
+    char stmtStr[2][128];
+    const KernelVarNames *vnames = &pfPriv->gset->varNames;
+    const char *coord = tra ? vnames->coordA : vnames->k;
+    size_t blockx, blocky;
+    unsigned int x, y;
+    const struct SubproblemDim *dims = &pfPriv->gset->subdims[1];
+    (void)mrole;
+
+
+    blockx = blocky = 0;
+    // zero triangular part of tile a
+    // block extents: y and bwidth of subdims[1] swap roles when the
+    // coord flag is set
+
+    if (tra) {
+        blocky = dims->bwidth;
+        blockx = dims->y;
+    }
+    else {
+        blocky = dims->y;
+        blockx = dims->bwidth;
+    }
+
+    // loop through block rows
+    for(y = 0; y < blocky; y++) {
+        // loop through all elements of block row
+        for(x = 0; x < blockx; x++) {
+            Kstring kstr[3];
+            const char *cmp = diag ? ">=" : ">";
+            // the element indices are swapped depending on diag
+            if (diag) {
+                sprintfTileElement(&kstr[0], tile, x, y, 1);
+            }
+            else {
+                sprintfTileElement(&kstr[0], tile, y, x, 1);
+            }
+            // textual " + x" / " + y" offsets, empty for 0
+            genAdd(stmtStr[0], x);
+            genAdd(stmtStr[1], y);
+            sprintf(tmp, "%s = Ktail <= %i || %s%s %s n%s ? 0 : %s;\n",
+                    kstr[0].buf, y, coord, stmtStr[0], cmp, stmtStr[1],
+                    kstr[0].buf);
+            kgenAddStmt(ctx, tmp);
+        }
+        // one more fetch accounted for per block row
+        pfPriv->fetchNumA++;
+    }
+    return 0;
+}
+
+/*
+ * Post-fetch callback that emits, for every element of the block, a
+ * statement of the form
+ *
+ *     <elem> = [Ktail <= y || ]<name>[ + x] <cmp> n[ + y] ? 0 : <elem>;
+ *
+ * The "Ktail <= y || " part is only emitted when TILEMUL_SKEW_B is set in
+ * the multiplication options. With priv->diag set, <name> is "k" and <cmp>
+ * is ">="; otherwise <name> is "coordA" and <cmp> is ">". priv must point
+ * to a struct symvPrivate; mrole is unused. Always returns 0.
+ */
+static int
+genPostFetchVertDiag(
+    struct KgenContext *ctx,
+    MatrixRole mrole,
+    void *priv)
+{
+    TilePostFetchPrivate *pfPriv = ((struct symvPrivate *)priv)->pfPriv;
+    TileMulOpts *mulOpts = ((struct symvPrivate *)priv)->mulOpts;
+    Tile *tile = (Tile *)&pfPriv->gset->tileA;
+    bool diag = ((struct symvPrivate *)priv)->diag;
+    char tmp[1024], tmp1[128] = "";
+    char stmtStr[2][128];
+    size_t blockx, blocky;
+    unsigned int x, y;
+    const struct SubproblemDim *dims = &pfPriv->gset->subdims[1];
+    (void)mrole;
+
+    blockx = blocky = 0;
+    // zero triangular part of tile a
+    // block extents: here y and bwidth of subdims[1] swap roles when diag
+    // is NOT set
+
+    if (!diag) {
+        blocky = dims->bwidth;
+        blockx = dims->y;
+    }
+    else {
+        blocky = dims->y;
+        blockx = dims->bwidth;
+    }
+
+    // loop through block rows
+    for(y = 0; y < blocky; y++) {
+        // loop through all elements of block row
+        for(x = 0; x < blockx; x++) {
+            Kstring kstr[3];
+            const char *cmp = diag ? ">=" : ">";
+            const char *name = diag ? "k" : "coordA";
+            // the element indices are swapped depending on diag
+            if (diag) {
+                sprintfTileElement(&kstr[0], tile, y, x, 1);
+            }
+            else {
+                sprintfTileElement(&kstr[0], tile, x, y, 1);
+            }
+            // textual " + x" / " + y" offsets, empty for 0
+            genAdd(stmtStr[0], x);
+            genAdd(stmtStr[1], y);
+            // tmp1 stays empty unless the skew flag is set
+            if (mulOpts->flags & TILEMUL_SKEW_B) {
+                sprintf(tmp1, "Ktail <= %i || ", y);
+            }
+            sprintf(tmp, "%s = %s%s%s %s n%s ? 0 : %s;\n",
+                    kstr[0].buf, tmp1, name, stmtStr[0], cmp, stmtStr[1],
+                    kstr[0].buf);
+            kgenAddStmt(ctx, tmp);
+        }
+        // one more fetch accounted for per block row
+        pfPriv->fetchNumA++;
+    }
+    return 0;
+}
+
+// global memory based kernel generator
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^
+                  ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
+    char tmp[2048];
+    const char *typeName;
+    DataType dtype = kextra->dtype;
+    BlasGenSettings gset, tgset, lset, gset1;
+    CLBLASKernExtra kextraTmp;
+    TileMulOpts mulOpts, tmulOpts;
+    KernelVarNames *vnames = &gset.varNames;
+    ssize_t ret;
+    size_t vecLen = kextra->vecLen;
+    const char *outTypeName;
+    bool b;
+    TilePostFetchPrivate pfPriv;
+    struct symvPrivate priv;
+    size_t wgSize;
+    bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
+    bool tailK = (kflags & KEXTRA_TAILS_K) != 0;
+    bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0;
+    bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A);
+    bool isComplex = isComplexType(dtype);
+    Tile tileb;
+    const char *gid = "get_group_id(0)";
+    const char *lid = "get_local_id(0)";
+    bool isHoriz = subdims[1].bwidth >= subdims[1].y;
+    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth;
+    unsigned int cLocal;
+    unsigned int nPlans;
+
+    wgSize = (subdims[0].y / subdims[1].y) *
+            (subdims[0].bwidth / subdims[1].bwidth);
+    assert(pgran->wgSize[0] == wgSize);
+    assert(subdims[0].x == 1);
+    assert(subdims[1].x == 1);
+
+    memset(&gset, 0, sizeof(gset));
+    memset(&mulOpts, 0, sizeof(mulOpts));
+    memset(&pfPriv, 0, sizeof(pfPriv));
+    memset(&priv, 0, sizeof(priv));
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    // at first, generate needed declarations
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+
+    typeName = dtypeBuiltinType(dtype);
+
+    declareSymvKernel(ctx, dtype, pgran, kflags);
+
+    ret = kgenBeginFuncBody(ctx);
+    /* 1D work space. Matrix is divided among wi, each calculates it's own
+     * part of vector y */
+
+    kgenAddStmt(ctx, "#define M actualN\n");
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.subdims[0].itemX = gset.subdims[0].x = 1;
+    gset.subdims[1].itemX = gset.subdims[1].x = 1;
+    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
+    gset.flags |= BGF_WHOLE_A | BGF_UPTRS;
+
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+
+    initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET);
+    gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows);
+
+    if (isComplex) {
+         gset.tileCY.vecLen = 1;
+    }
+    declareTileStorages(ctx, &gset);
+    genZeroTile(ctx, &gset.tileCY);
+    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
+    cLocal = wgSize / bStep;
+    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;
+
+    sprintf(tmp, "__local %s localRes[%u][%u];\n",
+                outTypeName, pgran->wgSize[0], nPlans);
+    kgenAddStmt(ctx, tmp);
+    sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n",
+                 gid, cLocal, lid, bStep, subdims[1].y);
+    kgenAddStmt(ctx, tmp);
+    sprintf(tmp, "uint n = coordA;\n");
+    kgenAddStmt(ctx, tmp);
+    sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n",
+                 lid,  bStep, subdims[1].bwidth);
+    kgenAddStmt(ctx, tmp);
+    kgenAddStmt(ctx, "actualN += startN;\n");
+
+    kgenAddBlankLine(ctx);
+
+    kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)");
+
+    genIncPointers(ctx, kflags);
+    sprintf(tmp,
+            "const GPtr Ag = {(__global %s*)A};\n"
+            "const GPtr Xg = {(__global %s*)X};\n",
+            typeName, typeName);
+    kgenAddStmt(ctx, tmp);
+
+    kgenAddBlankLine(ctx);
+
+    kgenAddStmt(ctx, "uint k = k0;\n");
+
+    if (tailK) {
+        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
+        kgenAddStmt(ctx, tmp);
+        sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y);
+        kgenAddStmt(ctx, tmp);
+        kgenBeginBranch(ctx, "if (n + Ktail < N)");
+        kgenAddStmt(ctx, "N -= Ntail;\n");
+        kgenAddBlankLine(ctx);
+    }
+
+    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
+    if (tailM) {
+        vnames->sizeM = "N";
+    }
+
+    vnames->A = "Ag";
+    vnames->B = "Xg";
+    vnames->coordA = "coordA";
+    vnames->coordB = ""; //should not be used for vector
+    vnames->k = "k";
+    vnames->lda = "lda";
+    vnames->sizeK = "N";
+    vnames->sizeM = "N";
+
+    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K;
+    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
+        mulOpts.flags |= TILEMUL_CONJA;
+    }
+    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
+        mulOpts.core = TILEMUL_MAD;
+    }
+    else {
+        mulOpts.core = TILEMUL_MULADD;
+    }
+    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
+    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
+
+    if (rowMaj) {
+        mulOpts.flags |= TILEMUL_BW_STRIDE;
+    }
+
+    if (upper) {
+        kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n");
+    }
+    else {
+        kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n");
+    }
+    sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)",
+        subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth);
+    kgenBeginBranch(ctx, tmp);
+
+    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
+            mulOpts.flags, kflags);
+
+    upper ^= rowMaj;
+    tra ^= rowMaj;
+    if (upper ^ rowMaj && tra) {
+        mulOpts.flags |= TILEMUL_TRA;
+    }
+    gset.tileA.trans ^= !upper;
+    tgset = gset;
+    tmulOpts = mulOpts;
+
+    ret = tileMulGen(ctx, &gset, &mulOpts);
+    if (ret != 0) {
+        return ret;
+    }
+    kgenEndBranch(ctx, NULL); /* k loop */
+
+    if (tailK)
+    {
+            kextraTmp = *kextra;
+            gset1 = gset;
+
+            kextraTmp.vecLen = 1;
+            gset1.kextra = &kextraTmp;
+
+            gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1;
+
+            gset1.tileBX.nrRows = 1;
+            gset1.tileA.nrCols = 1;
+            kextraTmp.vecLenA = 1;
+    }
+
+
+    if (isHoriz)
+    {
+        lset = gset;
+        lset.subdims[0].bwidth = lset.subdims[1].bwidth =
+            lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y);
+        lset.tileA.nrCols = lset.tileA.nrRows =
+            lset.tileBX.nrRows = lset.subdims[1].y;
+
+        kgenAddStmt(ctx, "// the diagonal\n");
+        kgenBeginBranch(ctx, "if (k <= n)");
+        kgenAddStmt(ctx, "uint k1 = k;\n");
+
+        if (subdims[1].bwidth != subdims[1].y) {
+            kgenAddStmt(ctx, "// the pred diagonal\n");
+            sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth);
+            kgenBeginBranch(ctx, tmp);
+
+            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
+                    mulOpts.flags, kflags);
+
+            ret = tileMulGen(ctx, &lset, &mulOpts);
+            if (ret != 0) {
+                return ret;
+            }
+            kgenEndBranch(ctx, NULL); /* k loop */
+        }
+
+        initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth,
+            lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET,
+            lset.tileA.trans, lset.tileA.packed);
+        declareOneTileStorage(ctx, &tileb);
+
+        genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
+                mulOpts.flags, kflags);
+
+        priv.mulOpts = &mulOpts;
+        priv.pfPriv = &pfPriv;
+        priv.tilea = lset.tileA;
+        priv.diag = false;
+
+        pfPriv.funcID = CLBLAS_SYMV;
+        pfPriv.gset = &lset;
+        lset.tileA = tileb;
+        mulOpts.postFetch = genPostFetchMirror;
+        mulOpts.postFetchPriv = &priv;
+
+        ret = tileMulGen(ctx, &lset, &mulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+
+        if (upper ^ rowMaj && tra) {
+            mulOpts.flags &= ~TILEMUL_TRA;
+        }
+        else {
+            mulOpts.flags |= TILEMUL_TRA;
+        }
+        gset.tileA.trans = lset.tileA.trans ^= true;
+        mulOpts.postFetch = NULL;
+        mulOpts.postFetchPriv = NULL;
+
+        if (subdims[1].bwidth != subdims[1].y) {
+            size_t width = umax(subdims[1].bwidth, subdims[1].y);
+            kgenAddStmt(ctx, "// the post diagonal\n");
+            if (tailK) {
+                kgenBeginBranch(ctx, "if(k < N)");
+            }
+            sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)",
+                    lset.subdims[1].bwidth,
+                    width, width, width,
+                    lset.subdims[1].bwidth);
+            kgenBeginBranch(ctx, tmp);
+
+            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
+                    mulOpts.flags, kflags);
+
+            ret = tileMulGen(ctx, &lset, &mulOpts);
+            if (ret != 0) {
+                return ret;
+            }
+            kgenEndBranch(ctx, NULL); /* k loop */
+
+            if (tailK) {
+                kgenEndBranch(ctx, NULL);
+                kgenBeginBranch(ctx, "else");
+                /* Handle tail along vector X */
+
+                kgenAddStmt(ctx, "N += Ntail;\n");
+
+                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
+#if 1
+                sprintf(tmp, "for (k += %lu; k < actualN; k++)",
+                    lset.subdims[1].bwidth);
+                kgenBeginBranch(ctx, tmp);
+
+                gset1.tileA.trans = gset.tileA.trans;
+
+                genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
+                          mulOpts.flags, kflags);
+                ret = tileMulGen(ctx, &gset1, &mulOpts);
+                if (ret != 0) {
+                    return ret;
+                }
+                kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
+#else
+                mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
+                genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
+                          mulOpts.flags, kflags);
+                ret = tileMulGen(ctx, &gset, &mulOpts);
+                if (ret != 0) {
+                    return ret;
+                }
+#endif
+
+                mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A;
+                kgenEndBranch(ctx, NULL);
+            }
+        }
+
+        sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth);
+        kgenAddStmt(ctx, tmp);
+        kgenEndBranch(ctx, NULL);
+    }
+    else
+    {
+
+        kgenAddStmt(ctx, "// the diagonal\n");
+        sprintf(tmp, "if (k <= (n  + (get_local_id(0)%%%lu)*%lu))",
+            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
+        kgenBeginBranch(ctx, tmp);
+
+        genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames,
+                    mulOpts.flags, kflags);
+
+        kgenBeginBranch(ctx, NULL);
+
+        priv.mulOpts = &mulOpts;
+        priv.pfPriv = &pfPriv;
+        priv.diag = true;
+
+        pfPriv.funcID = CLBLAS_SYMV;
+        pfPriv.gset = &gset;
+        mulOpts.postFetch = genPostFetchVertDiag;
+        mulOpts.postFetchPriv = &priv;
+
+        ret = tileMulGen(ctx, &gset, &mulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+        kgenEndBranch(ctx, NULL);
+
+        if (upper ^ rowMaj && tra) {
+            mulOpts.flags &= ~TILEMUL_TRA;
+        }
+        else {
+            mulOpts.flags |= TILEMUL_TRA;
+        }
+        gset.tileA.trans ^= true;
+        lset = gset;
+
+        sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
+            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
+        kgenAddStmt(ctx, tmp);
+        kgenBeginBranch(ctx, NULL);
+
+        priv.diag = false;
+        ret = tileMulGen(ctx, &gset, &mulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+        kgenEndBranch(ctx, NULL);
+
+        mulOpts.postFetch = NULL;
+        mulOpts.postFetchPriv = NULL;
+
+        sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth);
+        kgenAddStmt(ctx, tmp);
+        kgenEndBranch(ctx, NULL); /* if */
+    }
+
+    if (upper) {
+        kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n");
+    }
+    else {
+        kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n");
+    }
+    sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
+    kgenBeginBranch(ctx, tmp);
+
+    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
+            mulOpts.flags, kflags);
+
+    ret = tileMulGen(ctx, &gset, &mulOpts);
+    if (ret != 0) {
+        return ret;
+    }
+    kgenEndBranch(ctx, NULL); /* k loop */
+
+    if (tailK) {
+        /* Handle tail along vector X */
+        kgenAddStmt(ctx, "N += Ntail;\n");
+
+        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
+#if 1
+        sprintf(tmp, "for (; k < N; k++)");
+        kgenBeginBranch(ctx, tmp);
+
+        gset1.tileA.trans = gset.tileA.trans;
+
+        genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
+                  mulOpts.flags, kflags);
+        ret = tileMulGen(ctx, &gset1, &mulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+        kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
+#else
+        mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
+        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
+                  mulOpts.flags, kflags);
+        ret = tileMulGen(ctx, &gset, &mulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+#endif
+
+        kgenEndBranch(ctx, NULL);
+
+        kgenBeginBranch(ctx, "else");
+
+        sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
+        kgenBeginBranch(ctx, tmp);
+
+        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A;
+        genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames,
+                tmulOpts.flags, kflags);
+
+        priv.mulOpts = &tmulOpts;
+        priv.pfPriv = &pfPriv;
+        pfPriv.gset = &tgset;
+        priv.diag = false;
+
+        pfPriv.funcID = CLBLAS_SYMV;
+        tmulOpts.postFetch = genPostFetchDiag;
+        tmulOpts.postFetchPriv = &priv;
+
+        ret = tileMulGen(ctx, &tgset, &tmulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+
+        if (isHoriz) {
+            sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth);
+        }
+        else {
+            sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n",
+                subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
+        }
+        kgenAddStmt(ctx, tmp);
+
+        kgenEndBranch(ctx, NULL); /* k loop */
+
+        kgenBeginBranch(ctx, "if (k < N)");
+        if (isHoriz) {
+            kgenAddStmt(ctx, "k = n;\n");
+        }
+        else {
+            sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
+                subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
+            kgenAddStmt(ctx, tmp);
+        }
+
+        genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames,
+                tmulOpts.flags, kflags);
+
+        priv.mulOpts = &tmulOpts;
+        priv.pfPriv = &pfPriv;
+        priv.diag = true;
+
+        pfPriv.funcID = CLBLAS_SYMV;
+        pfPriv.gset = &lset;
+        tmulOpts.postFetch = genPostFetchDiag;
+        tmulOpts.postFetchPriv = &priv;
+
+        if (!isHoriz) {
+            if (upper ^ rowMaj && tra) {
+                tmulOpts.flags &= ~TILEMUL_TRA;
+            }
+            else {
+                tmulOpts.flags |= TILEMUL_TRA;
+            }
+            kgenAddStmt(ctx, "Ktail = N - n;\n");
+            priv.coord = true;
+        }
+        else {
+            priv.coord = false;
+        }
+        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K;
+
+
+        ret = tileMulGen(ctx, &lset, &tmulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+
+        kgenEndBranch(ctx, NULL);
+
+        kgenEndBranch(ctx, NULL);
+    }
+
+
+    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
+        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
+    }
+
+    kgenEndBranch(ctx,NULL);
+
+    genStoreLocalResult(ctx, &gset.tileCY, lid);
+
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenAddBlankLine(ctx);
+
+    sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep);
+    kgenBeginBranch(ctx, tmp);
+
+    genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1);
+
+    /* write back the results */
+    /* y := alpha*A*x + beta*y */
+    sprintf(tmp,"(%s - startN)", vnames->coordA);
+    setResultPos(ctx, kflags, tmp);
+
+    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);
+
+    kgenEndBranch(ctx, NULL);
+
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+ * Fill the kernel argument array for the SYMV kernel.
+ *
+ * The first four arguments (K, alpha, A, B) are always present. The
+ * remaining ones depend on the kernel extra flags, so they are placed
+ * through a running slot index.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)params;
+    const KernelExtraFlags flags = ((const CLBLASKernExtra*)extra)->flags;
+    cl_int stride;
+    int slot = 4;
+
+    initSizeKarg(&args[0], kargs->K);
+    assignScalarKarg(&args[1], &(kargs->alpha), kargs->dtype);
+    INIT_KARG(&args[2], kargs->A);
+    INIT_KARG(&args[3], kargs->B);
+
+    /* beta is passed only when KEXTRA_BETA_ZERO is not set */
+    if ((flags & KEXTRA_BETA_ZERO) == 0) {
+        assignScalarKarg(&args[slot], &(kargs->beta), kargs->dtype);
+        slot++;
+    }
+
+    initMemobjKarg(&args[slot], kargs->C, NULL, 0, 0);
+    slot++;
+    initSizeKarg(&args[slot], kargs->lda.matrix);
+    slot++;
+
+    /* optional buffer offsets */
+    if (flags & KEXTRA_A_OFF_NOT_ZERO) {
+        initSizeKarg(&args[slot], kargs->offA);
+        slot++;
+    }
+    if (flags & KEXTRA_BX_OFF_NOT_ZERO) {
+        initSizeKarg(&args[slot], kargs->offBX);
+        slot++;
+    }
+    if (flags & KEXTRA_CY_OFF_NOT_ZERO) {
+        initSizeKarg(&args[slot], kargs->offCY);
+        slot++;
+    }
+
+    /* optional vector increments, passed as cl_int */
+    if ((flags & KEXTRA_INCX_ONE) == 0) {
+        stride = kargs->ldb.vector;
+        INIT_KARG(&args[slot], stride);
+        slot++;
+    }
+    if ((flags & KEXTRA_INCY_ONE) == 0) {
+        stride = kargs->ldc.vector;
+        INIT_KARG(&args[slot], stride);
+        slot++;
+    }
+
+    initSizeKarg(&args[slot], kargs->offsetN);
+    slot++;
+    /* actual N */
+    initSizeKarg(&args[slot], kargs->N);
+}
+
+/*
+ * Shift the Y offset when the problem starts at a non-zero offsetN.
+ * Nothing is changed when offsetN is zero.
+ */
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)args;
+
+    (void)subdims;
+    (void)extra;
+
+    if (!kargs->offsetN) {
+        return;
+    }
+
+    if (kargs->ldc.vector >= 0) {
+        kargs->offCY += kargs->offsetN * kargs->ldc.vector;
+        return;
+    }
+
+    /* negative increment: K stores the original height of the matrix A */
+    kargs->offCY += (kargs->K - kargs->offsetN) *
+                    abs(kargs->ldc.vector);
+}
+
+/*
+ * Check whether the decomposition fits into ldsSize bytes of local memory.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    cl_ulong bytesNeeded;
+
+    (void)kernelArgs;
+
+    /*
+     * The kernel needs y1 * wgSize elements of local memory, but y1 is
+     * not calculated yet. The product below is a reliable overestimate:
+     * it is dims[1].bwidth times larger than the real requirement.
+     */
+    bytesNeeded = dim[0].y * dim[0].bwidth * dtypeSize(dtype);
+
+    return (bytesNeeded <= ldsSize);
+}
+
+/*
+ * Compute the 1D global work size for the SYMV kernel.
+ * threads[0] is rounded up to a multiple of the work group size;
+ * threads[1] is set to 0.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra)
+{
+    const CLBlasKargs *kargs = args;
+    /* ratio of the level 0 block width to the level 1 block width */
+    unsigned int widthRatio = subdims[0].bwidth / subdims[1].bwidth;
+    size_t nrItems;
+
+    (void)extra;
+
+    /* each work item handles y1 lines */
+    nrItems = divRoundUp(kargs->N, subdims[1].y) * widthRatio;
+
+    threads[0] = roundUp(nrItems, pgran->wgSize[0]);
+    threads[1] = 0;
+}
+
+/* Solver flags of this pattern: SF_WSPACE_1D only. */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags flags = SF_WSPACE_1D;
+
+    return flags;
+}
+
+/*
+ * Fill in the default decomposition: a 1D work group of 64 items,
+ * a 4x4 level 1 block and a level 0 block eight times larger in both
+ * y and bwidth. Always returns 0.
+ */
+static int
+symvSubgGetDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void * pArgs)
+{
+    SubproblemDim *outer = &subdims[0];
+    SubproblemDim *inner = &subdims[1];
+
+    (void)subdimsNum;
+    DUMMY_ARG_USAGE(pArgs);
+
+    pgran->wgDim = 1;
+    pgran->wgSize[0] = 64;
+    pgran->wgSize[1] = 1;
+
+    /* level 1 block */
+    inner->bwidth = 4;
+    inner->x = 1;
+    inner->itemX = inner->x;
+    inner->y = 4;
+    inner->itemY = inner->y;
+
+    /* level 0 block, derived from the level 1 block */
+    outer->bwidth = 8 * inner->bwidth;
+    outer->x = 1;
+    outer->itemX = outer->x;
+    outer->y = 8 * inner->y;
+    outer->itemY = outer->y;
+
+    return 0;
+}
+
+/*
+ * Validate a two-level decomposition for the subgroup SYMV pattern.
+ *
+ * With check == PGRAN_CHECK the passed granularity is verified as well;
+ * otherwise pgran is filled in with a 1D work group of 64 items.
+ * Returns true when the decomposition is acceptable.
+ */
+static bool
+subgCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check)
+{
+    /* element size expressed in cl_float units */
+    unsigned int floatsPerElem = dtypeSize(dtype)/sizeof(cl_float);
+    const SubproblemDim *top;
+    const SubproblemDim *low;
+    size_t lowLimit;
+
+    /* invalid arguments */
+    if (subdimsNum < 2 || pgran == NULL || subdims == NULL) {
+        return false;
+    }
+
+    top = &subdims[0];
+    low = &subdims[1];
+
+    /* no dimension may be zero */
+    if (top->x == 0 || top->y == 0 || top->bwidth == 0 ||
+        low->x == 0 || low->y == 0 || low->bwidth == 0) {
+        return false;
+    }
+
+    if (low->x != low->itemX || low->y != low->itemY) {
+        return false;
+    }
+
+    /* the group block must consist of integer number of subgroup blocks */
+    if ((top->x % low->x) != 0 ||
+        (top->y % low->y) != 0 ||
+        (top->bwidth % low->bwidth) != 0) {
+        return false;
+    }
+
+    /* check fitting of bw to common vector sizes */
+    if (isComplexType(dtype) && 2*low->bwidth > 32) {
+        return false;
+    }
+
+    /* check dimensions of the level 1 block */
+    lowLimit = 16 / floatsPerElem;
+    if (low->bwidth > lowLimit || low->x > 1 || low->y > lowLimit) {
+        return false;
+    }
+
+    /* check dimensions of the level 0 block */
+    if (top->bwidth > 128 || top->x > 1 || top->y > 128) {
+        return false;
+    }
+
+    /* the two levels together must describe exactly 64 blocks */
+    if ((top->y / low->y) * (top->bwidth / low->bwidth) != 64) {
+        return false;
+    }
+
+    if (top->y > top->bwidth &&
+        top->y / top->bwidth < (top->bwidth / low->bwidth)) {
+        return false;
+    }
+
+    if (check == PGRAN_CHECK) {
+        /* passed PGranularity should be checked */
+        if (pgran->wgSize[0] * pgran->wgSize[1] != 64) {
+            return false;
+        }
+    }
+    else {
+        /* PGranularity should be calculated */
+        pgran->wgDim = 1;
+        pgran->wgSize[1] = 1;
+        pgran->wgSize[0] = 64;
+    }
+
+    return true;
+}
+
+/*
+ * Initialize the memory pattern descriptor for the cached global
+ * memory based block SYMV.
+ */
+void
+initSymvPattern(MemoryPattern *mempat)
+{
+    /* A and B operands: L1 memory level, buffer memory objects */
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+
+    mempat->name = "Cached global memory based block symv";
+    mempat->sops = &symvSops;
+    mempat->extra = &mpatExtra;
+
+    /* two decomposition levels: 0 is cuLevel, 1 is thLevel */
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
diff --git a/src/library/blas/gens/syr2_lds.cpp b/src/library/blas/gens/syr2_lds.cpp
new file mode 100644
index 0000000..9fccb05
--- /dev/null
+++ b/src/library/blas/gens/syr2_lds.cpp
@@ -0,0 +1,372 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * SYR2 Generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+
+#include <kprintf.hpp>
+#include <syr2_her2.clT>
+#include <solution_seq.h>
+//#define DEBUG_SYR2
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Solver flags of the SYR2 pattern: SF_WSPACE_1D combined with
+// SF_TOP_INPUT_SQUARE_BLOCKS.
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_SYR2
+    printf("solverFlags called......\n");
+#endif
+
+    const int flagBits = SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS;
+
+    return (SolverFlags)flagBits;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void*);
+
+extern "C"
+void initSyr2DefaultPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+// Solver operations table for the SYR2 pattern.
+// The initializer is positional, so the order of the entries must match
+// the member order of SolverOps; slots this pattern does not implement
+// are left NULL.
+static SolverOps syr2Ops = {
+    generator,
+    assignKargs,
+    isFitToLDS,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,
+    NULL,
+    solverFlags,
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,
+	selectVectorization
+};
+
+// Decide whether the kernel has to avoid aligned vector access.
+// Returns KEXTRA_NO_COPY_VEC_A for an upper-triangular problem whose N is
+// not a multiple of vlen and for the packed (SPR2) case; KEXTRA_NO_FLAGS
+// otherwise.
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+
+    const bool unalignedUpper =
+        (blasArgs->uplo == clblasUpper) && ((blasArgs->N % vlen) != 0);
+
+    // Packed-case never do aligned access
+    const bool packed = (blasArgs->pigFuncID == CLBLAS_SPR2);
+
+    return (unalignedUpper || packed) ? KEXTRA_NO_COPY_VEC_A : KEXTRA_NO_FLAGS;
+}
+
+
+// Append the SYR2 specific preprocessor definitions to the build options.
+// args points to the SolutionStep whose kernel arguments select the options.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+    const bool isDouble = (blasArgs->dtype == TYPE_DOUBLE);
+    const bool isPacked = (blasArgs->pigFuncID == CLBLAS_SPR2);
+
+    if (isDouble)
+    {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_SYR2
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if (isPacked)
+    {
+        strcat( buildOptStr, " -DPACKED ");
+    }
+}
+
+static CLBLASMpatExtra mpatExtra;
+
+// Initialize the memory pattern descriptor of the LDS based SYR2 solver
+// and the type prefix table used by the generator.
+extern "C"
+void initSyr2DefaultPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_SYR2
+    printf("initSyrDefaultPattern called with mempat = 0x%p\n", (void *)mempat);
+    fflush(stdout);
+#endif
+
+    // type prefixes looked up by the generator through extraFlags->dtype
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+
+    mpatExtra.aMset = 0;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+
+    mempat->name = "LDS based syr";
+    mempat->sops = &syr2Ops;
+    mempat->extra = &mpatExtra;
+
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+/*
+ * Compute the 1D global work size for the SYR2 kernel.
+ *
+ * N is split into blocks of subdims->y rows (TARGETROWS). The global
+ * size is blocks * (blocks + 1) / 2 work groups of pgran->wgSize[0]
+ * items each (presumably one work group per block of the triangular
+ * part of the matrix -- confirm against the kernel template).
+ *
+ * threads  - receives the global work size; threads[1] is set to 1
+ * subdims  - only subdims->y is used
+ * pgran    - only pgran->wgSize[0] is used
+ * args     - CLBlasKargs; only N is used
+ * _extra   - not used
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    int BLOCKSIZE = pgran->wgSize[0]; // 1D Block
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    size_t TARGETROWS = subdims->y;
+
+    (void)_extra;
+
+#ifdef DEBUG_SYR2
+    printf("calcNrThreads called from syr2_lds.cpp\n");
+    printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x);
+    // size_t values are cast explicitly: passing them to %d is undefined
+    printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, (int)TARGETROWS);
+#endif
+
+    // N == 0 must not wrap around in the unsigned "N - 1"
+    size_t blocks = (kargs->N == 0) ? 0 : (((kargs->N - 1) / TARGETROWS) + 1);
+#ifdef DEBUG_SYR2
+    printf("blocks : %d\n", (int)blocks);
+#endif
+
+    threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE;
+    threads[1] = 1;
+#ifdef DEBUG_SYR2
+    printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]);
+#endif
+}
+
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Generate the SYR2 kernel source from the syr2_her2 kernel template.
+ *
+ * buf      - destination for the generated source; when NULL only the
+ *            required buffer size is returned
+ * buflen   - size of buf; not consulted, the kprintf object writes into
+ *            buf without a bound
+ * subdims  - subdims->y becomes %TARGET_ROWS
+ * pgran    - pgran->wgSize[0] becomes %BLOCKSIZE
+ * extra    - CLBLASKernExtra providing flags, dtype and vecLenA
+ *
+ * Returns the fixed buffer size of 64 KiB on success and for a size
+ * query, and 0 when the parameters are rejected (a warning is printed).
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    const ssize_t KERNEL_BUF_SIZE = (ssize_t)(64 * 1024 * sizeof(char));
+    int BLOCKSIZE  = pgran->wgSize[0];
+    char tempTemplate[32*1024];
+    // wide enough for any 64-bit decimal value plus the terminator
+    char targetRows[24], blockSize[24];
+
+    (void)buflen;
+
+    if ( buf == NULL) // return buffer size
+    {
+        return KERNEL_BUF_SIZE;
+    }
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+
+#ifdef DEBUG_SYR2
+    printf("SYR2 GENERATOR called....\n");
+#endif
+
+    clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
+    clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
+    size_t TARGETROWS = subdims->y;
+
+    // a zero vector length would make the modulo divide by zero
+    if ((extraFlags->vecLenA == 0) || ((TARGETROWS % extraFlags->vecLenA) != 0))
+    {
+        printf("WARNING: SYR2: generator: TARGETROWS must be divisible by Vector Length\n");
+        return 0;
+    }
+
+    if (order != clblasColumnMajor)
+    {
+        printf("WARNING: SYR2: Rowmajor order is implemented in columnMajor. This part should never get executed.\n");
+        return 0;
+    }
+
+    // pick the lower or upper triangular template and stage it in a
+    // writable buffer, refusing templates that do not fit
+    const char *kernelTemplate = ( uplo == clblasLower ) ?
+            (const char *)syr2_her2_CL_kernel : (const char *)syr2_her2_CU_kernel;
+    if (strlen(kernelTemplate) >= sizeof(tempTemplate))
+    {
+        printf("WARNING: SYR2: generator: kernel template does not fit the staging buffer\n");
+        return 0;
+    }
+    strcpy(tempTemplate, kernelTemplate);
+
+    // TARGETROWS == 0 would make the modulo divide by zero
+    if ((TARGETROWS == 0) || ((BLOCKSIZE % TARGETROWS) != 0))
+    {
+        printf("WARNING: SYR2: generator: Invalid Block Size\n");
+        return 0;
+    }
+
+#ifdef DEBUG_SYR2
+    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+#endif
+
+    // FIXME: VECTORSIZE HARD CODED
+    // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
+    unsigned int vecLenA = extraFlags->vecLenA;
+
+#ifdef DEBUG_SYR2
+    printf("Vector length used : %d\n\n", vecLenA);
+#endif
+
+    // vload is used when no aligned data pointer is available
+    bool doVLOAD = ((extraFlags->flags & KEXTRA_NO_COPY_VEC_A) != 0);
+#ifdef DEBUG_SYR2
+    if (doVLOAD)
+    {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else
+    {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+#endif
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
+
+    // TARGETROWS is a size_t, so it must not be printed with %d
+    sprintf( targetRows, "%lu", (unsigned long)TARGETROWS );
+    sprintf( blockSize, "%d", BLOCKSIZE );
+
+#ifdef DEBUG_SYR2
+    printf("TARGET ROWS = %s\n", targetRows);
+    printf("BLOCK SIZE = %s\n", blockSize);
+#endif
+
+    kobj.put("%TARGET_ROWS", (const char *)targetRows);
+    kobj.put("%BLOCKSIZE", (const char *) blockSize);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return KERNEL_BUF_SIZE;
+}
+
+/*
+( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha)
+*/
+// Fill the kernel argument array in the order of the kernel signature:
+// A, X, Y, N, offx, incx, offy, incy, offa, lda, alpha.
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    cl_int incx = kargs->ldb.vector;
+    cl_int incy = kargs->ldc.vector;
+
+    // buffers: A is the input/output matrix, B holds x, C holds y
+    INIT_KARG(&args[0], kargs->A);
+    INIT_KARG(&args[1], kargs->B);
+    INIT_KARG(&args[2], kargs->C);
+
+    initSizeKarg(&args[3], kargs->N);
+
+    // x vector: offset and increment
+    initSizeKarg(&args[4], kargs->offBX);
+    INIT_KARG(&args[5], incx);
+
+    // y vector: offset and increment
+    initSizeKarg(&args[6], kargs->offCY);
+    INIT_KARG(&args[7], incy);
+
+    // matrix: offset and leading dimension
+    initSizeKarg(&args[8], kargs->offa);
+    initSizeKarg(&args[9], kargs->lda.matrix);
+
+    assignScalarKarg(&args[10], &(kargs->alpha), kargs->dtype);
+}
+
+/*
+ * Check whether the local memory needed by the SYR2 kernel fits into
+ * ldsSize bytes.
+ *
+ * dim        - dim->y is the number of elements per shared buffer
+ * dtype      - element type; its size is obtained through dtypeSize()
+ * ldsSize    - available local memory in bytes
+ * kernelArgs - not used
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    (void)kernelArgs;
+
+    // 4 buffers for xShared, yShared, xSharedTrans and ySharedTrans and
+    // 2 integers for the values of iShared and jShared.
+    // The element size comes from dtypeSize(); sizeof(dtype) would only
+    // measure the DataType enum and underestimate wider element types.
+    const cl_ulong maxSize =
+        ((cl_ulong)dim->y * 4 * dtypeSize(dtype)) + (2 * sizeof(int));
+
+    return (maxSize <= ldsSize);
+}
+//#undef DEBUG_SYR2
+
diff --git a/src/library/blas/gens/syr_lds.cpp b/src/library/blas/gens/syr_lds.cpp
new file mode 100644
index 0000000..0a12ef4
--- /dev/null
+++ b/src/library/blas/gens/syr_lds.cpp
@@ -0,0 +1,367 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * SYR Generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+
+#include <kprintf.hpp>
+#include <syr_her.clT>
+#include <solution_seq.h>
+//#define DEBUG_SYR
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+// Solver flags of the SYR pattern: SF_WSPACE_1D combined with
+// SF_TOP_INPUT_SQUARE_BLOCKS.
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_SYR
+    printf("solverFlags called......\n");
+#endif
+
+    const int flagBits = SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS;
+
+    return (SolverFlags)flagBits;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void*);
+
+extern "C"
+void initSyrDefaultPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+// Solver operations table for the SYR pattern.
+// The initializer is positional, so the order of the entries must match
+// the member order of SolverOps; slots this pattern does not implement
+// are left NULL.
+static SolverOps syrOps = {
+    generator,
+    assignKargs,
+    isFitToLDS,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,
+    NULL,
+    solverFlags,
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,
+	selectVectorization
+};
+
+// Decide whether the kernel has to avoid aligned vector access.
+// Returns KEXTRA_NO_COPY_VEC_A for an upper-triangular problem whose N is
+// not a multiple of vlen and for the packed (SPR) case; KEXTRA_NO_FLAGS
+// otherwise.
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
+
+    const bool unalignedUpper =
+        (blasArgs->uplo == clblasUpper) && ((blasArgs->N % vlen) != 0);
+
+    // Packed-case never do aligned access
+    const bool packed = (blasArgs->pigFuncID == CLBLAS_SPR);
+
+#ifdef DEBUG_SYR
+    printf("SYR: selectVectorization being called\n");
+#endif
+    return (unalignedUpper || packed) ? KEXTRA_NO_COPY_VEC_A : KEXTRA_NO_FLAGS;
+}
+
+
+// Append the SYR specific preprocessor definitions to the build options.
+// args points to the SolutionStep whose kernel arguments select the options.
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+    const bool isDouble = (blasArgs->dtype == TYPE_DOUBLE);
+    const bool isPacked = (blasArgs->pigFuncID == CLBLAS_SPR);
+
+    if (isDouble)
+    {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_SYR
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if (isPacked)
+    {
+        strcat( buildOptStr, " -DPACKED ");
+    }
+}
+
+static CLBLASMpatExtra mpatExtra;
+
+// Initialize the memory pattern descriptor of the LDS based SYR solver
+// and the type prefix table used by the generator.
+extern "C"
+void initSyrDefaultPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_SYR
+    printf("initSyrDefaultPattern called with mempat = 0x%p\n", (void *)mempat);
+    fflush(stdout);
+#endif
+
+    // type prefixes looked up by the generator through extraFlags->dtype
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+
+    mpatExtra.aMset = 0;
+    mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY;
+    mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector
+    mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY;
+
+    mempat->name = "LDS based syr";
+    mempat->sops = &syrOps;
+    mempat->extra = &mpatExtra;
+
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+/*
+ * Compute the 1D global work size for the SYR kernel.
+ *
+ * N is split into blocks of subdims->y rows (TARGETROWS). The global
+ * size is blocks * (blocks + 1) / 2 work groups of pgran->wgSize[0]
+ * items each (presumably one work group per block of the triangular
+ * part of the matrix -- confirm against the kernel template).
+ *
+ * threads  - receives the global work size; threads[1] is set to 1
+ * subdims  - only subdims->y is used
+ * pgran    - only pgran->wgSize[0] is used
+ * args     - CLBlasKargs; only N is used
+ * _extra   - not used
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    int BLOCKSIZE = pgran->wgSize[0]; // 1D Block
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    size_t TARGETROWS = subdims->y;
+
+    (void)_extra;
+
+#ifdef DEBUG_SYR
+    printf("calcNrThreads called from syr_reg.cpp\n");
+    printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x);
+    // size_t values are cast explicitly: passing them to %d is undefined
+    printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, (int)TARGETROWS);
+#endif
+
+    // N == 0 must not wrap around in the unsigned "N - 1"
+    size_t blocks = (kargs->N == 0) ? 0 : (((kargs->N - 1) / TARGETROWS) + 1);
+#ifdef DEBUG_SYR
+    printf("blocks : %d\n", (int)blocks);
+#endif
+
+    threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE;
+    threads[1] = 1;
+#ifdef DEBUG_SYR
+    printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]);
+#endif
+}
+
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Generate the SYR kernel source from the syr_her kernel template.
+ *
+ * buf      - destination for the generated source; when NULL only the
+ *            required buffer size is returned
+ * buflen   - size of buf; not consulted, the kprintf object writes into
+ *            buf without a bound
+ * subdims  - subdims->y becomes %TARGET_ROWS
+ * pgran    - pgran->wgSize[0] becomes %BLOCKSIZE
+ * extra    - CLBLASKernExtra providing flags, dtype and vecLenA
+ *
+ * Returns the fixed buffer size of 64 KiB on success and for a size
+ * query, and 0 when the parameters are rejected (a warning is printed).
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    const ssize_t KERNEL_BUF_SIZE = (ssize_t)(64 * 1024 * sizeof(char));
+    int BLOCKSIZE  = pgran->wgSize[0];
+    char tempTemplate[32*1024];
+    // wide enough for any 64-bit decimal value plus the terminator
+    char targetRows[24], blockSize[24];
+
+    (void)buflen;
+
+    if ( buf == NULL) // return buffer size
+    {
+        return KERNEL_BUF_SIZE;
+    }
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+
+#ifdef DEBUG_SYR
+    printf("SYR GENERATOR called....\n");
+#endif
+
+    clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
+    clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
+    size_t TARGETROWS = subdims->y;
+
+    // a zero vector length would make the modulo divide by zero
+    if ((extraFlags->vecLenA == 0) || ((TARGETROWS % extraFlags->vecLenA) != 0))
+    {
+        printf("WARNING: SYR: generator: TARGETROWS must be divisible by Vector Length\n");
+        return 0;
+    }
+
+    if (order != clblasColumnMajor)
+    {
+        printf("WARNING: SYR: Rowmajor order is implemented in columnMajor. This part should never get executed.\n");
+        return 0;
+    }
+
+    // pick the lower or upper triangular template and stage it in a
+    // writable buffer, refusing templates that do not fit
+    const char *kernelTemplate = ( uplo == clblasLower ) ?
+            (const char *)syr_her_CL_kernel : (const char *)syr_her_CU_kernel;
+    if (strlen(kernelTemplate) >= sizeof(tempTemplate))
+    {
+        printf("WARNING: SYR: generator: kernel template does not fit the staging buffer\n");
+        return 0;
+    }
+    strcpy(tempTemplate, kernelTemplate);
+
+    // TARGETROWS == 0 would make the modulo divide by zero
+    if ((TARGETROWS == 0) || ((BLOCKSIZE % TARGETROWS) != 0))
+    {
+        printf("WARNING: SYR: generator: Invalid Block Size\n");
+        return 0;
+    }
+
+#ifdef DEBUG_SYR
+    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+#endif
+
+    // FIXME: VECTORSIZE HARD CODED
+    // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
+    unsigned int vecLenA = extraFlags->vecLenA;
+
+#ifdef DEBUG_SYR
+    printf("Vector length used : %d\n\n", vecLenA);
+#endif
+
+    // vload is used when no aligned data pointer is available
+    bool doVLOAD = ((extraFlags->flags & KEXTRA_NO_COPY_VEC_A) != 0);
+#ifdef DEBUG_SYR
+    if (doVLOAD)
+    {
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+    }
+    else
+    {
+        printf("Using Aligned Data Pointer .........................\n");
+    }
+#endif
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
+
+    // TARGETROWS is a size_t, so it must not be printed with %d
+    sprintf( targetRows, "%lu", (unsigned long)TARGETROWS );
+    sprintf( blockSize, "%d", BLOCKSIZE );
+
+#ifdef DEBUG_SYR
+    printf("TARGET ROWS = %s\n", targetRows);
+    printf("BLOCK SIZE = %s\n", blockSize);
+#endif
+
+    kobj.put("%TARGET_ROWS", (const char *)targetRows);
+    kobj.put("%BLOCKSIZE", (const char *) blockSize);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return KERNEL_BUF_SIZE;
+}
+
+/*
+	 __global %TYPE* _A, __global const %TYPE* _X, int N, int offx, int incx, int offa, int lda, %PTYPE alpha
+*/
+// Fill the kernel argument array in the order of the kernel signature:
+// A, X, N, offx, incx, offa, lda, alpha.
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)params;
+    cl_int incx = kargs->ldb.vector;
+
+    // buffers: A is the input/output matrix, B holds x
+    INIT_KARG(&args[0], kargs->A);
+    INIT_KARG(&args[1], kargs->B);
+
+    initSizeKarg(&args[2], kargs->N);
+
+    // x vector: offset and increment
+    initSizeKarg(&args[3], kargs->offBX);
+    INIT_KARG(&args[4], incx);
+
+    // matrix: offset and leading dimension
+    initSizeKarg(&args[5], kargs->offA);
+    initSizeKarg(&args[6], kargs->lda.matrix);
+
+    assignScalarKarg(&args[7], &(kargs->alpha), kargs->dtype);
+}
+
+/*
+ * Check whether the local memory needed by the SYR kernel fits into
+ * ldsSize bytes.
+ *
+ * dim        - dim->y is the number of elements per shared buffer
+ * dtype      - element type; its size is obtained through dtypeSize()
+ * ldsSize    - available local memory in bytes
+ * kernelArgs - not used
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    (void)kernelArgs;
+
+    // 2 buffers for xShared and yShared and 2 integers for the values
+    // of iShared and jShared.
+    // The element size comes from dtypeSize(); sizeof(dtype) would only
+    // measure the DataType enum and underestimate wider element types.
+    const cl_ulong maxSize =
+        ((cl_ulong)dim->y * 2 * dtypeSize(dtype)) + (2 * sizeof(int));
+
+    return (maxSize <= ldsSize);
+}
+//#undef DEBUG_SYR
diff --git a/src/library/blas/gens/syrxk.c b/src/library/blas/gens/syrxk.c
new file mode 100644
index 0000000..a0f6a29
--- /dev/null
+++ b/src/library/blas/gens/syrxk.c
@@ -0,0 +1,2594 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/**
+ *  SYRK and SYR2K kernel generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <clBLAS.h>
+#include <clblas_stddef.h>
+#include <blas_mempat.h>
+#include <solution_seq.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <matrix_dims.h>
+#include <dis_warning.h>
+
+#include "init.h"
+#include "blas_kgen.h"
+#include "gen_helper.h"
+#include "blas_subgroup.h"
+#include "tile_iter.h"
+
+/*
+ * Priorities, within a statement batch, of the different kinds of
+ * statements that make up the result update around the diagonal.
+ * The enumerators are listed in the order: coordinate calculation, fetch,
+ * multiply-add, store.
+ */
+enum {
+    CALC_COORDS_STMT_PRIO,
+    FETCH_STMT_PRIO,
+    MAD_STMT_PRIO,
+    STORE_STMT_PRIO
+};
+
+/*
+ * Size limits of the generator.
+ * NOTE(review): judging by the names these bound the storage used by the
+ * diagonal result update and the length of a fetch clause; the code using
+ * them is outside this part of the file -- confirm there.
+ */
+enum {
+    MAX_DIAG_UPRES_STORAGE_SIZE = 95,
+    MAX_FETCH_CLAUSE_SIZE = 8
+};
+
+/*
+ * Private extra data of the generator.
+ * NOTE(review): the meaning of 'staggered' is not visible in this part of
+ * the file -- confirm at the point of use.
+ */
+typedef struct {
+    size_t staggered;
+} extraData_t;
+
+/*
+ * Description of one matrix as needed by genSetupPointers() to produce
+ * the statement initializing the pointer to that matrix.
+ */
+struct SetupPtrAttrs {
+    MatrixRole mrole;           // role of the matrix (MATRIX_A/B/C)
+    const char *basePtr;        // name of the base pointer in the kernel
+    const char *ldName;         // name of the leading dimension variable
+    const char *offName;        // name of the offset variable
+    KernelExtraFlags offMask;   // kernel flag telling the offset is in use
+};
+
+/*
+ * Private part of the kernel extra information for SYRK/SYR2K.
+ * NOTE(review): maxVlenC presumably is the largest vector length usable
+ * for accessing matrix C; it is not used in this part of the file --
+ * confirm at the point of use.
+ */
+typedef struct SyrxkExtraPriv {
+    unsigned int maxVlenC;
+} MAY_ALIAS SyrxkExtraPriv;
+
+static CLBLASMpatExtra mpatExtra;
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra,
+   BlasFunctionID funcID);
+
+static void
+assignKargs(
+    KernelArg *args,
+    const CLBlasKargs *blasArgs,
+    KernelExtraFlags kflags,
+    BlasFunctionID funcID);
+
+static void
+syrkAssignKargs(KernelArg *args, const void *params, const void *extra);
+
+static void
+syr2kAssignKargs(KernelArg *args, const void *params, const void *extra);
+
+static SolverFlags
+solverFlags(void);
+
+static void fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+static bool
+checkCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check);
+
+static void
+syrkCalcThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+/*
+ * Generator callback installed in the SYRK solver tables. It forwards all
+ * its arguments to the common generator() with the function ID fixed to
+ * CLBLAS_SYRK and returns the result of that call unchanged.
+ */
+static ssize_t
+syrkGenerator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    const BlasFunctionID funcID = CLBLAS_SYRK;
+
+    return generator(buf, buflen, subdims, pgran, extra, funcID);
+}
+
+/*
+ * Generator callback installed in the SYR2K solver tables. It forwards all
+ * its arguments to the common generator() with the function ID fixed to
+ * CLBLAS_SYR2K and returns the result of that call unchanged.
+ */
+static ssize_t
+syr2kGenerator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    const BlasFunctionID funcID = CLBLAS_SYR2K;
+
+    return generator(buf, buflen, subdims, pgran, extra, funcID);
+}
+
+static bool
+subgCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check);
+
+static int
+syrkSubgGetPerf(
+    unsigned int kflags,
+    const void *args);
+
+static int
+syrkSubgGetDefaultDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs );
+
+static int
+syrkBlockGetPerf(
+    unsigned int kflags,
+    const void *args);
+
+#if 0
+static int
+syrkBlockGetDefaultDecomp(
+        PGranularity *pgran,
+        SubproblemDim *subdims,
+        unsigned int subdimsNum);
+#endif
+
+// ----------------------------------------------------------------------------
+
+/*
+ * Callback table of the SYRK solver that uses syrkBlockGetPerf() and
+ * checkCalcDecomp(). The initializer is positional, so the order of the
+ * entries must follow the declaration of SolverOps. A NULL entry means
+ * that the callback is not provided (getDefaultDecomp among them).
+ */
+static SolverOps syrkSolverOps = {
+    syrkGenerator,
+    syrkAssignKargs,
+    NULL,
+    syrkBlockGetPerf,
+    NULL,
+    syrkCalcThreads,
+    NULL,
+    solverFlags,
+    fixupArgs,
+    NULL,//getDefaultDecomp
+	checkCalcDecomp,
+	NULL,
+	NULL
+};
+
+/*
+ * Callback table of the SYR2K solver that uses syrkBlockGetPerf() and
+ * checkCalcDecomp(). It differs from the SYRK table only in the generator
+ * and the kernel argument assignment callbacks. The initializer is
+ * positional, so the order of the entries must follow the declaration of
+ * SolverOps. A NULL entry means that the callback is not provided.
+ */
+static SolverOps syr2kSolverOps = {
+    syr2kGenerator,
+    syr2kAssignKargs,
+    NULL,
+    syrkBlockGetPerf,
+    NULL,
+    syrkCalcThreads,
+    NULL,
+    solverFlags,
+    fixupArgs,
+    NULL,//getDefaultDecomp
+   	checkCalcDecomp,
+   	NULL,
+   	NULL
+};
+
+/*
+ * Callback table of the SYRK solver that uses the subgroup specific
+ * callbacks syrkSubgGetPerf(), syrkSubgGetDefaultDecomp() and
+ * subgCheckCalcDecomp(). The initializer is positional, so the order of
+ * the entries must follow the declaration of SolverOps. A NULL entry means
+ * that the callback is not provided.
+ */
+static SolverOps syrkSubgSops = {
+    syrkGenerator,
+    syrkAssignKargs,
+    NULL,
+    syrkSubgGetPerf,
+    NULL,
+    syrkCalcThreads,
+    NULL,
+    solverFlags,
+    fixupArgs,
+    syrkSubgGetDefaultDecomp,
+    subgCheckCalcDecomp,
+	NULL,
+	NULL
+};
+
+/*
+ * Callback table of the SYR2K solver that uses the subgroup specific
+ * callbacks syrkSubgGetPerf(), syrkSubgGetDefaultDecomp() and
+ * subgCheckCalcDecomp(). It differs from the SYRK subgroup table only in
+ * the generator and the kernel argument assignment callbacks. The
+ * initializer is positional, so the order of the entries must follow the
+ * declaration of SolverOps. A NULL entry means that the callback is not
+ * provided.
+ */
+static SolverOps syr2kSubgSops = {
+    syr2kGenerator,
+    syr2kAssignKargs,
+    NULL,
+    syrkSubgGetPerf,
+    NULL,
+    syrkCalcThreads,
+    NULL,
+    solverFlags,
+    fixupArgs,
+    syrkSubgGetDefaultDecomp,
+    subgCheckCalcDecomp,
+   	NULL,
+   	NULL
+};
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Add to the kernel a statement evaluating the number of blocks in a panel:
+ *
+ *     <varName> = (<end>[ - <start>][ + (dim->y - 1)]) / dim->y;
+ *
+ * ctx      - generator context the statement is added to
+ * varName  - name of the variable to assign
+ * roundDir - if not zero, the division is rounded up, otherwise down
+ * dim      - subproblem dimensions; only the 'y' size is used
+ * start    - expression for the panel start; an empty string leaves the
+ *            subtraction out
+ * end      - expression for the panel end
+ */
+static void
+genPanelBlocksStmt(
+    struct KgenContext *ctx,
+    const char *varName,
+    int roundDir,
+    const SubproblemDim *dim,
+    const char *start,
+    const char *end)
+{
+    char tmp[1024];
+    char *p;
+    /*
+     * Converted explicitly so that the argument always has exactly the
+     * type the "%lu" conversion expects, however the field is declared.
+     */
+    const unsigned long ySize = (unsigned long)dim->y;
+
+    p = tmp + sprintf(tmp, "%s = (%s", varName, end);
+    if (start[0] != '\0') {
+        p += sprintf(p, " - %s", start);
+    }
+
+    if (roundDir) {
+        p += sprintf(p, " + %lu", ySize - 1);
+    }
+    sprintf(p, ") / %lu;\n", ySize);
+
+    kgenAddStmt(ctx, tmp);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate the statements setting up the pointers to the matrices at the
+ * start coordinates of the work item.
+ *
+ * Each statement has the form "<dst> = <base> + <expr>;", or
+ * "<dst> = <base>;" if the expression turns out to be "0" or empty (the
+ * latter form is never produced for C). The pointers being set are
+ * gset->varNames.B, gset->varNames.A and "C".
+ *
+ * ctx      - generator context
+ * gset     - generator settings
+ * funcID   - CLBLAS_SYRK or CLBLAS_SYR2K; for SYRK both input pointers are
+ *            set from matrix A
+ * addrMode - only the FETCH_ADDR_A_RELATIVE and FETCH_ADDR_B_RELATIVE flags
+ *            are checked; they select whether the start coordinate takes
+ *            part in the expression
+ * rank     - for SYR2K, a non zero value swaps the matrices the two input
+ *            pointers are set from; it also leaves out setting of C
+ */
+static void
+genSetupPointers(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    FetchAddrMode addrMode,
+    int rank)
+{
+    const CLBLASKernExtra *kextra = gset->kextra;
+    const char *dstPtr;
+    const char *coordName;
+    const struct SetupPtrAttrs attrs[3] = {
+        {MATRIX_A, "A", "lda", "offA", KEXTRA_A_OFF_NOT_ZERO},
+        {MATRIX_B, "B", "ldb", "offB", KEXTRA_BX_OFF_NOT_ZERO},
+        {MATRIX_C, "C", "ldc", "offC", KEXTRA_CY_OFF_NOT_ZERO}
+    };
+    const struct SetupPtrAttrs *attr;
+    int idx;
+    int i;
+    Kstring k1, k2, k3;
+    Kstring madExpr;
+    unsigned int scale;
+    unsigned int vecLen;
+    FetchAddrMode relFlag;
+
+    /*
+     * Pointers are serviced in the following order:
+     * B for tilemul, A for tilemul, C
+     */
+    for (i = 0; i < 3; i++) {
+        // The output pointer should be shifted once in case of 2-rank update
+        if ((i == 2) && rank) {
+            break;
+        }
+
+        emptyKstring(&k1);
+        emptyKstring(&k2);
+        emptyKstring(&k3);
+        scale = 0;
+
+        // select start coordinate
+        relFlag = (i) ? FETCH_ADDR_A_RELATIVE : FETCH_ADDR_B_RELATIVE;
+        if (addrMode & relFlag) {
+            coordName = (i) ? "coord.y" : "coord.x";
+            kstrcpy(&k2, coordName);
+        }
+
+        /*
+         * Select the pointer to assign and the index in the attribute
+         * array. The destination name is only read, so it is referenced
+         * in place rather than copied into a fixed size buffer.
+         */
+        if (i == 2) {
+            dstPtr = "C";
+            idx = 2;
+        }
+        else if (i == 1) {
+            dstPtr = gset->varNames.A;
+            idx = (funcID == CLBLAS_SYRK) ? 0 : rank;
+        }
+        else {
+            dstPtr = gset->varNames.B;
+            idx = (funcID == CLBLAS_SYRK) ? 0 : (1 - rank);
+        }
+        attr = &attrs[idx];
+
+        vecLen = getVecLen(gset, funcID, attr->mrole);
+
+        // construct expression
+        if (attr->mrole != MATRIX_C) {
+            if (isMatrixAccessColMaj(funcID, kextra->flags, attr->mrole)) {
+                kstrcpy(&k1, "1");
+                scale = vecLen;
+            }
+            else {
+                kstrcpy(&k1, attr->ldName);
+            }
+        }
+
+        if (kextra->flags & attr->offMask) {
+            if ((attr->mrole == MATRIX_C) || (vecLen == 1)) {
+                kstrcpy(&k3, attr->offName);
+            }
+            else {
+                // the offset is shifted according to the vector length
+                int shift = findHighestSetBit(vecLen);
+
+                ksprintf(&k3, "(%s >> %d)", attr->offName, shift);
+            }
+        }
+        sprintfFastScalarMad(&madExpr, &k1, &k2, scale, &k3);
+
+        // check if it is not "0" or empty string
+        if (strlen(madExpr.buf) <= 1) {
+            if (attr->mrole != MATRIX_C) {
+                kgenPrintf(ctx, "%s = %s;\n", dstPtr, attr->basePtr);
+            }
+        }
+        else {
+            kgenPrintf(ctx, "%s = %s + %s;\n",
+                       dstPtr, attr->basePtr, madExpr.buf);
+        }
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Declare the kernel function.
+ *
+ * The kernel is named "<prefix>syr<rank>k<nameSuffix>", where <prefix> is
+ * the BLAS prefix of the data type and <rank> is "2" for SYR2K and empty
+ * otherwise. The argument list depends on the function and on the kernel
+ * flags:
+ *   - B and ldb are present only for SYR2K;
+ *   - beta is left out if KEXTRA_BETA_ZERO is set;
+ *   - offA, offB, offC are appended if the respective
+ *     KEXTRA_*_OFF_NOT_ZERO flag is set.
+ * The required work group size is taken from gset->pgran->wgSize[0].
+ */
+static void
+declareKernel(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    const char* nameSuffix )
+{
+    const DataType dtype = gset->kextra->dtype;
+    const KernelExtraFlags kflags = gset->kextra->flags;
+    const PGranularity *pgran = gset->pgran;
+    const char *elemType, *typeA;
+    const char *rankStr = "";
+    char decl[1024];
+    char betaArg[64] = "";
+    char argsB[64] = "";
+    char offArgs[256] = "";
+    char prefix;
+    unsigned int vlen;
+
+    elemType = dtypeBuiltinType(dtype);
+    vlen = getVecLen(gset, funcID, MATRIX_A);
+    getVectorTypeName(dtype, vlen, &typeA, NULL);
+    prefix = dtypeToBlasPrefix(dtype);
+
+    // beta takes part only if it is not known to be zero
+    if (!(kflags & KEXTRA_BETA_ZERO)) {
+        sprintf(betaArg, "    const %s beta,\n", elemType);
+    }
+
+    // the second input matrix exists only for the 2-rank update
+    if (funcID == CLBLAS_SYR2K) {
+        const char *typeB;
+
+        rankStr = "2";
+        vlen = getVecLen(gset, funcID, MATRIX_B);
+        getVectorTypeName(dtype, vlen, &typeB, NULL);
+        sprintf(argsB, "    const __global %s *restrict B,\n"
+                       "    uint ldb,\n",
+                typeB);
+    }
+
+    // trailing offset arguments, in the order A, B, C
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        strcat(offArgs, ",\n    uint offA");
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        strcat(offArgs, ",\n    uint offB");
+    }
+    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
+        strcat(offArgs, ",\n    uint offC");
+    }
+
+    sprintf(decl, "__attribute__((reqd_work_group_size(%u, 1, 1)))\n"
+                  "void __kernel\n"
+                  "%csyr%sk%s(\n"
+                  "    uint N,\n"
+                  "    const uint K,\n"
+                  "    const %s alpha,\n"
+                  "    const __global %s *restrict A,\n"
+                  "    uint lda,\n"
+                  "%s"   // B and ldb
+                  "%s"   // beta
+                  "    __global %s *C,\n"
+                  "    uint ldc,\n"
+                  "    const uint startN,\n"
+                  "    const uint origN%s)\n",
+            pgran->wgSize[0], prefix, rankStr, nameSuffix, elemType, typeA,
+            argsB, betaArg, elemType, offArgs);
+
+    kgenDeclareFunction(ctx, decl);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate the head of the kernel body: declarations of the work variables
+ * and the code mapping the work group ("block") and the work item onto the
+ * start coordinates of its tile (coord.x, coord.y).
+ *
+ * ctx         - generator context
+ * gset        - generator settings; in the subgroup mode varNames.LDS is
+ *               set to "scratch"
+ * funcID      - CLBLAS_SYRK or CLBLAS_SYR2K; for SYRK a local pointer 'B'
+ *               is declared in the kernel
+ * pSubgVNames - variable names for the subgroup mode; it is filled
+ *               (itemId, subgCoord) and used only if subgMode is set
+ * subgMode    - generate code for the subgroup mode. In this mode a work
+ *               item getting out of the matrix or fully out of the diagonal
+ *               sets 'skipTilemul' to 1; otherwise such a work item
+ *               returns from the kernel at once.
+ *
+ * The upper and the lower triangular cases are generated by two separate
+ * branches. With KEXTRA_SYRK_SEPARATE_DIAGONAL the area around the diagonal
+ * is left out, and if KEXTRA_SYRK_EVALUATE_DIAGONAL is set as well, only
+ * that area is covered.
+ */
+static void
+genHead(
+    struct KgenContext *ctx,
+    BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    SubgVarNames *pSubgVNames,
+    bool subgMode)
+{
+    char tmp[1024], tmp1[128];
+    char start[128], end[128];
+    char *p;
+    const char *vecTypeA;
+    unsigned int vlenA, vlenB;
+    unsigned int l1Pans;
+    const SubproblemDim *dim = gset->subdims;
+    const CLBLASKernExtra *kextra = gset->kextra;
+    KernelExtraFlags kflags = kextra->flags;
+    KernelExtraFlags diagFlags = KEXTRA_SYRK_SEPARATE_DIAGONAL |
+                                 KEXTRA_SYRK_EVALUATE_DIAGONAL;
+    bool isDiagSep= ((kflags & KEXTRA_SYRK_SEPARATE_DIAGONAL) != 0);
+    bool isEvalOnlyDiag = ((kflags & diagFlags) == diagFlags);
+    /*
+     * Subproblem sizes converted once to the exact type expected by the
+     * "%lu" conversions used below, however the fields are declared.
+     */
+    const unsigned long dim0x = (unsigned long)dim[0].x;
+    const unsigned long dim0y = (unsigned long)dim[0].y;
+    const unsigned long dim1x = (unsigned long)dim[1].x;
+    const unsigned long dim1y = (unsigned long)dim[1].y;
+
+    // number of the tile level panels in the 'y' size of the block
+    l1Pans = (unsigned int)(dim[0].y / dim[1].y);
+
+    vlenA = getVecLen(gset, funcID, MATRIX_A);
+    vlenB = getVecLen(gset, funcID, MATRIX_B);
+    getVectorTypeName(kextra->dtype, vlenA, &vecTypeA, NULL);
+
+    // the variable stores N, passed as argument.
+    // this variable is used for C matrix hit check
+    kgenPrintf( ctx, "uint argN = N;\n" );
+
+    if ( subgMode ) {
+        /*
+         * Number of work items in a subgroup. It is evaluated only here
+         * because the 'bwidth' ratio is needed in the subgroup mode only.
+         * Like l1Pans, it is converted to unsigned int and printed with
+         * "%u" so that the argument matches the conversion.
+         */
+        unsigned int subgItems =
+            (unsigned int)(dim[0].bwidth / dim[1].bwidth);
+
+        gset->varNames.LDS = "scratch";
+
+        // declaring variables used by subgroup mode
+        pSubgVNames->itemId = "itemId";
+        pSubgVNames->subgCoord = "subgCoord";
+
+        kgenAddBlankLine( ctx );
+        kgenAddBlankLine(ctx);
+
+        kgenPrintf(ctx, "int skipTilemul = 0;\n" );
+        kgenPrintf(ctx, "int2 %s;\n", pSubgVNames->itemId );
+        kgenPrintf(ctx, "int2 %s;\n", pSubgVNames->subgCoord);
+
+        // item ID
+        kgenPrintf( ctx,
+            "%s.x = get_local_id(0)%%%u;\n",
+            pSubgVNames->itemId,
+            subgItems);
+
+        // subgroup ID
+        kgenPrintf( ctx,
+            "%s.y = get_local_id(0)/%u;\n",
+            pSubgVNames->itemId,
+            subgItems);
+
+        // subgroup coordX
+        kgenPrintf( ctx,
+            "%s.x = %s.y/%u;\n",
+            pSubgVNames->subgCoord,
+            pSubgVNames->itemId,
+            l1Pans );
+
+        // subgroup coordY
+        kgenPrintf( ctx,
+            "%s.y = %s.y%%%u;\n",
+            pSubgVNames->subgCoord,
+            pSubgVNames->itemId,
+            l1Pans );
+
+    }
+
+    // the SYRK kernel has no B argument, so B is declared as a local pointer
+    if (funcID == CLBLAS_SYRK) {
+        sprintf(tmp, "__global %s *B;\n", vecTypeA);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    if (kflags & KEXTRA_SYRK_2K_RANK) {
+        const char *vecTypeB;
+
+        getVectorTypeName(kextra->dtype, vlenB, &vecTypeB, NULL);
+        sprintf(tmp, "__global %s *wiA;\n"
+                     "__global %s *wiB;\n", vecTypeA, vecTypeB);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    kgenAddStmt(ctx, "uint4 coord = 0;\n"   /* contains coordB, coordA, k */
+                     "uint k0 = 0;\n\n");
+
+    // extra variables needed for the upper triangular case
+    if ( kflags & KEXTRA_UPPER_TRIANG ) {
+        if (kflags & KEXTRA_TAILS_N) {
+            kgenAddStmt(ctx, "uint step;\n");
+        }
+        kgenAddStmt(ctx, "uint w;\n");
+    }
+
+    kgenAddStmt(ctx, "const int lid = get_local_id(0);\n"
+                     "uint block = get_group_id(0);\n\n");
+
+    /*
+     * Increase/decrease the outer block coordinate while the inner block number
+     * exceeds the number of blocks. Inner block number is counted from the
+     * diagonal up to the matrix edge. A is always the inner matrix. It is from
+     * the largest panel. The resulting block number determines starting
+     * coordinates.
+     *
+     * In the case of separate evaluating of the area around the diagonal it's
+     * critically important that at least one step would be aligned.
+     * Otherwise, solution areas will overlap that will lead to a wrong result.
+     */
+    if ( kflags & KEXTRA_UPPER_TRIANG ) {
+        /*
+         * NOTE(review): stepCalc stays NULL if there are no tails along N,
+         * so kgenAddStmt() is called with a NULL statement below;
+         * presumably it tolerates that -- confirm.
+         */
+        char step[128], tmp2[128], *stepCalc = NULL;
+        int roundDir;
+
+        if ((kflags & KEXTRA_TAILS_N)) {
+            sprintf(tmp2, "step = (coord.x %% %lu) ? (coord.x %% %lu) : %lu;\n",
+                    dim0x, dim0x, dim0x);
+            stepCalc = tmp2;
+            sprintf(step, "step");
+        }
+        else {
+            tmp2[0] = '\0';
+            sprintf(step, "%lu", dim0x);
+        }
+
+        if (!isEvalOnlyDiag) {
+            start[0] = '\0';
+        }
+        else {
+            sprintf(start, "(coord.x - %s) / %lu * %lu",
+                    step, dim0y, dim0y);
+        }
+
+        if (!isDiagSep || isEvalOnlyDiag) {
+            strcpy(end, "coord.x");
+            roundDir = 1;            // round up
+        }
+        else {
+            sprintf(end, "(coord.x - %s) / %lu * %lu",
+                    step, dim0y, dim0y);
+            roundDir = 0;            // round down
+        }
+
+        if (!isEvalOnlyDiag) {
+            kgenAddStmt(ctx, "coord.x = origN;\n");
+            kgenAddStmt(ctx, stepCalc);
+            sprintf(tmp, "w = (origN - startN - N + %lu) / %lu * %lu;\n"
+                         "k0 = (N + %lu) / %lu;\n"
+                         "if (block <= k0 * (w / %lu)) {\n"
+                         "    coord.x -= (block / k0) * %lu;\n"
+                         "    block %%= k0;\n"
+                         "}\n",
+                    dim0x - 1, dim0x, dim0x, dim0y - 1,
+                    dim0y, dim0x, dim0x);
+            kgenAddStmt(ctx, tmp);
+            kgenBeginBranch(ctx, "else");
+            sprintf(tmp, "coord.x = N;\n"
+                         "block -= k0 * (w / %lu);\n",
+                    dim0x);
+            kgenAddStmt(ctx, tmp);
+            kgenAddStmt(ctx, stepCalc);
+        }
+        else {
+            kgenAddStmt(ctx, "coord.x = N;\n");
+            kgenAddStmt(ctx, stepCalc);
+        }
+
+        if (isDiagSep) {
+            genPanelBlocksStmt(ctx, "k0", roundDir, dim, start, end);
+        }
+
+        kgenBeginBranch(ctx, "while (block >= k0)");
+        kgenAddStmt(ctx, "block -= k0;\n");
+        sprintf(tmp, "coord.x -= %s;\n", step);
+        kgenAddStmt(ctx, tmp);
+        kgenAddStmt(ctx, stepCalc);
+        genPanelBlocksStmt(ctx, "k0", roundDir, dim, start, end);
+        kgenEndBranch(ctx, NULL);
+        kgenAddStmt(ctx, "coord.x += startN;\n");
+
+        // close the "else" branch opened above
+        if (!isEvalOnlyDiag) {
+            kgenEndBranch(ctx, NULL);
+        }
+
+        // expression the 'y' coordinate is counted from
+        if (isEvalOnlyDiag) {
+            sprintf(tmp1, "%s", start);
+            p = tmp1;
+        }
+        else {
+            p = (char*)"startN";
+        }
+
+        if ( subgMode ) {
+
+            kgenPrintf( ctx,
+                "coord.y = %s + block * %lu + %s.y * %lu;\n",
+                p,
+                dim0y,
+                pSubgVNames->subgCoord,
+                dim1y );
+
+            kgenPrintf( ctx,
+                "coord.x = coord.x - %s + %s.x * %lu;\n",
+                step,
+                pSubgVNames->subgCoord,
+                dim1x);
+
+            kgenBeginBranch( ctx,
+                "if (coord.y >= startN + argN || coord.x >= origN)");
+            kgenPrintf( ctx, "skipTilemul = 1;\n" );
+            kgenEndBranch( ctx, NULL );
+
+            sprintf( tmp, "if (coord.y >= coord.x + %lu)", dim1x );
+            kgenBeginBranch( ctx, tmp );
+            kgenPrintf( ctx, "skipTilemul = 1;\n" );
+            kgenEndBranch( ctx, NULL );
+
+        }
+        else {
+
+            sprintf(tmp, "coord.y = %s + block * %lu + lid %% %u * %lu;\n"
+                         "coord.x = coord.x - %s + lid / %u * %lu;\n"
+                         "\n"
+                         "if (coord.y >= startN + N || coord.x >= origN) {\n"
+                         "    return;\n"
+                         "}\n\n"
+                         // Check if the tile is fully out of diagonal
+                         "if (coord.y >= coord.x + %lu) {\n"
+                         "    return;\n"
+                         "}\n\n",
+                    p, dim0y, l1Pans, dim1y,
+                    step, l1Pans, dim1x,
+                    dim1x);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+    else {
+        int vecAlign = 1;
+
+        if (!isDiagSep || isEvalOnlyDiag) {
+            strcpy(start, "coord.x");
+        }
+        else {
+            sprintf(start, "(coord.x + %lu) / %lu * %lu",
+                    dim0x + dim0y - 1, dim0y, dim0y);
+        }
+
+        if (isEvalOnlyDiag) {
+            sprintf(end, "(coord.x + %lu) / %lu * %lu",
+                    dim0x + dim0y - 1, dim0y, dim0y);
+        }
+        else {
+            // align the panel end to the vector length at column major access
+            vecAlign = umax(vlenA, vlenB);
+            if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A) &&
+                (vecAlign > 1)) {
+
+                sprintf(end, "(N + %u) / %u * %u",
+                        vecAlign - 1, vecAlign, vecAlign);
+            }
+            else {
+                strcpy(end, "N");
+            }
+        }
+
+        if (!isEvalOnlyDiag) {
+            sprintf(tmp, "k0 = (N + %lu) / %lu;\n"
+                         "if (block < k0 * (startN / %lu)) {\n"
+                         "    coord.x = (block / k0) * %lu;\n"
+                         "    block %%= k0;\n"
+                         "}\n",
+                    dim0y - 1, dim0y, dim0x, dim0x);
+            kgenAddStmt(ctx, tmp);
+            kgenBeginBranch(ctx, "else");
+            sprintf(tmp, "block -= k0 * (startN / %lu);\n", dim0x);
+            kgenAddStmt(ctx, tmp);
+        }
+
+        if (isDiagSep) {
+            genPanelBlocksStmt(ctx, "k0", 1, dim, start, end);
+        }
+
+        kgenBeginBranch(ctx, "while (block >= k0)");
+        sprintf(tmp, "block -= k0;\n"
+                     "coord.x += %lu;\n",
+                dim0x);
+        kgenAddStmt(ctx, tmp);
+        genPanelBlocksStmt(ctx, "k0", 1, dim, start, end);
+        kgenEndBranch(ctx, NULL);
+        kgenAddStmt(ctx, "coord.x += startN;\n");
+
+        // close the "else" branch opened above
+        if (!isEvalOnlyDiag) {
+            kgenEndBranch(ctx, NULL);
+        }
+
+        if (!isDiagSep && (kflags & KEXTRA_TAILS_M)) {
+            sprintf(tmp, "coord.y = (%s >= startN + N %% %lu) ? "
+                                "(N - (block + 1) * %lu) : "
+                                "(N - N %% %lu - block * %lu);\n",
+                    start, dim0y, dim0y, dim0y, dim0y);
+        }
+        else if ((isDiagSep && !isEvalOnlyDiag) && (kflags & KEXTRA_TAILS_M)) {
+            sprintf(tmp, "coord.y = (N - N %% %lu - block * %lu);\n",
+                    dim0y, dim0y);
+        }
+        else {
+            sprintf(tmp, "coord.y = %s - (block + 1) * %lu;\n", end, dim0y);
+        }
+        kgenAddStmt(ctx, tmp);
+
+        if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A) && (vecAlign > 1)) {
+            sprintf(tmp, "coord.y = (coord.y + %u) / %u * %u;\n",
+                    vecAlign - 1, vecAlign, vecAlign);
+            kgenAddStmt(ctx, tmp);
+        }
+
+        if ( subgMode ) {
+
+            kgenPrintf( ctx,
+                "coord.y += startN + %s.y * %lu;\n",
+                pSubgVNames->subgCoord,
+                dim1y );
+
+            kgenPrintf( ctx,
+                "coord.x += %s.x * %lu;\n",
+                pSubgVNames->subgCoord,
+                dim1x );
+
+            kgenBeginBranch( ctx,
+                "if (coord.y >= startN + argN || coord.x >= startN + argN)" );
+            kgenPrintf( ctx, "skipTilemul = 1;\n" );
+            kgenEndBranch( ctx, NULL );
+
+            sprintf( tmp, "if (coord.x >= coord.y + %lu)", dim1y );
+            kgenBeginBranch( ctx, tmp );
+            kgenPrintf( ctx, "skipTilemul = 1;\n" );
+            kgenEndBranch( ctx, NULL );
+        }
+        else {
+
+            sprintf(tmp, "coord.y += startN + lid %% %u * %lu;\n",
+                    l1Pans, dim1y);
+            kgenAddStmt(ctx, tmp);
+
+            sprintf(tmp, "coord.x += lid / %u * %lu;\n"
+                         "if (coord.y >= startN + N || coord.x >= startN + N) {\n"
+                         "    return;\n"
+                         "}\n"
+                         // check if the tile is fully out of the diagonal
+                         "if (coord.x >= coord.y + %lu) {\n"
+                         "    return;\n"
+                         "}\n\n",
+                    l1Pans, dim1x, dim1y);
+            kgenAddStmt(ctx, tmp);
+
+        }
+    }
+
+    kgenAddBlankLine(ctx);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Declare the parts of the complex multipliers used at the result update.
+ * Nothing is declared for a non complex data type. For a complex one the
+ * parts of "alpha" are declared always, and the parts of "beta" only if
+ * the UPRES_WITH_BETA flag is set.
+ */
+static void
+declareComplexMults(
+    struct KgenContext *ctx,
+    DataType dtype,
+    UpdateResultFlags uflags)
+{
+    const char *typeName;
+
+    if (!isComplexType(dtype)) {
+        return;
+    }
+
+    typeName = dtypeBuiltinType(dtype);
+    declareComplexMultParts(ctx, "alpha", typeName);
+    if ((uflags & UPRES_WITH_BETA) != 0) {
+        declareComplexMultParts(ctx, "beta", typeName);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Add to the batch, with the MAD_STMT_PRIO priority, the statements
+ * updating a single element of the temporary result.
+ *
+ * For real types the generated code is
+ *     tempC = result * alpha [+ tempC * beta]
+ * written either with the "mad" built-in function or with plain
+ * arithmetic, depending on KEXTRA_ENABLE_MAD. The part with beta is left
+ * out if KEXTRA_BETA_ZERO is set. For complex types the statements are
+ * produced by sprintfComplexMulUpdate().
+ *
+ * batch        - statement batch to add to
+ * gset         - generator settings (flags, data type, alpha/beta names)
+ * tempC        - element of the temporary result, updated in place
+ * result       - element of the evaluated result
+ * complexOpTmp - additional temporary variable for the complex update,
+ *                may be NULL; it is used only along with beta and "mad"
+ */
+static void
+genUpdateSingleOptimized(
+    struct StatementBatch *batch,
+    const BlasGenSettings *gset,
+    const Kstring *tempC,
+    const Kstring *result,
+    const Kstring *complexOpTmp)
+{
+    const CLBLASKernExtra *kextra = gset->kextra;
+    const bool withMad = (kextra->flags & KEXTRA_ENABLE_MAD) != 0;
+    const char *alpha = gset->varNames.alpha;
+    const char *beta = NULL;
+    const char *dst = tempC->buf;
+    const char *res = result->buf;
+
+    if (!(kextra->flags & KEXTRA_BETA_ZERO)) {
+        beta = gset->varNames.beta;
+    }
+
+    if (isComplexType(kextra->dtype)) {
+        const bool isDouble = isDoubleBasedType(kextra->dtype);
+        const TileMulCore core = (withMad) ? TILEMUL_MAD : TILEMUL_MULADD;
+        const Kstring *scratch = tempC;
+        Kstring factor;
+        Kstring stmt;
+
+        if ((beta != NULL) && withMad && (complexOpTmp != NULL)) {
+            scratch = complexOpTmp;
+        }
+
+        if (beta != NULL) {
+            kstrcpy(&factor, beta);
+            sprintfComplexMulUpdate(&stmt, scratch, tempC, &factor, NULL,
+                                    isDouble, false, false, core);
+            kgenAddStmtToBatch(batch, MAD_STMT_PRIO, stmt.buf);
+        }
+
+        /*
+         * NOTE(review): without beta 'scratch' is tempC itself, so tempC
+         * is passed as the last operand of the update with alpha as well
+         * -- confirm this is intended.
+         */
+        kstrcpy(&factor, alpha);
+        sprintfComplexMulUpdate(&stmt, tempC, result, &factor, scratch,
+                                isDouble, false, false, core);
+        kgenAddStmtToBatch(batch, MAD_STMT_PRIO, stmt.buf);
+
+        return;
+    }
+
+    if (beta == NULL) {
+        const char *fmt = (withMad) ? "%s = mad(%s, %s, 0);\n" :
+                                      "%s = %s * %s;\n";
+
+        kgenBatchPrintf(batch, MAD_STMT_PRIO, fmt, dst, res, alpha);
+    }
+    else if (withMad) {
+        kgenBatchPrintf(batch, MAD_STMT_PRIO,
+                        "%s = mad(%s, %s, 0);\n"
+                        "%s = mad(%s, %s, %s);\n",
+                        dst, dst, beta,
+                        dst, res, alpha, dst);
+    }
+    else {
+        kgenBatchPrintf(batch, MAD_STMT_PRIO,
+                        "%s = %s * %s + %s * %s;\n",
+                        dst, res, alpha,
+                        dst, beta);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Init temporary tile for diagonal result update.
+ *
+ * The tile starts as a copy of gset->tileCY and stays so if
+ * KEXTRA_BETA_ZERO is set. Otherwise it is turned into a separate tile
+ * named "tempC":
+ *   - it is transposed if the matrix is column major and forceNoTrans is
+ *     not set;
+ *   - its vector length is the line length rounded down to a power of 2,
+ *     but not more than MAX_TILE_VECLEN;
+ *   - its number of lines is restricted so that the tile doesn't exceed
+ *     the total storage size of the A and B tiles, and rounded down to
+ *     a power of 2.
+ */
+static void
+initTmpResTile(Tile *tile, const BlasGenSettings *gset, bool forceNoTrans)
+{
+    const KernelExtraFlags kflags = gset->kextra->flags;
+    const bool colMajor = !forceNoTrans &&
+                          ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
+    unsigned int sizeLimit;
+    unsigned int pitch;
+
+    *tile = gset->tileCY;
+
+    if (kflags & KEXTRA_BETA_ZERO) {
+        return;
+    }
+
+    sizeLimit = tileStorageSize(&gset->tileA) +
+                tileStorageSize(&gset->tileBX);
+
+    tile->baseName = "tempC";
+    // NOTE(review): this vector length is replaced a few lines below
+    tile->vecLen = getVecLen(gset, CLBLAS_SYRK, MATRIX_C);
+    tile->trans = colMajor;
+    pitch = (colMajor) ? tile->nrRows : tile->nrCols;
+    tile->vecLen = (unsigned int)roundDownPow2(pitch);
+    tile->vecLen = umin(tile->vecLen, MAX_TILE_VECLEN);
+
+    /*
+     * restrict number of rows or columns of the new tile according
+     * to the maximum tile size evaluated above
+     */
+    if (colMajor) {
+        pitch = (unsigned int)roundUp(tile->nrRows, tile->vecLen);
+        tile->nrCols = umin(sizeLimit / pitch, tile->nrCols);
+        tile->nrCols = (unsigned int)roundDownPow2(tile->nrCols);
+    }
+    else {
+        pitch = (unsigned int)roundUp(tile->nrCols, tile->vecLen);
+        tile->nrRows = umin(sizeLimit / pitch, tile->nrRows);
+        tile->nrRows = (unsigned int)roundDownPow2(tile->nrRows);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Declare and setup pointer to the start of updated output tile.
+ *
+ * The generated statement is
+ *     __global <type> *dst = <C> + <line coord> * <ldc> + <coord in line>;
+ * where the line coordinate is coordA for a row major matrix and coordB
+ * for a column major one.
+ *
+ * Returns the name of the declared pointer ("dst").
+ */
+const char *
+declareSetupOutputPtr(struct KgenContext *ctx, const BlasGenSettings *gset)
+{
+    const KernelVarNames *names = &gset->varNames;
+    const char *elemType = dtypeBuiltinType(gset->kextra->dtype);
+    const char *lineCoord;
+    const char *inLineCoord;
+
+    if ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0) {
+        lineCoord = names->coordB;
+        inLineCoord = names->coordA;
+    }
+    else {
+        lineCoord = names->coordA;
+        inLineCoord = names->coordB;
+    }
+
+    kgenPrintf(ctx, "__global %s *dst = %s + %s * %s + %s;\n\n",
+               elemType, names->C, lineCoord, names->ldc, inLineCoord);
+
+    return "dst";
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Check if an additional temporary variable is needed for updating a
+ * complex result, and declare it if so.
+ *
+ * The variable is needed when the "mad" built-in OpenCL function is used,
+ * because a single operation is then evaluated with 2 statements. Without
+ * the variable, the part of the result evaluated with the first statement
+ * would be used as an input argument of the second one, which leads to
+ * wrong evaluation.
+ *
+ * If the data type is complex and KEXTRA_ENABLE_MAD is set, the variable
+ * "sctmp" is declared in the kernel and its name is put to the passed
+ * string. Otherwise the string is just emptied.
+ */
+static void
+checkDeclareUpcomTmp(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    Kstring *kstr)
+{
+    const DataType dtype = gset->kextra->dtype;
+    const bool usesMad = (gset->kextra->flags & KEXTRA_ENABLE_MAD) != 0;
+
+    if (!isComplexType(dtype) || !usesMad) {
+        emptyKstring(kstr);
+        return;
+    }
+
+    kgenPrintf(ctx, "%s sctmp;\n", dtypeBuiltinType(dtype));
+    kstrcpy(kstr, "sctmp");
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Declare a set of variables differing only in the trailing index:
+ *     <type> <baseName>0, <baseName>1, ..., <baseName>(nrVars - 1);
+ * The variable with index 0 is declared regardless of nrVars.
+ */
+static void
+declareDiagUpresIndexedVars(
+    struct KgenContext *ctx,
+    const char *type,
+    const char *baseName,
+    unsigned int nrVars)
+{
+    Kstring decl;
+    unsigned int idx = 1;
+
+    ksprintf(&decl, "%s %s0", type, baseName);
+    while (idx < nrVars) {
+        kstrcatf(&decl, ", %s%u", baseName, idx);
+        idx++;
+    }
+    kstrcatf(&decl, ";\n");
+
+    kgenAddStmt(ctx, decl.buf);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Add a blank line to the batch for the fetch, mad and store statement
+ * priorities of the diagonal update, to make the generated code more
+ * readable.
+ */
+static void
+addDiagUpdateBlanks(struct StatementBatch *batch)
+{
+    const int prios[] = {FETCH_STMT_PRIO, MAD_STMT_PRIO, STORE_STMT_PRIO};
+    size_t i;
+
+    for (i = 0; i < sizeof(prios) / sizeof(prios[0]); i++) {
+        kgenAddStmtToBatch(batch, prios[i], "\n");
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Update the result around the diagonal for the case where the
+ * 'y' and 'x' subdimensions are equal at the tile level and there
+ * are no tails along those subdimensions.
+ */
+static int
+genUpdateIsoscelesDiagTile(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset)
+{
+    KernelExtraFlags kflags = gset->kextra->flags;
+    DataType dtype = gset->kextra->dtype;
+    struct StatementBatch *batch;
+    PhysTileIterator iter;
+    unsigned int vlen;
+    const Tile *tileC = &gset->tileCY;
+    Tile tileTempC;
+    bool isPhysUpper;
+    bool isHit;
+    bool withBeta;
+    bool cmaj;
+    unsigned int nrStored;
+    unsigned int skipCnt = 0;
+    const char *glbType;
+    const char *dstPtr;
+    Kstring tempElem, resElem;
+    Kstring k1, k2, ldcName;
+    Kstring comTmp;
+    const Kstring *ptmp;
+    Kstring offExpr;
+    unsigned int tempRows, tempCols;
+    unsigned int madLen;
+
+    batch = createStmtBatch();
+    if (batch == NULL) {
+        return -ENOMEM;
+    }
+
+    cmaj = (kflags & KEXTRA_COLUMN_MAJOR) != 0;
+    isPhysUpper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ cmaj;
+    withBeta = !(kflags & KEXTRA_BETA_ZERO);
+
+    iterInit(&iter, tileC, 1, 0);
+    vlen = getVecLen(gset, CLBLAS_SYRK, MATRIX_C);
+    kstrcpy(&ldcName, gset->varNames.ldc);
+
+    initTmpResTile(&tileTempC, gset, false);
+    tempRows = tileTempC.nrRows;
+    tempCols = tileTempC.nrCols;
+
+    // declare and initialize needed variables
+    dstPtr = declareSetupOutputPtr(ctx, gset);
+    checkDeclareUpcomTmp(ctx, gset, &comTmp);
+    ptmp = (isKstringEmpty(&comTmp)) ? NULL : &comTmp;
+    if (tileTempC.baseName != tileC->baseName) {
+        declareOneTileStorage(ctx, &tileTempC);
+        kgenAddBlankLine(ctx);
+    }
+
+    while (!iterIsEnd(&iter)) {
+        if (!(iter.row % tempRows ||
+              iter.col % tempCols)) {
+
+            addDiagUpdateBlanks(batch);
+            flushStmtBatch(ctx, batch);
+        }
+
+        isHit = (isPhysUpper) ? (iter.vec >= iter.line) :
+                                (iter.line >= iter.vec);
+
+        skipCnt = (skipCnt) ? (skipCnt - 1) : 0;
+        if (!isHit) {
+            iterIterate(&iter);
+            continue;
+        }
+
+        if (skipCnt) {
+            nrStored = 0;
+        }
+        else if (isPhysUpper) {
+            if (iter.vec && !isRoundedPow2(iter.vec)) {
+                size_t s = iter.vec;
+
+                s = szmin(roundUpPow2(s) - s, s - roundDownPow2(s));
+                nrStored = (unsigned int)s;
+            }
+            else {
+                nrStored = (iter.vec) ? umin(iter.vec, iter.nrVecs - iter.vec) :
+                                        (unsigned int)iter.nrVecs;
+            }
+        }
+        else {
+            nrStored = (unsigned int)roundDownPow2(iter.line - iter.vec + 1);
+        }
+
+        nrStored = umin(nrStored, vlen);
+        skipCnt = umax(skipCnt, nrStored);
+
+        if (nrStored) {
+            getVectorTypeName(dtype, nrStored, &glbType, NULL);
+            ksprintf(&k1, "%u", iter.line);
+            ksprintf(&k2, "%u", iter.vec);
+            sprintfFastScalarMad(&offExpr, &k1, &ldcName, 0, &k2);
+
+            if (withBeta) {
+                sprintfTileElement(&tempElem, &tileTempC, iter.row % tempRows,
+                                   iter.col % tempCols, nrStored);
+
+                kgenBatchPrintf(batch, FETCH_STMT_PRIO,
+                                "%s = *(__global %s*)(&%s[%s]);\n",
+                                tempElem.buf, glbType, dstPtr, offExpr.buf);
+            }
+        }
+
+        madLen = (isComplexType(dtype) || (tileC->trans != cmaj)) ?
+                        1 : nrStored;
+        if (madLen) {
+            sprintfTileElement(&tempElem, &tileTempC, iter.row % tempRows,
+                               iter.col % tempCols, madLen);
+            sprintfTileElement(&resElem, tileC, iter.row, iter.col, madLen);
+            genUpdateSingleOptimized(batch, gset, &tempElem, &resElem, ptmp);
+        }
+
+        if (nrStored) {
+            sprintfTileElement(&tempElem, &tileTempC, iter.row % tempRows,
+                               iter.col % tempCols, nrStored);
+
+            kgenBatchPrintf(batch, STORE_STMT_PRIO,
+                            "*(__global %s*)(&%s[%s]) = %s;\n",
+                            glbType, dstPtr, offExpr.buf, tempElem.buf);
+        }
+
+        iterIterate(&iter);
+    }
+
+    addDiagUpdateBlanks(batch);
+    flushStmtBatch(ctx, batch);
+    destroyStmtBatch(batch);
+
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Update diagonal tile of arbitrary shape in case of not having tails
+ * along 'x' and 'y' subdimensions at the tile level.
+ *
+ * The destination is accessed by single elements. For each tile line the
+ * generated code evaluates a 'hit' flag, a running bit 'mask' and a vector
+ * of element offsets 'ccN'; with beta in use, per line 'alphaNewN' and
+ * 'betaNewN' replace alpha and beta in the update expressions.
+ *
+ * Returns -ENOMEM if the statement batch cannot be created and 0 otherwise;
+ * the results of the individual generator calls are not checked here.
+ */
+static int
+genUpdateGenericDiagTile(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset)
+{
+    KernelExtraFlags kflags = gset->kextra->flags;
+    DataType dtype = gset->kextra->dtype;
+    const char *typeName;
+    struct StatementBatch *batch;
+    PhysTileIterator iter;
+    TileIterFlags tifl;
+    BlasGenSettings gsetNew;
+    const Tile *tileC = &gset->tileCY;
+    Tile tileTempC;
+    bool withBeta;
+    bool isUpper;
+    const char *dstPtr;
+    const char *s;
+    Kstring tempElem, resElem;
+    Kstring comTmp;
+    const Kstring *ptmp;
+    Kstring kstr, alphaStr, betaStr;
+    unsigned int nrRows, nrCols;
+    unsigned int tempRows;
+    // type of the vectorized coordinates
+    Kstring vctype;
+    Kstring constOffs, constShifts, constMasks;
+    unsigned int i, j, nops;
+    unsigned int maxFetches = 0;
+    const char *yname, *xname;
+    const char *ldcName;
+
+    batch = createStmtBatch();
+    if (batch == NULL) {
+        return -ENOMEM;
+    }
+
+    typeName = dtypeBuiltinType(dtype);
+
+    nrRows = tileC->nrRows;
+    nrCols = tileC->nrCols;
+    withBeta = !(kflags & KEXTRA_BETA_ZERO);
+    isUpper = ((kflags & KEXTRA_UPPER_TRIANG) != 0);
+
+    yname = gset->varNames.coordA;
+    xname = gset->varNames.coordB;
+    ldcName = gset->varNames.ldc;
+
+    // private copy of the settings; its alpha/beta names are replaced below
+    memcpy(&gsetNew, gset, sizeof(BlasGenSettings));
+
+    /*
+     * Fetches are done by single element. Non transposed shape
+     * is forced to facilitate further size restriction and tile
+     * manipulation
+     */
+    memcpy(&tileTempC, tileC, sizeof(Tile));
+    tileTempC.trans = false;
+
+    // rows are walked backward for the upper triangle, columns for the lower
+    tifl = (isUpper) ? TILE_ITER_BACKWARD_ROWS :
+                       TILE_ITER_BACKWARD_COLS;
+    iterInit(&iter, &tileTempC, 1, tifl);
+
+    // the iterator is set up, now the tile is reinitialized as the temporary
+    initTmpResTile(&tileTempC, gset, true);
+
+    // one coordinate per tile column: "uint" or "uint<nrCols>"
+    if (nrCols == 1) {
+        kstrcpy(&vctype, "uint");
+    }
+    else {
+        ksprintf(&vctype, "uint%u", nrCols);
+    }
+
+    /*
+     * fill constant offsets, shifts and masks within each line
+     * for vectorized coordinates
+     */
+    ksprintf(&constOffs, "(%s)(", vctype.buf);
+    ksprintf(&constShifts, "(%s)(", vctype.buf);
+    ksprintf(&constMasks, "(%s)(", vctype.buf);
+    for (i = 0; i < nrCols; i++) {
+        s = (i == nrCols - 1) ? "" : ", ";
+
+        // offsets are listed in the reverse order for the upper triangle
+        j = (isUpper) ? (nrCols - i - 1) : i;
+        kstrcatf(&constOffs, "%uu%s", j, s);
+        kstrcatf(&constShifts, "%uu%s", i, s);
+        kstrcatf(&constMasks, "%#x%s", 1 << i, s);
+    }
+    kstrcatf(&constOffs, ")");
+    kstrcatf(&constShifts, ")");
+    kstrcatf(&constMasks, ")");
+
+
+    // declare and initialize needed variables
+
+    dstPtr = declareSetupOutputPtr(ctx, gset);
+    checkDeclareUpcomTmp(ctx, gset, &comTmp);
+    // an empty string means that no temporary variable has been declared
+    ptmp = (isKstringEmpty(&comTmp)) ? NULL : &comTmp;
+
+    if (tileTempC.baseName != tileC->baseName) {
+        /*
+         * Make additional temporary tile size restriction because of the
+         * following factors:
+         *
+         * No more than 16 fetches can be combined into single clause.
+         * So, there is no need to maintain larger temporary tile as well
+         * as more vector coordinates to reduce number of consumed registers.
+         * However, actually, the compiler struggles even with 16 fetches
+         * merged into single clause and allocates huge number of registers.
+         */
+        if (tileStorageSize(&tileTempC) > MAX_FETCH_CLAUSE_SIZE) {
+            tileTempC.nrRows = (unsigned int)roundDownPow2(
+                                        MAX_FETCH_CLAUSE_SIZE / nrCols);
+            // keep at least one line
+            if (!tileTempC.nrRows) {
+                tileTempC.nrRows = 1;
+            }
+        }
+    }
+
+    /*
+     * Number of element updates queued between two flushes: a whole number
+     * of lines, limited by MAX_FETCH_CLAUSE_SIZE, by the temporary tile
+     * lines and by the temporary tile storage size
+     */
+    tempRows = tileTempC.nrRows;
+    maxFetches = MAX_FETCH_CLAUSE_SIZE / nrCols * nrCols;
+    maxFetches = umin(maxFetches, tempRows * nrCols);
+    i = tileStorageSize(&tileTempC);
+    maxFetches = umin(maxFetches, i);
+
+    // declare vectorized coordinates
+    declareDiagUpresIndexedVars(ctx, vctype.buf, "cc", tempRows);
+
+    /*
+     * real y coordinate, offset mask and
+     * substituted beta and alpha (one value per temporary line)
+     */
+    kgenAddStmt(ctx, "unsigned int ry;\n"
+                     "unsigned int mask;\n"
+                     "int hit;\n");
+    if (withBeta) {
+        declareDiagUpresIndexedVars(ctx, typeName, "alphaNew", tempRows);
+        declareDiagUpresIndexedVars(ctx, typeName, "betaNew", tempRows);
+    }
+
+    // declare tile
+    if (tileTempC.baseName != gset->tileCY.baseName) {
+        declareOneTileStorage(ctx, &tileTempC);
+        kgenAddBlankLine(ctx);
+    }
+
+    // set start mask value
+    if (isUpper) {
+        kgenPrintf(ctx, "if (%s + %u <= %s) {\n"
+                        "    mask = ~0;\n"
+                        "}\n"
+                        "else {\n"
+                        "    mask = (%s + %u < %s + %u) "
+                        "    ? ~((1 << (%s + %u - %s)) - 1) : 0;\n"
+                        "}\n\n",
+                   yname, nrRows - 1, xname,
+                   yname, nrRows - 1, xname, nrCols - 1,
+                   yname, nrRows, xname);
+    }
+    else {
+        kgenPrintf(ctx, "if (%s + %u <= %s) {\n"
+                        "    mask = ~0;\n"
+                        "}\n"
+                        "else {\n"
+                        "    mask = (%s > %s) ? ((1 << (%s - %s)) - 1) : 0;\n"
+                        "}\n\n",
+                   xname, nrCols - 1, yname,
+                   yname, xname, yname, xname);
+    }
+
+    // let's go
+    nops = 0;
+    while (!iterIsEnd(&iter)) {
+        // the portion is complete: emit it and start the next one
+        if (nops == maxFetches) {
+            addDiagUpdateBlanks(batch);
+            flushStmtBatch(ctx, batch);
+            nops = 0;
+        }
+
+        // index for all temporary coordinates
+        i = nops / nrCols;
+
+        // prepare vectorized coordinates for the next line
+        // NOTE(review): the line start is detected with tileTempC.nrCols
+        // while the index above is evaluated with nrCols; presumably both
+        // are equal - confirm against initTmpResTile()
+        if (nops % tileTempC.nrCols == 0) {
+            // 'hit' tells if the line takes part in the update
+            if (isUpper) {
+                kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO,
+                                "hit = (%s + %u <= %s + %u);\n",
+                                yname, iter.row, xname, nrCols - 1);
+            }
+            else {
+                kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO,
+                                "hit = (%s + %u >= %s);\n",
+                                yname, iter.row, xname);
+            }
+
+            // a missed line gets beta equal to one and zero alpha
+            if (withBeta) {
+                kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO,
+                                "betaNew%u = (hit) ? %s : %s;\n"
+                                "alphaNew%u = (hit) ? %s : (%s)0;\n",
+                                i, gset->varNames.beta, strOne(dtype),
+                                i, gset->varNames.alpha, typeName);
+            }
+
+            /*
+             * Select the row to access, advance the mask for a hit line
+             * and turn the mask bits into per column offsets
+             */
+            if (isUpper) {
+                kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO,
+
+                                "ry = select(0, %u, hit);\n"
+                                "mask = select(mask, mask >> 1 | %#x, hit);\n"
+                                "cc%u = ((%s)mask &\n"
+                                "       %s) >>\n"
+                                "      %s;\n"
+                                "cc%u = %u - mad24(cc%u, %s, 0);\n",
+
+                                iter.row,
+                                (1 << (nrCols - 1)),
+                                i, vctype.buf, constMasks.buf, constShifts.buf,
+                                i, nrCols - 1, i, constOffs.buf);
+            }
+            else {
+                kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO,
+
+                                "ry = select(%u, %u, hit);\n"
+                                "mask = select(mask, mask << 1 | 1, hit);\n"
+                                "cc%u = ((%s)mask &\n"
+                                "       %s) >>\n"
+                                "      %s;\n"
+                                "cc%u = mad24(cc%u, %s, 0);\n",
+
+                                nrRows - 1, iter.row,
+                                i, vctype.buf, constMasks.buf, constShifts.buf,
+                                i, i, constOffs.buf);
+            }
+
+            // combine the row and the column offsets with respect to
+            // the matrix order
+            if (kflags & KEXTRA_COLUMN_MAJOR) {
+                kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO,
+                                "cc%u = mad24(cc%u, (%s)%s, (%s)ry);\n\n",
+                                i, i, vctype.buf, ldcName, vctype.buf);
+            }
+            else {
+                kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO,
+                                "cc%u = mad24((%s)ry, (%s)%s, cc%u);\n\n",
+                                i, vctype.buf, vctype.buf, ldcName, i);
+            }
+        }
+
+        // prepare for the immediate update
+        sprintfTileElement(&tempElem, &tileTempC,
+                           iter.row % tempRows, iter.col, 1);
+        sprintfTileElement(&resElem, tileC, iter.row, iter.col, 1);
+        // offset of the element: the whole scalar or a vector component
+        if (nrCols == 1) {
+            ksprintf(&kstr, "cc%u", i);
+        }
+        else {
+            ksprintf(&kstr, "cc%u.s%u", i, iter.col);
+        }
+
+        // prepare multipliers and fetch
+        if (withBeta) {
+            ksprintf(&alphaStr, "alphaNew%u", i);
+            ksprintf(&betaStr, "betaNew%u", i);
+            gsetNew.varNames.alpha = alphaStr.buf;
+            gsetNew.varNames.beta = betaStr.buf;
+
+            kgenBatchPrintf(batch, FETCH_STMT_PRIO, "%s = %s[%s];\n",
+                            tempElem.buf, dstPtr, kstr.buf);
+        }
+
+        genUpdateSingleOptimized(batch, &gsetNew, &tempElem, &resElem, ptmp);
+
+
+        // store
+        kgenBatchPrintf(batch, STORE_STMT_PRIO, "%s[%s] = %s;\n",
+                        dstPtr, kstr.buf, tempElem.buf);
+
+        nops++;
+        iterIterate(&iter);
+    }
+
+    // emit the last, possibly incomplete, portion
+    addDiagUpdateBlanks(batch);
+    flushStmtBatch(ctx, batch);
+    destroyStmtBatch(batch);
+
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate the update of a diagonal tile for the case of tails. The
+ * generated code walks the tile element by element with a pair of nested
+ * loops whose bounds ('m', 'n', 'j0' and, for the lower triangle, 'i0')
+ * are evaluated at run time from the tile coordinates and N.
+ *
+ * The single element update statement is generated into a temporary
+ * context over the local buffer 's2' and then spliced into the loop body.
+ *
+ * Returns -ENOMEM if the temporary context cannot be created, otherwise
+ * the result of adding the generated statement to 'ctx'.
+ *
+ * NOTE(review): all the strings are built with unbounded sprintf() into
+ * fixed size local buffers; 'tmp' receives 's1' and 's2' plus the loop
+ * text while being of the same size as 's1'.
+ */
+static int
+genUpdateTailedDiagTile(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    UpdateResultFlags uflags)
+{
+    char tmp[1024];
+    char s1[1024], s2[256];
+    char src[32], dst[32];
+    char *p;
+    const char *vfield;
+    size_t pitch;
+    struct KgenContext *ctx1;
+    const CLBLASKernExtra *kextra = gset->kextra;
+    DataType dtype = kextra->dtype;
+    KernelExtraFlags kflags = kextra->flags;
+    const SubproblemDim *dims = gset->subdims;
+    UpdateResultOp op;
+    /*
+     * solution tile coordinate without consideration of
+     * row/column order
+     */
+    const char *trow, *tcol, *s3, *s4;
+
+    vfield = dtypeUPtrField(dtype);
+    // number of columns of the tile C rounded up to its vector length
+    pitch = roundUp(gset->tileCY.nrCols, gset->tileCY.vecLen);
+
+    tcol = gset->varNames.coordB;
+    trow = gset->varNames.coordA;
+
+    // s3 is multiplied by ldc, s4 is added; they swap for column major
+    s3 = (kflags & KEXTRA_COLUMN_MAJOR) ? tcol : trow;
+    s4 = (kflags & KEXTRA_COLUMN_MAJOR) ? trow : tcol;
+
+    // declare and initialize variables
+    sprintf(s1, "uint m = min(%luu, N - %s);\n"
+                "uint n = min(%luu, N - %s);\n",
+            dims[1].y, trow, dims[1].x, tcol);
+
+    p = s1 + strlen(s1);
+    sprintf(p, "uint i, j, j0;\n"
+                "PPtr res;\n"
+                "GPtr uC;\n"
+                "\n"
+                "res.%s = c;\n"
+                "uC.%s = C + %s * ldc + %s;\n",
+            vfield, vfield, s3, s4);
+
+    // with tail flags set the start of the private result is shifted
+    if (uflags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) {
+        char offStr[64];
+        // NOTE(review): this 'p' shadows the outer one
+        char *p = offStr;
+
+        offStr[0] = '\0';
+        if (uflags & UPRES_TAIL_ROW) {
+            // NOTE(review): 'pitch' is size_t but is printed with %lu;
+            // that matches only where both types have the same width
+            sprintf(offStr, " + (%lu - m) * %lu", dims[1].y, pitch);
+            p += strlen(offStr);
+        }
+        if (uflags & UPRES_TAIL_COL) {
+            sprintf(p, " + (%lu - n)", dims[1].x);
+        }
+
+        p = s1 + strlen(s1);
+        sprintf(p, "res.%s = res.%s%s;\n", vfield, vfield, offStr);
+    }
+
+    kgenAddBlankLine(ctx);
+
+    // temporary context collecting the single element update into 's2'
+    ctx1 = createKgenContext(s2, sizeof(s2), true);
+    if (ctx1 == NULL) {
+        return -ENOMEM;
+    }
+
+    kgenSyncFormatting(ctx1, ctx, 1);
+
+    // update logic
+    sprintf(src, "res.%s[i * %lu + j]", vfield, pitch);
+    if (uflags & UPRES_COLUMN_MAJOR) {
+        sprintf(dst, "uC.%s[j * ldc + i]", vfield);
+    }
+    else {
+        sprintf(dst, "uC.%s[i * ldc + j]", vfield);
+    }
+    // with zero beta the result is just set, otherwise it is summed
+    op = (kflags & KEXTRA_BETA_ZERO) ? UPRES_SET : UPRES_SUM;
+    genUpdateResultSingle(ctx1, dst, src, gset, op, uflags);
+
+    if ( kflags & KEXTRA_UPPER_TRIANG ) {
+        declareComplexMults(ctx, dtype, uflags);
+
+        sprintf(tmp, "%s"   // variables
+                     /*
+                      * setup number of rows to update
+                      * and start column to update from
+                      */
+                     "j = min(%s + %lu, %s + %lu) - %s;\n"
+                     "m = min(m, j);\n"
+                     "j0 = (%s < %s) ? (%s - %s) : 0;\n"
+                     "\n"
+                     "for (i = 0; i < m; i++) {\n"
+                     "    for (j = j0; j < n; j++) {\n"
+                     "%s" // update logic
+                     "    }\n"
+                          /*
+                           * increment row, increment start column
+                           * if the diagonal is reached
+                           */
+                     "    %s++;\n"
+                     "    j0 = (%s >= %s) ? j0 : (j0 + 1);\n"
+                     "}\n",
+                s1, trow, dims[1].y, tcol, dims[1].x, trow,
+                tcol, trow, trow, tcol, s2, trow, tcol, trow);
+    }
+    else {
+        declareComplexMults(ctx, dtype, uflags);
+
+        // lower triangle: rows start at 'i0' and each row is updated
+        // up to the column 'j0' growing by one per row, but not beyond 'n'
+        sprintf(tmp, "uint i0;\n"
+                     "%s"       // variables
+                     "i0 = (%s < %s) ? (%s - %s) : 0;\n"
+                     "j = min(%s + %lu, %s + %lu) - %s;\n"
+                     "n = min(j, n);\n"
+                     "j0 = (%s < %s) ? (%s - %s + 1) : 1;\n"
+                     "\n"
+                     "for (i = i0; i < m; i++) {\n"
+                     "    for (j = 0; j < j0; j++) {\n"
+                     "%s"       // update logic
+                     "    }\n"
+                     "    j0 = min(j0 + 1, n);\n"
+                     "}\n",
+                s1, trow, tcol, tcol, trow, trow, dims[1].y, tcol,
+                dims[1].x, tcol, tcol, trow, trow, tcol, s2);
+    }
+
+    // 's2' has been copied into 'tmp', the temporary context can go
+    destroyKgenContext(ctx1);
+
+    return kgenAddStmt(ctx, tmp);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate the result update stage of a SYRK/SYR2K kernel.
+ *
+ * The whole update is put under a condition on the tile coordinates.
+ * Unless only KEXTRA_SYRK_SEPARATE_DIAGONAL is set among the diagonal
+ * flags, a dedicated branch updating the tile on the diagonal is generated
+ * first and the generic update goes to its "else" path:
+ *  - tails along M or N:               genUpdateTailedDiagTile()
+ *  - no tails, equal 'y' and 'x' dims: genUpdateIsoscelesDiagTile()
+ *  - otherwise:                        genUpdateGenericDiagTile()
+ *
+ * d1, d2 and d3 are not used; they only make the signature compatible
+ * with the update callback type.
+ *
+ * Returns 0 on success or the first non zero status reported by the
+ * diagonal generator, the generic update or the closing of the branches.
+ * On a diagonal generator failure the function returns at once leaving
+ * the opened branches not closed.
+ */
+static int
+genUpdateResult(
+    struct KgenContext *ctx,
+    BlasFunctionID funcID,
+    BlasGenSettings *gset,
+    UpdateResultFlags upresFlags,
+    const char * d1, // dummy parameters for compatibility with callback ptr
+    const char * d2,
+    const char * d3)
+{
+    KernelExtraFlags kflags = gset->kextra->flags;
+    KernelExtraFlags diagFlags = KEXTRA_SYRK_SEPARATE_DIAGONAL |
+                                 KEXTRA_SYRK_EVALUATE_DIAGONAL;
+    bool withDiag;
+    int ret, endRet;
+    char tmp[1024];
+
+    DUMMY_ARGS_USAGE_3(d1, d2, d3);
+
+    // the diagonal is processed here unless it is left to a separate kernel
+    withDiag = ((kflags & diagFlags) != KEXTRA_SYRK_SEPARATE_DIAGONAL);
+
+    // skip the tiles having nothing to update
+    if ( kflags & KEXTRA_UPPER_TRIANG ) {
+
+        sprintf( tmp,
+            "if ( !( (coord.y >= startN + argN) || "
+                "(coord.x >= origN) || "
+                "(coord.y >= coord.x + %lu) ) )",
+             gset->subdims[1].x );
+
+        kgenBeginBranch( ctx, tmp );
+    }
+    else {
+
+        sprintf( tmp,
+            "if ( !( (coord.y >= startN + argN) || "
+                  "(coord.x >= startN + argN) || "
+                  "(coord.x >= coord.y + %lu) ) )",
+            gset->subdims[1].y );
+
+        kgenBeginBranch( ctx, tmp );
+
+    }
+
+    // update diagonal if the chosen mode implies its processing
+    if (withDiag) {
+        const char *tcol = gset->varNames.coordB;
+        const char *trow = gset->varNames.coordA;
+        bool areTails;
+
+        areTails = ((kflags & (KEXTRA_TAILS_M_LOWER |
+                               KEXTRA_TAILS_N_LOWER)) != 0);
+
+        if (areTails || (gset->subdims[1].y == gset->subdims[1].x)) {
+            if ( kflags & KEXTRA_UPPER_TRIANG ) {
+                sprintf(tmp, "if (%s + %lu > %s)",
+                        trow, gset->subdims[1].y, tcol);
+            }
+            else {
+                sprintf(tmp, "if (%s + %lu > %s)",
+                        tcol, gset->subdims[1].x, trow);
+            }
+
+            kgenBeginBranch(ctx, tmp);
+            if (!areTails) {
+                ret = genUpdateIsoscelesDiagTile(ctx, gset);
+            }
+            else {
+                ret = genUpdateTailedDiagTile(ctx, gset, upresFlags);
+            }
+        }
+        else {
+            unsigned int xb, yb;
+
+            xb = (unsigned int)gset->subdims[0].x;
+            yb = (unsigned int)gset->subdims[0].y;
+
+            // compare the coordinates rounded down to the block sizes
+            if ( kflags & KEXTRA_UPPER_TRIANG ) {
+                sprintf(tmp, "if (%s / %u * %u + %u > %s / %u * %u)",
+                        trow, yb, yb, yb - 1, tcol, xb, xb);
+            }
+            else {
+                sprintf(tmp, "if (%s / %u * %u + %u > %s / %u * %u)",
+                        tcol, xb, xb, xb - 1, trow, yb, yb);
+            }
+
+            kgenBeginBranch(ctx, tmp);
+            ret = genUpdateGenericDiagTile(ctx, gset);
+        }
+
+        if (ret) {
+            return ret;
+        }
+
+        kgenEndBranch(ctx, NULL);
+        // the function above put a respective code into a conditional path
+        kgenBeginBranch(ctx, "else");
+    }
+
+    ret = genResultUpdateWithFlags( ctx,
+        funcID,
+        gset,
+        upresFlags,
+        NULL,
+        NULL,
+        NULL );
+
+    /*
+     * Close the "else" path and then the outer condition. A status
+     * of the closing must not hide an error of the update generated
+     * above, so only the first non zero status is kept.
+     */
+    if (withDiag) {
+        endRet = kgenEndBranch(ctx, NULL);
+        if (!ret) {
+            ret = endRet;
+        }
+    }
+
+    endRet = kgenEndBranch( ctx, NULL );
+    if (!ret) {
+        ret = endRet;
+    }
+
+    return ret;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Reset the generator settings and fill them for a SYRK/SYR2K kernel:
+ * subproblem dimensions, granularity, kernel extra info, generator flags
+ * and names of the kernel variables. The 'ldc' name is not set here.
+ */
+static void
+initGenSettings(
+    BlasGenSettings *gset,
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const CLBLASKernExtra *kextra,
+    BlasFunctionID funcID)
+{
+    KernelVarNames *names = &gset->varNames;
+    bool isSyr2k;
+    bool rank2k;
+
+    memset(gset, 0, sizeof(*gset));
+
+    isSyr2k = (funcID == CLBLAS_SYR2K);
+    // SYR2K evaluated as 2 rank updates within the same kernel
+    rank2k = isSyr2k && ((kextra->flags & KEXTRA_SYRK_2K_RANK) != 0);
+
+    memcpy(gset->subdims, subdims, sizeof(gset->subdims));
+    gset->pgran = pgran;
+    gset->kextra = kextra;
+
+    gset->flags = BGF_LD_IN_VECTORS;
+    if (isSyr2k && !rank2k) {
+        gset->flags |= BGF_DISTINCT_VECLEN;
+    }
+
+    /*
+     * !!! WORKAROUND; some cases fail with fully fetched tile of A,
+     * so the flag depends on the vector length selected for A.
+     * The settings above must be already in place for this query.
+     */
+    if (getVecLen(gset, funcID, MATRIX_A) != 1) {
+        gset->flags |= BGF_WHOLE_A;
+    }
+
+    // matrices
+    names->A = rank2k ? "wiA" : "A";
+    names->B = rank2k ? "wiB" : "B";
+    names->C = "C";
+
+    // leading dimensions; B shares the one of A for everything but SYR2K
+    names->lda = "lda";
+    names->ldb = isSyr2k ? "ldb" : names->lda;
+
+    // multipliers; beta has no name if it is zero
+    names->alpha = "alpha";
+    if (!(kextra->flags & KEXTRA_BETA_ZERO)) {
+        names->beta = "beta";
+    }
+
+    // coordinates
+    names->coordA = "coord.y";
+    names->coordB = "coord.x";
+    names->k = "coord.z";
+
+    // sizes; both M and N are named as N
+    names->sizeM = "N";
+    names->sizeN = "N";
+    names->sizeK = "K";
+
+    // no skews
+    names->skewA = NULL;
+    names->skewB = NULL;
+    names->skewK = NULL;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate the source of a SYRK/SYR2K kernel.
+ *
+ * buf, buflen - destination the kernel generator context is created over
+ * subdims     - subproblem dimensions; different 'bwidth' at the levels
+ *               0 and 1 selects the "Subg" flavour of the kernel,
+ *               otherwise the "Block" one is generated
+ * pgran       - granularity stored into the generator settings
+ * extra       - kernel extra info; it is copied to a local structure
+ *               before any fixup
+ * funcID      - BLAS function the kernel is generated for
+ *
+ * Returns the size of the generated source plus one on success, -ENOMEM
+ * if the generator or the fetch context cannot be created and -EOVERFLOW
+ * if closing the function body reports a negative status.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra,
+   BlasFunctionID funcID)
+{
+    ssize_t ret;
+    struct KgenContext *ctx;
+    char tmp[1024];
+    CLBLASKernExtra kextraNew;
+    TileCreationFlags tcflags;
+    DataType dtype;
+    KernelExtraFlags kflags;
+    UpdateResultFlags uflags;
+    BlasGenSettings gset;
+    TileMulOpts mulOpts;
+    KernelVarNames *vnames = &gset.varNames;
+    int i, numRanks;
+    TilePostFetchPrivate pfPriv;
+    TailStatus tailStatus = 0;
+    FetchAddrMode addrMode;
+    SyrxkExtraPriv *priv;
+    bool subgMode = 0;
+    SubgVarNames subgVNames;
+    bool areTailsMN;
+
+    // work on a private copy, the caller's extra info stays untouched
+    memcpy(&kextraNew, extra, sizeof(kextraNew));
+
+    subgMode = ( subdims[0].bwidth != subdims[1].bwidth );
+
+    // fixup tail flags in respect with the selected separate diagonal mode
+    kflags = kextraNew.flags;
+    if (kflags & KEXTRA_SYRK_SEPARATE_DIAGONAL) {
+        bool isUpper = ((kflags & KEXTRA_UPPER_TRIANG) != 0);
+
+        // the diagonal is separated and not evaluated by this kernel
+        if ((kflags & (KEXTRA_SYRK_SEPARATE_DIAGONAL |
+                       KEXTRA_SYRK_EVALUATE_DIAGONAL)) ==
+                           KEXTRA_SYRK_SEPARATE_DIAGONAL) {
+            if (isUpper) {
+                kflags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER);
+            }
+            else {
+                kflags &= ~(KEXTRA_TAILS_N | KEXTRA_TAILS_N_LOWER);
+            }
+        }
+
+        kextraNew.flags = kflags;
+    }
+    dtype = kextraNew.dtype;
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    kgenDeclareUptrs(ctx, isDoubleBasedType(dtype));
+
+    // the settings keep a pointer to the local copy of the extra info
+    initGenSettings(&gset, subdims, pgran, &kextraNew, funcID);
+    /*
+     * fixup vectorization for C if some restrictions for it has been set
+     * during the generic solve stage
+     */
+    priv = (SyrxkExtraPriv*)&kextraNew.solverPriv;
+    if (priv->maxVlenC) {
+        kextraNew.vecLenC = umin(kextraNew.vecLenC, priv->maxVlenC);
+        // without distinct vector lengths the common one is limited as well
+        if (!(gset.flags & BGF_DISTINCT_VECLEN)) {
+            kextraNew.vecLen = umin(kextraNew.vecLenC, kextraNew.vecLen);
+        }
+    }
+
+    // both the operands are fetched from the global memory
+    mulOpts.memA = mulOpts.memB = CLMEM_GLOBAL_MEMORY;
+    mulOpts.core = (kflags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD : TILEMUL_MULADD;
+    mulOpts.postFetch = NULL;
+    mulOpts.flags = TILEMUL_NO_FLAGS;
+    if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A)) {
+        mulOpts.flags |= TILEMUL_TRA;
+    }
+    else {
+        mulOpts.flags |= TILEMUL_TRB;
+    }
+
+    mulOpts.fctx = createFetchContext();
+    if (mulOpts.fctx == NULL) {
+        destroyKgenContext(ctx);
+        return -ENOMEM;
+    }
+
+    // NOTE(review): 'pfPriv' and 'mulOpts.postFetchPriv' are initialized
+    // only under this condition
+    if (kflags & KEXTRA_TAILS_K_LOWER) {
+        // setup post fetch callback
+        memset(&pfPriv, 0, sizeof(pfPriv));
+        pfPriv.wholeA = 1;
+        pfPriv.funcID = funcID;
+        pfPriv.gset = &gset;
+        mulOpts.postFetch = defaultTilePostFetch;
+        mulOpts.postFetchPriv = &pfPriv;
+    }
+
+    if( subgMode ) {
+        declareKernel( ctx, &gset, funcID, "Subg" );
+    }
+    else {
+        declareKernel( ctx, &gset, funcID, "Block" );
+    }
+
+    kgenBeginFuncBody(ctx);
+
+    // with tails along M or N the tile C is forced to be not transposed
+    areTailsMN = (kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER)) != 0;
+    tcflags = areTailsMN ? TILE_C_FORCE_NOTRANS : 0;
+    initDefaultTiles(&gset, funcID, tcflags, PRIV_STORAGE_VARIABLE_SET);
+
+    /*
+     * FIXME: since now it is used PPtr for updating diagonal
+     *        in case of tails variables cannot be used
+     */
+    if (areTailsMN) {
+        gset.tileCY.storType = PRIV_STORAGE_ARRAY;
+    }
+    declareTileStorages(ctx, &gset);
+
+    genHead( ctx, &gset, funcID, &subgVNames, subgMode );
+    genZeroTile(ctx, &gset.tileCY);
+    /* For adjusting coordinates, skews and updating result */
+    kgenAddStmt(ctx,
+            "// Set N to initial argument of blas function, not divided one\n"
+            "N = origN;\n");
+
+    // tail coordinates are adjusted only for the upper triangle;
+    // otherwise 'tailStatus' stays 0
+    if ( kflags & KEXTRA_UPPER_TRIANG ) {
+        tailStatus = checkGenAdjustTailCoords(ctx, funcID, &gset, NULL);
+        kgenAddBlankLine(ctx);
+    }
+
+    // generate multiplication logic
+    numRanks = (kflags & KEXTRA_SYRK_2K_RANK) ? 2 : 1;
+    addrMode = setDefaultFetchAddrMode(mulOpts.fctx, &gset, 0, tailStatus,
+                                      (kflags & KEXTRA_TAILS_K_LOWER) != 0);
+
+    genScaleLeadingDimensions(ctx, &gset);
+    // ldc should not be scaled, so it is initialized after that
+    gset.varNames.ldc = "ldc";
+
+    // Begin loop over the small panel
+
+    for (i = 0; i < numRanks; i++) {
+        if (i) {
+            kgenAddStmt(ctx, "// begin the second rank update\n");
+
+            /*
+             * For the second rank, reset coordinates and swap leading
+             * dimensions
+             */
+            if (!(addrMode & FETCH_ADDR_K_RELATIVE)) {
+                kgenAddStmt(ctx, "coord.z = 0;\n");
+            }
+            vnames->lda = "ldb";
+            vnames->ldb = "lda";
+        }
+        genSetupPointers(ctx, &gset, funcID, addrMode, i);
+
+        // the second rank update is enclosed into a nameless branch
+        // which is closed at the end of the iteration
+        if (i) {
+            kgenBeginBranch(ctx, NULL);
+        }
+        prepareFetchLoop(ctx, mulOpts.fctx, &gset, CLMEM_GLOBAL_MEMORY,
+                         CLMEM_GLOBAL_MEMORY);
+
+        if ( subgMode ) {
+
+            // the main loop goes without the post fetch callback and
+            // without the K relative addressing
+            mulOpts.flags |= TILEMUL_BW_STRIDE;
+            mulOpts.flags |= TILEMUL_NOT_INC_K;
+            mulOpts.postFetch = NULL;
+            setFetchAddrMode(mulOpts.fctx, (addrMode&~FETCH_ADDR_K_RELATIVE));
+
+            sprintf( tmp, "if( skipTilemul == 0 )");
+            kgenBeginBranch( ctx, tmp );
+
+            if ( kflags & KEXTRA_TAILS_K_LOWER ) {
+
+                // the main loop stops at K rounded down to the block width;
+                // the rest is handled after the loop
+                kgenPrintf( ctx, "uint kBase = K - (K%%%lu);\n", subdims[0].bwidth );
+                sprintf( tmp,
+                    "for ( k0 = %s.x * %lu; k0 < kBase; k0 += %lu )",
+                    subgVNames.itemId,
+                    subdims[1].bwidth,
+                    subdims[0].bwidth );
+            }
+            else {
+
+                sprintf( tmp,
+                    "for ( k0 = %s.x * %lu; k0 < K; k0 += %lu )",
+                    subgVNames.itemId,
+                    subdims[1].bwidth,
+                    subdims[0].bwidth );
+            }
+
+            // main loop branch
+            kgenBeginBranch( ctx, tmp );
+            gset.varNames.k = "k0";
+        }
+        else {
+
+            sprintf(tmp, "for (k0 = 0; k0 < K; k0 += %lu)", subdims[1].bwidth);
+            kgenBeginBranch(ctx, tmp);
+        }
+
+        pfPriv.fetchNumA = 0;
+        tileMulGen(ctx, &gset, &mulOpts);
+        // main loop branch
+        kgenEndBranch(ctx, NULL);
+
+        if ( subgMode ) {
+
+            // lowerK tails for subgroup mode
+            if( kflags & KEXTRA_TAILS_K_LOWER ) {
+
+                // one more multiplication starting at kBase with the post
+                // fetch callback restored
+                setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_TAILK_PADD);
+                mulOpts.postFetch = defaultTilePostFetch;
+                mulOpts.flags |= TILEMUL_EXTERN_RDECL;
+
+                kgenPrintf( ctx,
+                    "%s = kBase + %s.x*%lu;\n",
+                    vnames->k,
+                    subgVNames.itemId,
+                    subdims[1].bwidth );
+
+                tileMulGen( ctx, &gset, &mulOpts );
+            }
+
+            // skipTilemul branch
+            kgenEndBranch( ctx, NULL );
+
+        }
+
+        // close the branch enclosing the second rank update
+        if (i) {
+            kgenEndBranch(ctx, NULL);
+        }
+
+        kgenAddBlankLine(ctx);
+    }
+
+    if ( kflags & KEXTRA_UPPER_TRIANG ) {
+        checkGenRestoreTailCoords(ctx, &gset, tailStatus);
+    }
+    kgenAddBlankLine(ctx);
+    // the flag is dropped before generating the result update
+    gset.flags &= ~BGF_LD_IN_VECTORS;
+
+    uflags = kextraToUpresFlags(funcID, kflags);
+    uflags |= tailStatusToUpresFlags(tailStatus);
+
+    // NOTE(review): the result of the update generation is not checked
+    // in either of the modes
+    if ( subgMode ) {
+
+        mergeUpdateResult( ctx,
+            funcID,
+            &gset,
+            &subgVNames,
+            //uflags | UPRES_EXCEED_PROBLEM_CONDITION,
+            uflags,
+            (UpresProcPtr)genUpdateResult );
+    }
+    else {
+        genUpdateResult( ctx,
+            funcID,
+            &gset,
+            uflags,
+            NULL,
+            NULL,
+            NULL );
+    }
+
+    ret = kgenEndFuncBody(ctx);
+    // on success report the source size plus one
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyFetchContext(mulOpts.fctx);
+    destroyKgenContext(ctx);
+
+    // any negative status is reported as an overflow
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Fill the kernel argument array for a SYRK or SYR2K launch.
+ *
+ * Arguments are written to consecutive slots of 'args' in this order:
+ * M, K, alpha, A, lda, [B, ldb - only for CLBLAS_SYR2K],
+ * [beta - unless KEXTRA_BETA_ZERO is set], C, ldc, offsetM, N, and then
+ * offA, offBX and offCY, each one only when the corresponding
+ * KEXTRA_*_OFF_NOT_ZERO flag is set in 'kflags'.
+ */
+static void
+assignKargs(
+    KernelArg *args,
+    const CLBlasKargs *blasArgs,
+    KernelExtraFlags kflags,
+    BlasFunctionID funcID)
+{
+    const bool withB = (funcID == CLBLAS_SYR2K);
+    const bool withBeta = ((kflags & KEXTRA_BETA_ZERO) == 0);
+    int slot = 0;
+
+    /* M holds the height of the diagonal part */
+    initSizeKarg(&args[slot++], blasArgs->M);
+    initSizeKarg(&args[slot++], blasArgs->K);
+    assignScalarKarg(&args[slot++], &(blasArgs->alpha), blasArgs->dtype);
+
+    /* matrix A and its leading dimension */
+    initMemobjKarg(&args[slot++], blasArgs->A, NULL, 0, 0);
+    initSizeKarg(&args[slot++], blasArgs->lda.matrix);
+
+    /* matrix B and its leading dimension, rank-2k update only */
+    if (withB) {
+        initMemobjKarg(&args[slot++], blasArgs->B, NULL, 0, 0);
+        initSizeKarg(&args[slot++], blasArgs->ldb.matrix);
+    }
+
+    if (withBeta) {
+        assignScalarKarg(&args[slot++], &(blasArgs->beta), blasArgs->dtype);
+    }
+
+    /* output matrix C, its leading dimension and the diagonal offset */
+    initMemobjKarg(&args[slot++], blasArgs->C, NULL, 0, 0);
+    initSizeKarg(&args[slot++], blasArgs->ldc.matrix);
+    initSizeKarg(&args[slot++], blasArgs->offsetM);
+    /* Original N */
+    initSizeKarg(&args[slot++], blasArgs->N);
+
+    /* optional buffer offsets */
+    if ((kflags & KEXTRA_A_OFF_NOT_ZERO) != 0) {
+        initSizeKarg(&args[slot++], blasArgs->offA);
+    }
+    if ((kflags & KEXTRA_BX_OFF_NOT_ZERO) != 0) {
+        initSizeKarg(&args[slot++], blasArgs->offBX);
+    }
+    if ((kflags & KEXTRA_CY_OFF_NOT_ZERO) != 0) {
+        initSizeKarg(&args[slot++], blasArgs->offCY);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Kernel argument callback for SYRK: 'params' is a CLBlasKargs and 'extra'
+ * is a CLBLASKernExtra whose flags select the optional arguments.
+ */
+static void
+syrkAssignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)extra;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
+
+    assignKargs(args, blasArgs, kextra->flags, CLBLAS_SYRK);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Kernel argument callback for SYR2K: 'params' is a CLBlasKargs and 'extra'
+ * is a CLBLASKernExtra whose flags select the optional arguments.
+ */
+static void
+syr2kAssignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)extra;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
+
+    assignKargs(args, blasArgs, kextra->flags, CLBLAS_SYR2K);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Calculate the global work size for a SYRK/SYR2K kernel launch.
+ *
+ * The number of work groups is counted by walking the diagonal part of the
+ * output in panels of subdims[0].x columns and adding, for every panel, the
+ * number of subdims[0].y high blocks between 'start' and 'end'. Unless only
+ * the diagonal is evaluated, the groups covering the rectangular part are
+ * added as well.
+ *
+ * threads  - output; threads[0] receives nrGroups * pgran->wgSize[0],
+ *            threads[1] is set to 1
+ * subdims  - decomposition; only subdims[0].x and subdims[0].y are read
+ * pgran    - granularity; only wgSize[0] is read
+ * args     - CLBlasKargs of the call (M, N, offsetM and uplo are read)
+ * extra    - CLBLASKernExtra (flags, vecLenA and vecLenB are read)
+ */
+static void
+syrkCalcThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra)
+{
+    const CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    CLBlasKargs *blasArgs = (CLBlasKargs*)args;
+    size_t nrGroups = 0;
+    size_t x, procX, startN, N, origN, step;
+    bool isU = (blasArgs->uplo == clblasUpper);
+    KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags;
+    KernelExtraFlags diagFlags = KEXTRA_SYRK_SEPARATE_DIAGONAL |
+                                 KEXTRA_SYRK_EVALUATE_DIAGONAL;
+    // the diagonal area is handled by a separate kernel invocation
+    bool isDiagSep = ((kflags & KEXTRA_SYRK_SEPARATE_DIAGONAL) != 0);
+    // both diagonal flags set: this invocation covers only the diagonal area
+    bool isEvalOnlyDiag = ((kflags & diagFlags) == diagFlags);
+    size_t start, end;
+    int roundDir = 1;
+    size_t vecAlign = 1;
+
+    /*
+     * Traverse the output matrix with panels from
+     * the largest one
+     */
+    N = blasArgs->M;                // width of the diagonal part
+    startN = blasArgs->offsetM;     // vertical offset of the diagonal part
+    origN = blasArgs->N;
+    x = (isU) ? N : 0;
+    step = subdims[0].x;
+
+    /*
+     * NOTE:
+     *
+     * When the area around the diagonal is evaluated separately, it is
+     * critically important that at least one step is aligned.
+     * Otherwise, the solution areas will overlap, which leads to a wrong
+     * result.
+     */
+
+    // roundDir selects rounding of the per-panel block count: 1 - up, 0 - down
+    if (isU && (isDiagSep && !isEvalOnlyDiag)) {
+        roundDir = 0;
+    }
+    else {
+        roundDir = 1;
+    }
+
+    // for the lower case 'end' is additionally rounded up to vecAlign below
+    if (!isU && (!isDiagSep || isEvalOnlyDiag)) {
+        vecAlign = isMatrixAccessColMaj(CLBLAS_SYRK, kflags, MATRIX_A) ?
+                        (size_t)umax(kextra->vecLenA, kextra->vecLenB) : 1;
+    }
+
+    for (procX = 0; procX < N; procX += step) {
+        if (isU) {
+            // the first step consumes the unaligned remainder of x, if any
+            step = (isU && (x % subdims[0].x)) ? (x % subdims[0].x) :
+                                                  subdims[0].x;
+            start = (!isEvalOnlyDiag) ? 0 : roundDown(x - step, subdims[0].y);
+            end = (!isDiagSep || isEvalOnlyDiag) ? x :
+                                        roundDown(x - step, subdims[0].y);
+            x -= step;
+        }
+        else {
+            start = (!isDiagSep || isEvalOnlyDiag) ? x :
+                                    roundUp(x + step, subdims[0].y);
+            end = (isEvalOnlyDiag) ? roundUp(x + step, subdims[0].y) : N;
+
+            end = roundUp(end, vecAlign);
+            x += step;
+            // nothing to process in this panel
+            if (start >= end) {
+                continue;
+            }
+        }
+
+        if (roundDir) {
+            nrGroups += divRoundUp(end - start, subdims[0].y);
+        }
+        else {
+            nrGroups += (end - start) / subdims[0].y;
+        }
+    }
+
+    /* rectangular part of trapezium */
+    if (!isEvalOnlyDiag) {
+        if (isU) {
+            // NOTE(review): origN - N - startN is unsigned arithmetic; it
+            // wraps around if N + startN exceeds origN - confirm the callers
+            // guarantee that this cannot happen
+            nrGroups += divRoundUp(N, subdims[0].y) *
+                divRoundUp(origN - N - startN, subdims[0].x);
+        }
+        else {
+            nrGroups += (startN / subdims[0].x) * divRoundUp(N, subdims[0].y);
+        }
+    }
+
+    if (nrGroups == 0) { // in case we got N==0
+        nrGroups = 1;
+    }
+    threads[0] = nrGroups * pgran->wgSize[0];
+    threads[1] = 1;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Solver flags callback: always returns SF_WSPACE_1D. syrkCalcThreads()
+ * accordingly fills only threads[0] with a real size and sets threads[1]
+ * to 1.
+ */
+static SolverFlags
+solverFlags(void)
+{
+    return SF_WSPACE_1D;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Store problem dependent values in the solver private area of the kernel
+ * extra information.
+ *
+ * args     - CLBlasKargs of the call; dtype, N and uplo are read
+ * subdims  - decomposition; only subdims[1].y is read
+ * extra    - CLBLASKernExtra whose solverPriv area is updated
+ *
+ * NOTE(review): the same solverPriv storage is accessed both as
+ * 'extraData_t' (staggered) and as 'SyrxkExtraPriv' (maxVlenC). Presumably
+ * these are the same layout or the fields do not overlap - confirm against
+ * the type declarations.
+ */
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs*)args;
+    extraData_t *extraData = (extraData_t*)&kextra->solverPriv;
+    const size_t wideChans = 64; // !!!DEVICE DEPENDED!!!
+    const size_t sizeType[] = {1,2,2,4};
+    size_t moddim;
+
+    /*
+     * The stagger is fixed, not calculated. The former calculation based on
+     * K and the block width was always overwritten by this value, so it had
+     * no effect and has been removed.
+     */
+    extraData->staggered = wideChans / sizeType[blasArgs->dtype];
+
+    /*
+     * Save the maximum possible vectorization for C in case of column-major
+     * order and lower triangular matrix C. It is needed because the 'y'
+     * problem dimension expands in backward direction and aligned access to
+     * memory can occur.
+     *
+     * The remainder is kept as size_t: narrowing it to unsigned int could
+     * turn a non-zero remainder into zero for very large dimensions.
+     */
+    moddim = blasArgs->N % subdims[1].y;
+    if (isMatrixAccessColMaj(CLBLAS_SYRK, kextra->flags, MATRIX_C) &&
+        (blasArgs->uplo == clblasLower) && moddim) {
+
+        SyrxkExtraPriv *priv = (SyrxkExtraPriv*)kextra->solverPriv;
+        size_t tsize;
+
+        tsize = dtypeSize(kextra->dtype);
+        priv->maxVlenC = appropriateVecLen(blasArgs->N, tsize, subdims[1].y, 3);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Either validate a decomposition or derive the granularity for it.
+ *
+ * With check == PGRAN_CHECK the function returns true only if the
+ * decomposition passes decompSanityCheck(), both levels use the same
+ * bwidth and the work group size is 64. With any other 'check' value the
+ * granularity is filled in by calcPgranDedicated() and true is returned.
+ */
+static bool
+checkCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check)
+{
+    unsigned int minSize, maxSize;
+
+    DUMMY_ARG_USAGE(subdimsNum);
+
+    if (check != PGRAN_CHECK) {
+        calcPgranDedicated(pgran, subdims, -1, 3);
+        return true;
+    }
+
+    /* double complex gets tighter tile size limits */
+    if (dtype == TYPE_COMPLEX_DOUBLE) {
+        minSize = 1;
+        maxSize = 4;
+    }
+    else {
+        minSize = 2;
+        maxSize = 8;
+    }
+
+    if (!decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true)) {
+        return false;
+    }
+    if (subdims[0].bwidth != subdims[1].bwidth) {
+        return false;
+    }
+
+    return (pgran->wgSize[0] == 64);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Initialize the memory pattern descriptor of the block based syr2k solver.
+ * The file-scope 'mpatExtra' is updated and attached as the pattern extra
+ * data.
+ */
+void
+initSyr2kBlockPattern(MemoryPattern *mempat)
+{
+    /* both matrices use the CLMEM_LEVEL_L1 memory level setting */
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+
+    mempat->name = "Cached global memory based block syr2k";
+    mempat->sops = &syr2kSolverOps;
+    mempat->extra = &mpatExtra;
+
+    /* two decomposition levels: 0 and 1 */
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Initialize the memory pattern descriptor of the block based syrk solver.
+ * The file-scope 'mpatExtra' is updated and attached as the pattern extra
+ * data.
+ */
+void
+initSyrkBlockPattern(MemoryPattern *mempat)
+{
+    /* both matrices use the CLMEM_LEVEL_L1 memory level setting */
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+
+    mempat->name = "Cached global memory based block syrk";
+    mempat->sops = &syrkSolverOps;
+    mempat->extra = &mpatExtra;
+
+    /* two decomposition levels: 0 and 1 */
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Initialize the memory pattern descriptor of the subgroup based syrk
+ * solver. The file-scope 'mpatExtra' is updated and attached as the pattern
+ * extra data.
+ */
+void
+initSyrkSubgPattern(MemoryPattern *mempat)
+{
+    /* both matrices use the CLMEM_LEVEL_L1 memory level setting */
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+
+    mempat->name = "Cached global memory based subgroup syrk";
+    mempat->sops = &syrkSubgSops;
+    mempat->extra = &mpatExtra;
+
+    /* two decomposition levels: 0 and 1 */
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Initialize the memory pattern descriptor of the subgroup based syr2k
+ * solver. The file-scope 'mpatExtra' is updated and attached as the pattern
+ * extra data.
+ */
+void
+initSyr2kSubgPattern(MemoryPattern *mempat)
+{
+    /* both matrices use the CLMEM_LEVEL_L1 memory level setting */
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+
+    mempat->name = "Cached global memory based subgroup syr2k";
+    mempat->sops = &syr2kSubgSops;
+    mempat->extra = &mpatExtra;
+
+    /* two decomposition levels: 0 and 1 */
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+// ----------------------------------------------------------------------------
+
+/*
+ * Performance estimate of the subgroup pattern: PPERF_GOOD when neither A
+ * nor B is accessed in column-major order, PPERF_NOT_SUPPORTED otherwise.
+ * 'args' is not used.
+ */
+static int
+syrkSubgGetPerf(
+        unsigned int kflags,
+        const void *args)
+{
+    DUMMY_ARG_USAGE(args);
+
+    if ( isMatrixAccessColMaj( CLBLAS_SYRK, kflags, MATRIX_A ) ||
+         isMatrixAccessColMaj( CLBLAS_SYRK, kflags, MATRIX_B ) ) {
+
+        return PPERF_NOT_SUPPORTED;
+    }
+
+    return PPERF_GOOD;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Performance estimate of the block pattern: PPERF_AVERAGE when neither A
+ * nor B is accessed in column-major order, PPERF_GOOD otherwise.
+ * 'args' is not used.
+ */
+static int
+syrkBlockGetPerf(
+        unsigned int kflags,
+        const void *args)
+{
+    DUMMY_ARG_USAGE(args);
+
+    if ( isMatrixAccessColMaj( CLBLAS_SYRK, kflags, MATRIX_A ) ||
+         isMatrixAccessColMaj( CLBLAS_SYRK, kflags, MATRIX_B ) ) {
+
+        return PPERF_GOOD;
+    }
+
+    return PPERF_AVERAGE;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Default decomposition callback of the subgroup pattern.
+ *
+ * Forces a one-dimensional work group (wgDim = 1) and delegates the rest to
+ * subgGetDefaultDecomp(), whose return value is passed through.
+ * 'subdimsNum' is not used. 'pgran' is dereferenced without a NULL check.
+ */
+static int
+syrkSubgGetDefaultDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs )
+{
+    DUMMY_ARG_USAGE(subdimsNum);
+    pgran->wgDim = 1;
+    return subgGetDefaultDecomp( pgran, subdims, pArgs );
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Disabled debug helper: a hard-coded default decomposition for the block
+ * pattern. It is excluded from the build by '#if 0'.
+ */
+#if 0
+
+// for debug
+static int
+syrkBlockGetDefaultDecomp(
+        PGranularity *pgran,
+        SubproblemDim *subdims,
+        unsigned int subdimsNum)
+{
+    // !!! DEBUG
+    // two alternative fixed decompositions, selected at compile time
+#if 1
+    subdims[0].itemX = subdims[0].x = 64;
+    subdims[0].itemY = subdims[0].y = 32;
+    subdims[0].bwidth = subdims[1].bwidth = 2;
+    subdims[1].itemX = subdims[1].x = 8;
+    subdims[1].itemY = subdims[1].y = 4;
+#else
+    subdims[0].itemX = subdims[0].x = 32;
+    subdims[0].itemY = subdims[0].y = 32;
+    subdims[0].bwidth = subdims[1].bwidth = 4;
+    subdims[1].itemX = subdims[1].x = 4;
+    subdims[1].itemY = subdims[1].y = 4;
+#endif
+    pgran->wgDim = 1;
+    pgran->wgSize[0] = 64;
+
+    return 0;
+    //////////////////////////////////////////////////
+    // Everything below is unreachable because of the unconditional return
+    // above. Note that the argument validation would come only after the
+    // pointers have already been dereferenced.
+
+    if( (subdimsNum<2)||
+        (NULL==pgran)||
+        (NULL==subdims) ){
+
+        return EINVAL;
+    }
+
+    pgran->wgDim = 1;
+    pgran->wgSize[0] = 64;
+
+    subdims[1].bwidth = 4;
+    subdims[1].itemX = subdims[1].x = 4;
+    subdims[1].itemY = subdims[1].y = 4;
+
+    //subdims[0].bwidth = subdims[1].bwidth * itemsPerSubg;
+    subdims[0].bwidth = subdims[1].bwidth;
+    subdims[0].itemX = subdims[0].x = subdims[1].x * 8;
+    subdims[0].itemY = subdims[0].y = subdims[1].y * 8;
+
+    return 0;
+
+}
+
+#endif
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Validate a decomposition for the subgroup pattern and either check or
+ * calculate the matching granularity.
+ *
+ * Returns false if any dimension is zero, if there are fewer than 4 items
+ * per subgroup, if an item tile dimension is outside of [4, 16], if x/y
+ * differ from itemX/itemY at level 1, if the group block is not an integer
+ * number of subgroup blocks, if twice the item bwidth exceeds 16 for complex
+ * types, or if the estimated register usage reaches 64 registers.
+ *
+ * With check == PGRAN_CHECK the passed granularity must be one-dimensional
+ * with 64 work items matching the decomposition; otherwise the granularity
+ * is calculated from the decomposition and stored in 'pgran'.
+ */
+static bool
+subgCheckCalcDecomp(
+        PGranularity *pgran,
+        SubproblemDim *subdims,
+        unsigned int subdimsNum,
+        DataType dtype,
+        int check)
+{
+    const SubproblemDim *grp = &subdims[0];
+    const SubproblemDim *item = &subdims[1];
+    size_t subgRows;
+    size_t subgCols;
+    size_t regCount;
+    unsigned int itemsPerSubg;
+
+    DUMMY_ARG_USAGE(subdimsNum);
+
+    // no dimension may be zero; this also protects the divisions below
+    if( grp->x == 0 || grp->y == 0 || grp->bwidth == 0 ||
+        item->x == 0 || item->y == 0 || item->bwidth == 0 ){
+
+        return false;
+    }
+
+    subgRows = grp->y / item->y;
+    subgCols = grp->x / item->x;
+    itemsPerSubg = grp->bwidth / item->bwidth;
+
+    if( itemsPerSubg < 4 ){
+        return false;
+    }
+
+    // lower bounds of the item tile
+    if( item->y < 4 || item->x < 4 || item->bwidth < 4 ){
+        return false;
+    }
+
+    if( item->x != item->itemX || item->y != item->itemY ){
+        return false;
+    }
+
+    // the group block must consist of integer number of subgroup blocks
+    if( (grp->x % item->x) != 0 ||
+        (grp->y % item->y) != 0 ||
+        (grp->bwidth % item->bwidth) != 0 ){
+
+        return false;
+    }
+
+    // check fitting of bw to common vector sizes
+    if( isComplexType(dtype) && (2*item->bwidth > 16) ){
+        return false;
+    }
+
+    // upper bounds of the item tile
+    if( item->bwidth > 16 || item->x > 16 || item->y > 16 ){
+        return false;
+    }
+
+    // estimate register usage, drop
+    // inevitably slowed decompositions
+    regCount =
+        (   item->bwidth * item->x +
+            item->bwidth * item->y +
+            item->x * item->y ) *
+        dtypeSize(dtype);
+
+    regCount /= 16; // 16 bytes per register
+
+    if( regCount >= 64 ){
+        return false;
+    }
+
+    // passed PGranularity should be checked
+    if( check == PGRAN_CHECK ){
+        return (pgran->wgDim == 1) &&
+               (pgran->wgSize[0] == 64) &&
+               (pgran->wgSize[0] == subgRows*subgCols*itemsPerSubg);
+    }
+
+    // PGranularity should be calculated
+    pgran->wgDim = 1;
+    pgran->wgSize[0] = subgRows * subgCols * itemsPerSubg;
+
+    return true;
+}
diff --git a/src/library/blas/gens/tests/CMakeLists.txt b/src/library/blas/gens/tests/CMakeLists.txt
new file mode 100644
index 0000000..f945b1e
--- /dev/null
+++ b/src/library/blas/gens/tests/CMakeLists.txt
@@ -0,0 +1,60 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# Sources of the stand-alone tilemul generator test: the tile multiplication
+# and fetch generators, the library sources listed along with them, and the
+# test driver itself (t_tilemul.c).
+set(SRC_TILEMUL
+    ../tilemul.c
+    ../fetch.c
+    ${clBLAS_SOURCE_DIR}/library/common/kerngen_core.c
+    ${clBLAS_SOURCE_DIR}/library/common/kgen_basic.c
+    ${clBLAS_SOURCE_DIR}/library/common/kgen_loop_helper.c
+    ${clBLAS_SOURCE_DIR}/library/common/misc.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/blas_kgen.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/tile.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/tile_iter.c
+    ${clBLAS_SOURCE_DIR}/library/blas/gens/gen_helper.c
+    ${clBLAS_SOURCE_DIR}/library/blas/generic/blas_funcs.c
+    ${clBLAS_SOURCE_DIR}/library/blas/generic/matrix_dims.c
+    ${clBLAS_SOURCE_DIR}/library/blas/generic/matrix_props.c
+    ${clBLAS_SOURCE_DIR}/library/common/gens/dblock_kgen.c
+    ${clBLAS_SOURCE_DIR}/library/common/kgen_guard.c
+    ${clBLAS_SOURCE_DIR}/library/common/list.c
+    ${clBLAS_SOURCE_DIR}/library/common/mutex.c 
+    ${clBLAS_SOURCE_DIR}/library/common/trace_malloc.c
+    t_tilemul.c
+)
+
+# Header search path: OpenCL headers plus the clBLAS source tree directories
+include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include
+                    ${clBLAS_SOURCE_DIR}/library/blas/include ${clBLAS_SOURCE_DIR}/library/blas/gens)
+
+# The test is a single executable linked against the OpenCL libraries
+add_executable(t_tilemul ${SRC_TILEMUL})
+target_link_libraries(t_tilemul ${OPENCL_LIBRARIES})
+
+# Install into bin64/lib64 when TARGET_PLATFORM equals 64, into bin32/lib32
+# otherwise
+if( TARGET_PLATFORM EQUAL 64 )
+	# CPack configuration; include the executable into the package
+	install( TARGETS t_tilemul
+			RUNTIME DESTINATION bin64
+			LIBRARY DESTINATION lib64
+			ARCHIVE DESTINATION lib64/import
+			)
+else()
+	# CPack configuration; include the executable into the package
+	install( TARGETS t_tilemul
+			RUNTIME DESTINATION bin32
+			LIBRARY DESTINATION lib32
+			ARCHIVE DESTINATION lib32/import
+			)
+endif()
diff --git a/src/library/blas/gens/tests/t_tilemul.c b/src/library/blas/gens/tests/t_tilemul.c
new file mode 100644
index 0000000..ba4b49c
--- /dev/null
+++ b/src/library/blas/gens/tests/t_tilemul.c
@@ -0,0 +1,1099 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <CL/cl.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <kerngen.h>
+#include <blas_kgen.h>
+#include <clblas_stddef.h>
+
+/*
+ * When non-zero, genTest() fills the input tiles with constants instead of
+ * fetching them from memory, so that only the multiplication is exercised.
+ */
+#define JUST_MULTIPLICATION 0
+
+/*
+ * Test geometry. ITEM_WORK_M and ITEM_WORK_N are used as the work group
+ * size of the generated kernel.
+ * NOTE(review): RAND_BOUND is defined only when JUST_MULTIPLICATION is 0;
+ * confirm that no code compiled in the other configuration refers to it.
+ */
+#if JUST_MULTIPLICATION
+enum {
+    ITEM_WORK_M = 1,
+    ITEM_WORK_N = 1,
+    ITEM_BLOCKS_K = 1,
+};
+#else
+enum {
+    ITEM_WORK_M = 4,
+    ITEM_WORK_N = 4,
+    ITEM_BLOCKS_K = 3,
+    RAND_BOUND = 10
+};
+#endif
+
+// name of the generated OpenCL kernel function
+const char *kernelName = "tilemul_test";
+
+
+// float types based unified pointer:
+// one pointer value viewed as a pointer to any of the supported element
+// types (float, double, float2, double2) or as an untyped pointer
+typedef union FPtr {
+  void *v;
+  cl_float *f;
+  cl_double *d;
+  cl_float2 *f2;
+  cl_double2 *d2;
+} FPtr;
+
+// float type based unified data type:
+// a single scalar value of any of the supported element types.
+// NOTE(review): the raw byte view 'u' has sizeof(cl_double) bytes, which
+// presumably does not cover the whole cl_double2 member - confirm before
+// relying on 'u' for double complex values.
+typedef union FType {
+    unsigned char u[sizeof(cl_double)];
+    cl_float f;
+    cl_float2 f2;
+    cl_double d;
+    cl_double2 d2;
+} FType;
+
+/*
+ * Print the command line help to stdout and terminate the process.
+ *
+ * programName - name shown in the USAGE line
+ * exitCode    - status passed to exit(); this function does not return
+ */
+static void
+printUsage(const char *programName, int exitCode)
+{
+    printf( "USAGE: %s [options] <M N K>\n"
+            "  --help, -h                Print this help message.\n"
+            "  --device, -d <device>     OpenCL device used. <device> can "
+            "be \"gpu\" or \"cpu\". Default is \"gpu\".\n"
+            "  --type, -t <type>         Type can be s, d, c or z. Default "
+            "is s.\n"
+            "  --fetch, -f <vector size> Size of used fetch vectors, in used "
+            "types. Default is 1.\n"
+            "  --local, -l <matrix>      If matrix is local or global. Matrix "
+            "can be A or B. By default, both are global.\n"
+            "  --verbose, -v             Turn on verbose mode.\n"
+            "  --a, -a <order>\n"
+            "  --b, -b <order>\n"
+            "                            Set order for tiles a and b fetching. "
+            "Order can be \"r\" for row major and \"c\" for "
+            "column major. Default values are \"r\" for A and \"c\" for B.\n"
+            "  --skew, -s <skew_value>   Set skews for tiles along M, N, and K "
+            "directions. skew_value can be \"a\" for tile A skew along M, \"b\""
+            " for tile B skew along N and \"k\" for both tiles skew along K. "
+            "There are no skews by default.\n"
+            "  --globalcycling, -g <global_cycling_value>\n"
+            "                            Set global cycling for tiles along M, "
+            "N and K directions. global_cycling_value can be \"a\" for tile A "
+            "global cycling along M, \"b\" for tile B global cycling along N "
+            "and \"k\" for both tiles global cycling along K. There is no "
+            "global cycling enabled by default.\n"
+            "  --iter, -i <num>          Number of iterations.\n"
+            "  --core, -c <mulcore>      Multiplier core. <mulcore> can "
+            "be \"muladd\", \"mad\" or \"dot\". Default is \"mad\".\n"
+            "  --old, -o                 Use old tilemul generator interface "
+            "with one generator function call for both fetching and "
+            "multiplication. Separate generators functions are used by "
+            "default.\n"
+            "  M N K                     Size of block.\n",
+           programName);
+    exit(exitCode);
+}
+
+/*
+ * Generate statements assigning NAN to every element of a tile, followed by
+ * a blank line.
+ *
+ * The tile is walked in vector-sized steps: along the columns for a
+ * non-transposed tile and along the rows for a transposed one. The step is
+ * the smaller of the tile vector length and the tile size in that direction.
+ *
+ * ctx  - generator context receiving the statements
+ * tile - tile description (trans, vecLen, nrRows and nrCols are read)
+ */
+void
+genFillTileWithNAN(struct KgenContext *ctx, const Tile *tile)
+{
+    char tmp[1024];
+    Kstring elem;
+    unsigned int incRows, incCols;
+    unsigned int i, j, v;
+
+    if (!tile->trans) {
+        incRows = 1;
+        v = incCols = umin(tile->vecLen, tile->nrCols);
+    }
+    else {
+        v = incRows = umin(tile->vecLen, tile->nrRows);
+        incCols = 1;
+    }
+
+    /*
+     * A zero step means there is nothing to fill (empty tile) or the vector
+     * length is zero; in the latter case the loops below would never
+     * terminate, so bail out after the trailing blank line.
+     */
+    if (v == 0) {
+        kgenAddBlankLine(ctx);
+        return;
+    }
+
+    for (i = 0; i < tile->nrRows; i += incRows) {
+        for (j = 0; j < tile->nrCols; j += incCols) {
+            sprintfTileElement(&elem, tile, i, j, v);
+            /* bounded formatting: never write past the end of tmp */
+            snprintf(tmp, sizeof(tmp), "%s = NAN;\n", elem.buf);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Emit the declarations that have to precede the test kernel. Currently this
+ * only forwards to kgenDeclareUptrs(), passing 'isDouble' through.
+ */
+void
+addTestPrefix(struct KgenContext *ctx, bool isDouble)
+{
+    kgenDeclareUptrs(ctx, isDouble);
+}
+
+/*
+ * Abort the test if a generator reported an error.
+ *
+ * ret     - generator return code; zero means success, otherwise the
+ *           negated value is passed to strerror()
+ * genName - generator name used in the error message
+ */
+static void checkRet(int ret, const char *genName)
+{
+    if (ret == 0) {
+        return;
+    }
+
+    printf("%s generator failed: %s\n", genName, strerror(-ret));
+    exit(EXIT_FAILURE);
+}
+
+/*
+ * Generate the source of the OpenCL test kernel for the tile multiplier.
+ *
+ * The generated kernel takes alpha, the matrices A and B, the sizes M, N, K,
+ * the output matrix C and an iteration count. In every iteration it zeroes
+ * the result tile and runs a loop over K, and finally stores the result tile
+ * to C.
+ *
+ * ctx           - generator context receiving the kernel source
+ * gset          - generator settings; the data type, the subproblem
+ *                 dimensions and the vector length are read from it, and its
+ *                 variable names and tiles are set up here
+ * mulOpts       - tile multiplication options; the flags and the memory
+ *                 types of A and B are read
+ * separateFetch - if false, a single tileMulGen() call generates both
+ *                 fetching and multiplication; if true, the fetches and
+ *                 genMulTiles() are generated separately
+ *
+ * For an unknown data type the function returns without generating anything.
+ * Generator failures terminate the process through checkRet().
+ *
+ * NOTE(review): size_t values are formatted with "%lu" and unsigned values
+ * with "%d" in several places below; this presumably relies on the sizes of
+ * these types matching on the supported platforms - confirm.
+ */
+void
+genTest(
+    struct KgenContext *ctx,
+    BlasGenSettings *gset,
+    TileMulOpts *mulOpts,
+    bool separateFetch)
+{
+    char s[1024];
+    Kstring kstr;
+    char *tName, tVect[64], *ptrName;
+    KernelVarNames *vnames = &gset->varNames;
+    DataType dtype = gset->kextra->dtype;
+    const SubproblemDim *subdims = gset->subdims;
+    unsigned int vecLen = gset->kextra->vecLen;
+    size_t m, n, k;
+    unsigned int i, j;
+    bool tra, trb, localA, localB, vecCoords;
+    int ret;
+    TileMulFlags flags = mulOpts->flags;
+    FetchOpts fetchOpts;
+
+    // size of the tile processed by one work item
+    m = gset->subdims[1].y;
+    n = gset->subdims[1].x;
+    k = gset->subdims[1].bwidth;
+
+    tra = ((flags & TILEMUL_TRA) != 0);
+    trb = ((flags & TILEMUL_TRB) != 0);
+    localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
+    localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);
+
+    vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0);
+
+    // vector suffix of the A and B pointer types; empty unless vectorized
+    // coordinates are used with a vector length other than 1
+    tVect[0] = '\0';
+
+    if (vecCoords && vecLen != 1) {
+        sprintf(tVect, "%u", vecLen);
+    }
+
+    // OpenCL type name and the unified pointer field matching the data type
+    switch (dtype) {
+    case TYPE_FLOAT:
+        tName = "float";
+        ptrName = "f";
+        break;
+    case TYPE_DOUBLE:
+        tName = "double";
+        ptrName = "d";
+        break;
+    case TYPE_COMPLEX_FLOAT:
+        tName = "float2";
+        ptrName = "f2v";
+        break;
+    case TYPE_COMPLEX_DOUBLE:
+        tName = "double2";
+        ptrName = "d2v";
+        break;
+    default:
+        return;
+    }
+
+    // names of the kernel variables used by the generators
+    if (vecCoords) {
+        //Do not use GPtrs in fetching
+        vnames->A = "A";
+        vnames->B = "B";
+    }
+    else {
+        vnames->A = localA ? "LAptr" : "((GPtr)A)";
+        vnames->B = localB ? "LBptr" : "((GPtr)B)";
+    }
+    if (!localA) {
+        vnames->lda = "lda";
+
+    }
+    if (!localB) {
+        vnames->ldb = "ldb";
+    }
+    vnames->sizeM = "M";
+    vnames->sizeN = "N";
+    vnames->sizeK = "K";
+    vnames->skewA = "skewA";
+    vnames->skewB = "skewB";
+    vnames->skewK = "skewK";
+    vnames->coordA = "workItemM";
+    vnames->coordB = "workItemN";
+    vnames->k = "k";
+
+    // kernel attributes and signature
+    kgenAddBlankLine(ctx);
+    sprintf(s, "__attribute__((reqd_work_group_size(%i, %i, 1)))\n",
+            ITEM_WORK_M, ITEM_WORK_N);
+    kgenAddStmt(ctx, s);
+    kgenAddStmt(ctx, "__kernel void\n");
+    sprintf(s, "%s(\n", kernelName);
+    kgenAddStmt(ctx, s);
+    sprintf(s,"    %s alpha,\n", tName);
+    kgenAddStmt(ctx, s);
+    sprintf(s,"    __global %s%s *A,\n", tName, tVect);
+    kgenAddStmt(ctx, s);
+    sprintf(s,"    __global %s%s *B,\n", tName, tVect);
+    kgenAddStmt(ctx, s);
+    kgenAddStmt(ctx, "    uint M,\n"
+                     "    uint N,\n"
+                     "    uint K,\n");
+    sprintf(s,
+            "    __global %s *C,\n"
+            "    const uint iter)\n", tName);
+    kgenAddStmt(ctx, s);
+    // kernel body: coordinates of the tile processed by the work item
+    kgenBeginFuncBody(ctx);
+    sprintf(s, "uint workItemM = %lu * get_global_id(0);\n"
+               "uint workItemN = %lu * get_global_id(1);\n",
+            m, n);
+    kgenAddStmt(ctx, s);
+    // skew variables are declared only if requested; all start from zero
+    if ((flags & TILEMUL_SKEW_A) != 0) {
+        kgenAddStmt(ctx, "uint skewA = 0u;\n");
+    }
+    if ((flags & TILEMUL_SKEW_B) != 0) {
+        kgenAddStmt(ctx, "uint skewB = 0u;\n");
+    }
+    if ((flags & TILEMUL_SKEW_K) != 0) {
+        kgenAddStmt(ctx, "uint skewK = 0u;\n");
+    }
+
+    // a local matrix gets a local buffer, a global one a leading dimension
+    if (localA) {
+        sprintf(s, "__local %s LA[%lu];\n",
+                tName, subdims[0].bwidth * subdims[0].y);
+        kgenAddStmt(ctx, s);
+    }
+    else { //global A
+        sprintf(s, "uint lda = %s;\n", tra ? "M" : "K");
+        kgenAddStmt(ctx, s);
+    }
+    if (localB) {
+        sprintf(s, "__local %s LB[%lu];\n",
+                tName, subdims[0].bwidth * subdims[0].x);
+        kgenAddStmt(ctx, s);
+    }
+    else { //global B
+        sprintf(s, "uint ldb = %s;\n", trb ? "K" : "N");
+        kgenAddStmt(ctx, s);
+    }
+
+    initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY);
+    declareTileStorages(ctx, gset);
+
+    // precomputed vectors of line offsets, "ca" for A and "cb" for B
+    if (vecCoords) {
+        size_t ha, hb;
+        char *str;
+
+        // number of offsets in each vector
+        ha = tra ? k : m;
+        hb = trb ? n : k;
+
+        if (ha > 1) {
+            str = s;
+            str += sprintf(str, "uint%lu ca = {0", ha);
+            for (i = 1; i < ha; i++) {
+                str += sprintf(str, ", %s * %u / %u", vnames->lda, i, vecLen);
+            }
+            str += sprintf(str, "};\n");
+            kgenAddStmt(ctx, s);
+        }
+        else {
+            kgenAddStmt(ctx, "uint ca = 0;\n");
+        }
+        vnames->vectCoordA = "ca";
+
+        if (hb > 1) {
+            str = s;
+            str += sprintf(str, "uint%lu cb = {0", hb);
+            for (i = 1; i < hb; i++) {
+                str += sprintf(str, ", %s * %u / %u", vnames->ldb, i, vecLen);
+            }
+            str += sprintf(str, "};\n");
+            kgenAddStmt(ctx, s);
+        }
+        else {
+            kgenAddStmt(ctx, "uint cb = 0;\n");
+        }
+        vnames->vectCoordB = "cb";
+
+//        uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3};
+//        uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3};
+    }
+
+    kgenAddBlankLine(ctx);
+
+    // iterations loop: the whole multiplication is repeated 'iter' times
+    sprintf(s, "for (int it = 0; it < iter; it++)");
+    kgenBeginBranch(ctx, s);
+
+    if (!(localA && localB)) {
+        kgenAddStmt(ctx, "uint k = 0;\n");
+    }
+
+    genZeroTile(ctx, &gset->tileCY);
+
+    // with vectorized coordinates the base pointers are advanced to the
+    // start of the tiles of this work item
+    if (vecCoords) {
+        char *coordsA[2] = {"workItemM", "k"};
+        char *coordsB[2] = {"k", "workItemN"};
+        sprintf(s, "A += %s * (lda / %u) + %s / %u;\n",
+                coordsA[tra], vecLen, coordsA[1 - tra], vecLen);
+        kgenAddStmt(ctx, s);
+        sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n",
+                coordsB[trb], vecLen, coordsB[1 - trb], vecLen);
+        kgenAddStmt(ctx, s);
+    }
+
+    // K loop with the step of the top level block width
+    sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth);
+    kgenBeginBranch(ctx, s);
+
+    /* Copy data to local memory. We know that the size of matrix is the same
+     * that the size of one block and use that.
+     */
+    if (localA) {
+        sprintf(s,
+                "event_t evA = async_work_group_copy(LA, A, %lu, 0);\n"
+                "wait_group_events(1, &evA);\n"
+                "barrier(CLK_LOCAL_MEM_FENCE);\n",
+                subdims[0].y * subdims[0].bwidth);
+        kgenAddStmt(ctx, s);
+        kgenAddStmt(ctx, "LPtr LAptr;\n");
+        if (tra) {
+            sprintf(s,
+                    "LAptr.%s = LA + workItemM;\n", ptrName);
+        }
+        else {
+            sprintf(s,
+                    "LAptr.%s = LA + workItemM * %lu;\n",
+                    ptrName, subdims[0].bwidth);
+        }
+        kgenAddStmt(ctx, s);
+    }
+    if (localB) {
+        sprintf(s,
+                "event_t evB = async_work_group_copy(LB, B, %lu, 0);\n"
+                "wait_group_events(1, &evB);\n"
+                "barrier(CLK_LOCAL_MEM_FENCE);\n",
+                subdims[0].x * subdims[0].bwidth);
+        kgenAddStmt(ctx, s);
+        kgenAddStmt(ctx, "LPtr LBptr;\n");
+        if (trb) {
+            sprintf(s, "LBptr.%s = LB + workItemN * %lu;\n",
+                    ptrName, subdims[0].bwidth);
+        }
+        else {
+            sprintf(s, "LBptr.%s = LB + workItemN;\n", ptrName);
+        }
+        kgenAddStmt(ctx, s);
+    }
+
+    if (!separateFetch) {
+        // old interface: one generator call for fetching and multiplication
+        ret = tileMulGen(ctx, gset, mulOpts);
+        checkRet(ret, "Multiplier");
+    }
+    else {
+        Tile *tileA = &gset->tileA;
+        Tile *tileB = &gset->tileBX;
+
+        memset(&fetchOpts, 0, sizeof(fetchOpts));
+        if (localA) {
+            fetchOpts.memA = CLMEM_LOCAL_MEMORY;
+        }
+        if (localB) {
+            fetchOpts.memB = CLMEM_LOCAL_MEMORY;
+        }
+
+        // the input tiles are filled with NAN before they are loaded
+        genFillTileWithNAN(ctx, tileA);
+        genFillTileWithNAN(ctx, tileB);
+
+        // inner loop over the item level blocks, needed only if the block
+        // widths of the two levels differ
+        if (subdims[0].bwidth != subdims[1].bwidth) {
+            sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
+                    subdims[0].bwidth, k);
+            kgenBeginBranch(ctx, s);
+        }
+
+#if JUST_MULTIPLICATION
+        // fill the tiles with the linear index of every element
+        for (i = 0; i < tileA->nrRows; i++) {
+            for(j = 0; j < tileA->nrCols; j++) {
+                sprintfTileElement(&kstr, tileA, i, j, 1);
+                sprintf(s, "%s = %u;\n", kstr.buf, i * tileA->nrCols + j);
+                kgenAddStmt(ctx, s);
+            }
+        }
+
+        for (i = 0; i < tileB->nrRows; i++) {
+            for(j = 0; j < tileB->nrCols; j++) {
+                sprintfTileElement(&kstr, tileB, i, j, 1);
+                sprintf(s, "%s = %u;\n", kstr.buf, i * tileB->nrCols + j);
+                kgenAddStmt(ctx, s);
+            }
+        }
+#else
+        // fetch tile B first, then tile A; all lines of a tile are fetched
+        fetchOpts.mrole = MATRIX_B;
+        fetchOpts.lineOffset = 0;
+        fetchOpts.linesNum = (tileB->trans) ? tileB->nrCols : tileB->nrRows;
+        ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
+        checkRet(ret, "Fetching tile b");
+
+        fetchOpts.mrole = MATRIX_A;
+        fetchOpts.linesNum = (tileA->trans) ? tileA->nrCols : tileA->nrRows;
+        kgenAddBlankLine(ctx);
+        fetchOpts.lineOffset = 0;
+        ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
+        checkRet(ret, "Fetching tile a");
+#endif
+        ret = genMulTiles(ctx, gset, mulOpts);
+        checkRet(ret, "Multiplier");
+#if ! JUST_MULTIPLICATION
+        // advance the K coordinate by the item level block width
+        sprintf(s, "k += %lu;\n", k);
+        kgenAddStmt(ctx, s);
+#endif
+        if (subdims[0].bwidth != subdims[1].bwidth) {
+            kgenEndBranch(ctx, NULL);
+        }
+    }
+    kgenEndBranch(ctx, NULL); // K loop
+    kgenEndBranch(ctx, NULL); // iterations loop
+
+    kgenAddBlankLine(ctx);
+
+    // store every element of the result tile to C, which is addressed as a
+    // row-major matrix with N columns
+    for (i = 0; i < m; i++) {
+        for (j = 0; j < n; j++) {
+            sprintfTileElement(&kstr, &gset->tileCY, i, j, 1);
+                sprintf(s,
+                        "((GPtr)C).%s"
+                    "[(%d + workItemM) * N  + %d + workItemN] = %s;\n",
+                    ptrName, i, j, kstr.buf);
+                kgenAddStmt(ctx, s);
+            }
+                }
+
+    kgenEndFuncBody(ctx);
+}
+
+/*
+ * Build the generated kernel source 'ker', run it once on the first device of
+ * type 'deviceType' found on the first platform, and compare the result with
+ * a naive host side multiplication of the same input matrices. Prints either
+ * "Match" or the first differing element, optionally dumps the matrices
+ * (single precision types only) and reports the profiled kernel time.
+ *
+ * M, N, K    - problem sizes: A holds M * K, B holds K * N, C holds M * N
+ *              elements
+ * alpha      - scalar passed to the kernel as argument 0
+ * gset       - generator settings; only kextra->dtype is read here
+ * flags      - TILEMUL_TRA / TILEMUL_TRB select how A and B are indexed
+ * verbose    - dump the input and output matrices
+ * iterNum    - passed to the kernel as argument 7, also used to scale the
+ *              reported time
+ *
+ * Returns CL_SUCCESS or the first OpenCL error met. Every OpenCL object and
+ * host buffer acquired here is released before returning.
+ */
+cl_int
+run (
+        const char *ker,
+        cl_uint M,
+        cl_uint N,
+        cl_uint K,
+        FType alpha,
+        BlasGenSettings *gset,
+        TileMulFlags flags,
+        cl_device_type deviceType,
+        bool verbose,
+        unsigned int iterNum)
+{
+    cl_int err;
+    cl_platform_id platform;
+    cl_context ctx;
+    cl_device_id device;
+    cl_command_queue queue;
+    cl_event evt;
+    DataType dtype = gset->kextra->dtype;
+
+    cl_mem bufA, bufB, bufC;
+    FPtr A, B, C, C_naive;
+    bool isComplex = isComplexType(dtype);
+    bool isDouble = isDoubleBasedType(dtype);
+    cl_uint nwords = (isComplex) ? 2 : 1;
+    unsigned int tsize = dtypeSize(dtype);
+    cl_kernel kernel;
+    size_t i, j, k;
+    /* number of scalars in C: complex elements consist of two of them */
+    size_t nscalars = (size_t)M * N * nwords;
+    size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
+    size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
+    char log[100000];
+    size_t logSize;
+    cl_long sTime = 0, fTime = 0;
+    cl_program program = NULL;
+
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    err = clGetDeviceIDs(platform, deviceType, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
+    if (err != CL_SUCCESS) {
+        goto releaseContext;
+    }
+
+    /* Prepare OpenCL kernel and its arguments */
+
+    program = clCreateProgramWithSource(ctx, 1, &ker, NULL, &err);
+    if (err != CL_SUCCESS) {
+        goto releaseQueue;
+    }
+
+    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
+    /*
+     * The log is printed whatever the build result is. Start with an empty
+     * string so that nothing undefined is printed if the query itself fails.
+     */
+    log[0] = '\0';
+    clGetProgramBuildInfo (program,
+            device,
+            CL_PROGRAM_BUILD_LOG,
+            sizeof(log),
+            log,
+            &logSize);
+    printf("%s", log);
+    if (err != CL_SUCCESS){
+        goto releaseProgram;
+    }
+
+    kernel = clCreateKernel(program, kernelName, &err);
+    if (err != CL_SUCCESS){
+        goto releaseProgram;
+    }
+    /* Memory allocation */
+
+    A.v = malloc((size_t)M * K * tsize);
+    B.v = malloc((size_t)K * N * tsize);
+    C.v = malloc((size_t)M * N * tsize);
+    C_naive.v = malloc((size_t)M * N * tsize);
+    if (A.v == NULL || B.v == NULL || C.v == NULL || C_naive.v == NULL) {
+        err = CL_OUT_OF_HOST_MEMORY;
+        goto freeHostMem;
+    }
+
+#if JUST_MULTIPLICATION
+    srand(0);
+    if (isDouble) {
+        for(i = 0; i < M * K * nwords; i++){
+            A.d[i] = i;
+        }
+        for(i = 0; i < N * K * nwords; i++){
+            B.d[i] = i + 7;
+        }
+        for(i = 0; i < M * N * nwords; i++){
+            C.d[i] = 0.0;
+            C_naive.d[i] = 0.0;
+        }
+    }
+    else {
+        for(i = 0; i < M * K * nwords; i++){
+            A.f[i] = i;
+        }
+        for(i = 0; i < N * K * nwords; i++){
+            B.f[i] = i + 7;
+        }
+        for(i = 0; i < M * N * nwords; i++){
+            C.f[i] = 0.0;
+            C_naive.f[i] = 0.0;
+        }
+    }
+
+#else
+    srand(0);
+    if (isDouble) {
+        for(i = 0; i < M * K * nwords; i++){
+            A.d[i] = (double)(rand() % RAND_BOUND);
+        }
+        for(i = 0; i < N * K * nwords; i++){
+            B.d[i] = (double)(rand() % RAND_BOUND);
+        }
+        for(i = 0; i < M * N * nwords; i++){
+            C.d[i] = 0.0;
+            C_naive.d[i] = 0.0;
+        }
+    }
+    else {
+        for(i = 0; i < M * K * nwords; i++){
+            A.f[i] = (float)(rand() % RAND_BOUND);
+        }
+        for(i = 0; i < N * K * nwords; i++){
+            B.f[i] = (float)(rand() % RAND_BOUND);
+        }
+        for(i = 0; i < M * N * nwords; i++){
+            C.f[i] = 0.0;
+            C_naive.f[i] = 0.0;
+        }
+    }
+#endif
+
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+            (size_t)K * M * tsize, A.v, &err);
+    if (err != CL_SUCCESS) {
+        goto freeHostMem;
+    }
+
+    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+            (size_t)K * N * tsize, B.v, &err);
+    if (err != CL_SUCCESS) {
+        goto releaseBufA;
+    }
+
+    bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+            (size_t)M * N * tsize, C.v, &err);
+    if (err != CL_SUCCESS) {
+        goto releaseBufB;
+    }
+
+    /* Argument setting and kernel execution */
+    err = clSetKernelArg(kernel, 0, tsize, alpha.u);
+    err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA);
+    err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB);
+    err |= clSetKernelArg(kernel, 3, sizeof(M), &M);
+    err |= clSetKernelArg(kernel, 4, sizeof(N), &N);
+    err |= clSetKernelArg(kernel, 5, sizeof(K), &K);
+    err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC);
+    err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum);
+    if (err != CL_SUCCESS) {
+        goto releaseBufC;
+    }
+
+    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
+        globalWorkSize, localWorkSize, 0,
+        NULL, &evt);
+    if (err != CL_SUCCESS) {
+        goto releaseBufC;
+    }
+
+    err = clFinish(queue);
+    if (err != CL_SUCCESS) {
+        goto releaseEvent;
+    }
+
+    err = clEnqueueReadBuffer (queue,
+        bufC,
+        CL_TRUE,
+        0,
+        (size_t)M * N * tsize,
+        C.v,
+        0,
+        NULL,
+        NULL);
+    if (err != CL_SUCCESS) {
+        goto releaseEvent;
+    }
+
+    /* Naive CPU multiplication */
+    if (isDouble) {
+        for (i = 0; i < M; i++) {
+            for (j = 0; j < N; j++) {
+                if (isComplex) {
+                    cl_double2 val;
+                    for (k = 0; k < K; k++) {
+                        cl_double2 bkj = flags & TILEMUL_TRB ?
+                                B.d2[j * K + k] : B.d2[k * N + j];
+                        cl_double2 aik = flags & TILEMUL_TRA ?
+                                A.d2[k * M + i] : A.d2[i * K + k];
+                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
+                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
+                        C_naive.d2[i * N + j].s[0] += val.s[0];
+                        C_naive.d2[i * N + j].s[1] += val.s[1];
+                    }
+                    val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] -
+                            C_naive.d2[i * N + j].s[1] * alpha.d2.s[1];
+                    val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] +
+                            C_naive.d2[i * N + j].s[1] * alpha.d2.s[0];
+                    C_naive.d2[i * N + j] = val;
+                }
+                else {
+                    for (k = 0; k < K; k++) {
+                        double bkj = flags & TILEMUL_TRB ?
+                                B.d[j * K + k] : B.d[k * N + j];
+                        double aik = flags & TILEMUL_TRA ?
+                                A.d[k * M + i] : A.d[i * K + k];
+                        C_naive.d[i * N + j] += aik * bkj;
+                    }
+                    C_naive.d[i * N + j] *= alpha.d;
+                }
+            }
+        }
+
+        /* compare every scalar, i.e. both halves of complex elements */
+        for (i = 0; i < nscalars; i++) {
+            if (C.d[i] != C_naive.d[i]) {
+                printf("Differ at (%lu, %lu): %lf != %lf\n",
+                        (unsigned long)(i / nwords / N),
+                        (unsigned long)(i / nwords % N),
+                        C.d[i], C_naive.d[i]);
+                break;
+            }
+        }
+        if (i == nscalars) {
+            printf("Match\n");
+        }
+    }
+    else {
+        for (i = 0; i < M; i++) {
+            for (j = 0; j < N; j++) {
+                if (isComplex) {
+                    cl_float2 val;
+                    for (k = 0; k < K; k++) {
+                        cl_float2 bkj = flags & TILEMUL_TRB ?
+                                B.f2[j * K + k] : B.f2[k * N + j];
+                        cl_float2 aik = flags & TILEMUL_TRA ?
+                                A.f2[k * M + i] : A.f2[i * K + k];
+                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
+                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
+                        C_naive.f2[i * N + j].s[0] += val.s[0];
+                        C_naive.f2[i * N + j].s[1] += val.s[1];
+                    }
+                    val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] -
+                            C_naive.f2[i * N + j].s[1] * alpha.f2.s[1];
+                    val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] +
+                            C_naive.f2[i * N + j].s[1] * alpha.f2.s[0];
+                    C_naive.f2[i * N + j] = val;
+                }
+                else {
+                    for (k = 0; k < K; k++) {
+                        float bkj = flags & TILEMUL_TRB ?
+                                B.f[j * K + k] : B.f[k * N + j];
+                        float aik = flags & TILEMUL_TRA ?
+                                A.f[k * M + i] : A.f[i * K + k];
+                        C_naive.f[i * N + j] += aik * bkj;
+                    }
+                    C_naive.f[i * N + j] *= alpha.f;
+                }
+            }
+        }
+
+        /* compare every scalar, i.e. both halves of complex elements */
+        for (i = 0; i < nscalars; i++) {
+            if (C.f[i] != C_naive.f[i]) {
+                printf("Differ at (%lu, %lu): %lf != %lf\n",
+                        (unsigned long)(i / nwords / N),
+                        (unsigned long)(i / nwords % N),
+                        C.f[i], C_naive.f[i]);
+                break;
+            }
+        }
+        if (i == nscalars) {
+            printf("Match\n");
+        }
+    }
+
+    /* End of naive CPU multiplication */
+    if (verbose) {
+        if (!isDouble) {
+            printf("Matrix A:\n");
+            for (i = 0; i < M; i++) {
+                for (k = 0; k < K; k++) {
+                    if (isComplex) {
+                        cl_float2 aik = flags & TILEMUL_TRA ?
+                                A.f2[k * M + i] : A.f2[i * K + k];
+                        printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]);
+                    }
+                    else {
+                        float aik = flags & TILEMUL_TRA ?
+                                A.f[k * M + i] : A.f[i * K + k];
+                        printf("%4.1f ", aik);
+                    }
+                }
+                printf("\n");
+            }
+
+            printf("Matrix B:\n");
+            for (k = 0; k < K; k++) {
+                for (j = 0; j < N; j++) {
+                    if (isComplex) {
+                        cl_float2 bkj = flags & TILEMUL_TRB ?
+                                B.f2[j * K + k] : B.f2[k * N + j];
+                        printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]);
+                    }
+                    else {
+                        float bkj = flags & TILEMUL_TRB ?
+                                B.f[j * K + k] : B.f[k * N + j];
+                        printf("%4.1f ", bkj);
+                    }
+                }
+                printf("\n");
+            }
+
+            printf("CPU calculated matrix:\n");
+            for (i = 0; i < M; i++) {
+                for (j = 0; j < N; j++) {
+                    if (isComplex) {
+                        printf("(%4.1f, %4.1f) ",
+                                C_naive.f2[i * N + j].s[0],
+                                C_naive.f2[i * N + j].s[1]);
+                    }
+                    else {
+                        printf("%4.1f ", C_naive.f[i * N + j]);
+                    }
+                }
+                printf("\n");
+            }
+
+            printf("GPU calculated matrix:\n");
+            for (i = 0; i < M; i++) {
+                for (j = 0; j < N; j++) {
+                    if (isComplex) {
+                        printf("(%4.1f, %4.1f) ",
+                                C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]);
+                    }
+                    else {
+                        printf("%4.1f ", C.f[i * N + j]);
+                    }
+                }
+                printf("\n");
+            }
+        }
+    }
+
+    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
+            &sTime, NULL);
+    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
+            &fTime, NULL);
+
+    /* iterNum comes from the command line: do not divide by zero */
+    printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n",
+            (int)((fTime-sTime)/1000000),
+            (int)((iterNum != 0) ? (fTime-sTime)/iterNum : 0));
+
+    /* Resources are released in the order opposite to their acquisition */
+releaseEvent:
+    clReleaseEvent(evt);
+releaseBufC:
+    clReleaseMemObject(bufC);
+releaseBufB:
+    clReleaseMemObject(bufB);
+releaseBufA:
+    clReleaseMemObject(bufA);
+freeHostMem:
+    /*
+     * The buffers are created with CL_MEM_USE_HOST_PTR over these
+     * allocations, so the host memory is freed only after the buffers
+     */
+    free(C_naive.v);
+    free(C.v);
+    free(B.v);
+    free(A.v);
+    clReleaseKernel(kernel);
+releaseProgram:
+    clReleaseProgram(program);
+releaseQueue:
+    clReleaseCommandQueue(queue);
+releaseContext:
+    clReleaseContext(ctx);
+
+    return err;
+}
+
+/*
+ * Entry point of the tile multiplication test tool: parses the command line
+ * options and the three block sizes (M, N, K), generates the test kernel
+ * with genTest() and executes it with run().
+ *
+ * Returns EXIT_SUCCESS if the kernel has been generated and run without
+ * OpenCL errors, EXIT_FAILURE otherwise.
+ */
+int main(int argc, char *argv[])
+{
+    /* static: 1 MiB is too much to put on the stack safely */
+    static char out[1024*1024];
+    CLBLASKernExtra kextra;
+    BlasGenSettings gset;
+    TileMulOpts mulOpts;
+    int i;
+    cl_uint blockM = 4, blockN = 4, blockK = 8;
+    struct KgenContext *ctx = createKgenContext(out, sizeof(out), 1);
+    FType alpha;
+    cl_int err;
+    unsigned int iterNum = 1;
+    const char* const shortOptions = "hd:f:l:t:a:b:s:g:i:c:ov";
+    const struct option longOptions[] = {
+            {"help", no_argument, NULL, 'h'},
+            {"device", required_argument, NULL, 'd'},
+            {"fetch", required_argument, NULL, 'f'},
+            {"local", required_argument, NULL, 'l'},
+            {"type", required_argument, NULL, 't'},
+            {"a", required_argument, NULL, 'a'},
+            {"b", required_argument, NULL, 'b'},
+            {"skew", required_argument, NULL, 's'},
+            {"globalcycling", required_argument, NULL, 'g'},
+            {"iter", required_argument, NULL, 'i'},
+            {"core", required_argument, NULL, 'c'},
+            {"old", no_argument, NULL, 'o'},
+            {"verbose", no_argument, NULL, 'v'},
+            {NULL, 0, NULL, 0}
+    };
+    int nextOption;
+    cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+    bool verbose = false;
+    SubproblemDim *subdims = gset.subdims;
+    bool separateFetch = false;
+
+    /* presumably NULL is returned on failure - TODO confirm */
+    if (ctx == NULL) {
+        printf("Error: cannot create the kernel generator context\n");
+        return EXIT_FAILURE;
+    }
+
+    memset(&gset, 0, sizeof(gset));
+    memset(&mulOpts, 0, sizeof(mulOpts));
+    memset(&kextra, 0, sizeof(kextra));
+    gset.kextra = &kextra;
+    gset.flags |= BGF_WHOLE_A;
+    mulOpts.core = TILEMUL_MAD;
+    mulOpts.flags = TILEMUL_FORCE_VECTORIZATION;
+    kextra.vecLen = 1;
+    kextra.dtype = TYPE_FLOAT;
+
+    alpha.f = 1;
+
+    // parse command line
+    do {
+        nextOption = getopt_long(argc, argv, shortOptions, longOptions, NULL);
+        switch (nextOption) {
+        case 'h':
+            printUsage(argv[0], EXIT_SUCCESS);
+            break;
+        case 'd':
+            if (!strcmp("cpu", optarg)) {
+                deviceType = CL_DEVICE_TYPE_CPU;
+            }
+            else if (!strcmp("gpu", optarg)) {
+                deviceType = CL_DEVICE_TYPE_GPU;
+            }
+            else {
+                printf("Unknown device type %s. Supported values are \"cpu\" "
+                        "and \"gpu\".\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'f':
+            kextra.vecLen = atoi(optarg);
+            break;
+        case 'l':
+            if (!strcmp(optarg, "A")) {
+                mulOpts.memA = CLMEM_LOCAL_MEMORY;
+            }
+            else if (!strcmp(optarg, "B")) {
+                mulOpts.memB = CLMEM_LOCAL_MEMORY;
+            }
+            else {
+                printf("Wrong matrix specified: %s. Supported values are "
+                        "A, B.\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 't':
+            if (!strcmp(optarg, "s")) {
+                kextra.dtype = TYPE_FLOAT;
+                alpha.f = 1;
+            }
+            else if (!strcmp(optarg, "d")) {
+                kextra.dtype = TYPE_DOUBLE;
+                alpha.d = 1;
+            }
+            else if (!strcmp(optarg, "c")) {
+                kextra.dtype = TYPE_COMPLEX_FLOAT;
+                alpha.f2.s[0] = 1;
+                alpha.f2.s[1] = 0;
+            }
+            else if (!strcmp(optarg, "z")) {
+                kextra.dtype = TYPE_COMPLEX_DOUBLE;
+                alpha.d2.s[0] = 1;
+                alpha.d2.s[1] = 0;
+            }
+            else {
+                printf("Wrong type specified: %s. Supported values are "
+                        "s, d, c, z.\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'a':
+            if (!strcmp(optarg, "r")) {
+                mulOpts.flags &= ~TILEMUL_TRA;
+            }
+            else if (!strcmp(optarg, "c")) {
+                mulOpts.flags |= TILEMUL_TRA;
+            }
+            else {
+                printf("Wrong tile a parameter specified: %s. Supported values "
+                        "are \"r\", \"c\".\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'b':
+            if (!strcmp(optarg, "r")) {
+                mulOpts.flags &= ~TILEMUL_TRB;
+            }
+            else if (!strcmp(optarg, "c")) {
+                mulOpts.flags |= TILEMUL_TRB;
+            }
+            else {
+                printf("Wrong tile b order specified: %s. Supported values "
+                        "are \"r\", \"c\".\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 's':
+            if (!strcmp(optarg, "a")) {
+                mulOpts.flags |= TILEMUL_SKEW_A;
+            }
+            else if (!strcmp(optarg, "b")) {
+                mulOpts.flags |= TILEMUL_SKEW_B;
+            }
+            else if (!strcmp(optarg, "k")) {
+                mulOpts.flags |= TILEMUL_SKEW_K;
+            }
+            else {
+                printf("Wrong skew parameter specified: %s. Supported values "
+                        "are \"a\", \"b\", \"k\"\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'g':
+            if (!strcmp(optarg, "a")) {
+                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
+            }
+            else if (!strcmp(optarg, "b")) {
+                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B;
+            }
+            else if (!strcmp(optarg, "k")) {
+                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
+            }
+            else {
+                printf("Wrong global cycling parameter specified: %s. "
+                        "Supported values are \"a\", \"b\", \"k\"\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'i':
+            /* run() divides the measured time by the iteration count */
+            i = atoi(optarg);
+            if (i <= 0) {
+                printf("Wrong iteration number specified: %s. It must be "
+                        "a positive integer.\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            iterNum = (unsigned int)i;
+            break;
+        case 'c':
+            if (!strcmp("muladd", optarg)) {
+                mulOpts.core = TILEMUL_MULADD;
+            }
+            else if (!strcmp("mad", optarg)) {
+                mulOpts.core = TILEMUL_MAD;
+            }
+            else if (!strcmp("dot", optarg)) {
+                mulOpts.core = TILEMUL_DOT;
+            }
+            else {
+                printf("Unknown multiplier core %s. Supported values"
+                        " are \"muladd\", \"mad\" and \"dot\".\n", optarg);
+                exit(EXIT_FAILURE);
+            }
+            break;
+        case 'o':
+            /*
+             * NOTE(review): separateFetch is initialized to false and is
+             * never set to true, so this option currently changes nothing -
+             * confirm the intended default.
+             */
+            separateFetch = false;
+            break;
+        case 'v':
+            verbose = true;
+            break;
+        case -1:
+            break;
+        default:
+            printUsage(argv[0], EXIT_FAILURE);
+            break;
+        }
+    } while (nextOption != -1);
+
+    if (optind + 2 >= argc) {
+        printf("Error: Not all sizes are specified\n");
+        printUsage(argv[0], EXIT_FAILURE);
+        /* never read argv past its end, even if printUsage() returns */
+        return EXIT_FAILURE;
+    }
+
+    /* zero or negative sizes would produce an empty or a huge problem */
+    for (i = 0; i < 3; i++) {
+        if (atoi(argv[optind + i]) <= 0) {
+            printf("Error: wrong size specified: %s. Sizes must be positive "
+                    "integers\n", argv[optind + i]);
+            return EXIT_FAILURE;
+        }
+    }
+    blockM = atoi(argv[optind]);
+    blockN = atoi(argv[optind + 1]);
+    blockK = atoi(argv[optind + 2]);
+
+    if ((mulOpts.memA == CLMEM_LOCAL_MEMORY ||
+            mulOpts.memB == CLMEM_LOCAL_MEMORY) &&
+            ((mulOpts.flags & TILEMUL_GLOBAL_CYCLIC) != 0)) {
+        printf("One of matrixes is in local memory, "
+                "disabling global cycling\n");
+        mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC;
+    }
+
+    if (mulOpts.flags & TILEMUL_TRA) {
+        kextra.flags |= KEXTRA_TRANS_A;
+    }
+    if (mulOpts.flags & TILEMUL_TRB) {
+        kextra.flags |= KEXTRA_TRANS_B;
+    }
+
+    subdims[0].y = blockM * ITEM_WORK_M;
+    subdims[0].x = blockN * ITEM_WORK_N;
+    subdims[0].bwidth = blockK * ITEM_BLOCKS_K;
+    subdims[1].y = blockM;
+    subdims[1].x = blockN;
+    subdims[1].bwidth = blockK;
+
+    memset(out, 0, sizeof(out));
+
+    i = isDoubleBasedType(kextra.dtype);
+    kgenDeclareUptrs(ctx, i);
+    genTest(ctx, &gset, &mulOpts, separateFetch);
+    destroyKgenContext(ctx);
+
+    printf("Kernel code: \n\"%s\"\n", out);
+    err = run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha,
+              &gset, mulOpts.flags, deviceType, verbose, iterNum);
+    if (err != CL_SUCCESS) {
+        printf("Test run failed, error %d\n", err);
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/src/library/blas/gens/tile.c b/src/library/blas/gens/tile.c
new file mode 100644
index 0000000..2ffdebc
--- /dev/null
+++ b/src/library/blas/gens/tile.c
@@ -0,0 +1,517 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <defbool.h>
+#include <clblas_stddef.h>
+
+#include "blas_kgen.h"
+
+// use 'name' as the tile's base name unless a name has been set already
+static __inline void
+selectTileBaseName(Tile *tile, const char *name)
+{
+    if (tile->baseName != NULL) {
+        return;
+    }
+    tile->baseName = name;
+}
+
+/*
+ * Select the vector length for a tile whose sizes, 'trans' and 'packed'
+ * fields are already set. With TILE_WITH_FETCH_VECLEN it is the value
+ * reported by getVecLen(). Otherwise it is derived from the line width
+ * rounded up to a power of 2 (multiplied by the number of lines for packed
+ * tiles) and never exceeds MAX_TILE_VECLEN.
+ */
+static void
+selectDefaultTileVecLen(
+    Tile *tile,
+    TileCreationFlags tflags,
+    const BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    MatrixRole mrole)
+{
+    size_t width, vlen;
+
+    if (tflags & TILE_WITH_FETCH_VECLEN) {
+        tile->vecLen = getVecLen(gset, funcID, mrole);
+        return;
+    }
+
+    width = (tile->trans) ? tile->nrRows : tile->nrCols;
+    if (!tile->packed) {
+        vlen = szmin((unsigned int)roundUpPow2(width), MAX_TILE_VECLEN);
+    }
+    else {
+        size_t nrLines = (tile->trans) ? tile->nrCols : tile->nrRows;
+        size_t paddedWidth = roundUpPow2(width);
+
+        vlen = szmin(nrLines * paddedWidth, MAX_TILE_VECLEN);
+    }
+    tile->vecLen = (unsigned int)vlen;
+}
+
+// physical tile pitch, can be less than one vector in case of packed mode
+static unsigned int
+tilePitch(const Tile *tile)
+{
+    // line width: the number of rows if the tile is transposed, columns if not
+    unsigned int width = (tile->trans) ? tile->nrRows : tile->nrCols;
+
+    if (tile->packed) {
+        return (unsigned int)roundUpPow2(width);
+    }
+
+    return (unsigned int)roundUp(width, tile->vecLen);
+}
+
+/*
+ * Fill all the fields of 'tile' from the arguments. The vector length is
+ * limited to MAX_TILE_VECLEN; the base name is stored by pointer, not copied,
+ * and in debug builds is checked against MAX_TILE_BASE_NAMELEN.
+ */
+void
+initTile(
+    Tile *tile,
+    const char *baseName,
+    unsigned int nrRows,
+    unsigned int nrCols,
+    unsigned int vecLen,
+    DataType dtype,
+    PrivateStorageType storType,
+    bool trans,
+    bool packed)
+{
+    assert(!baseName || strlen(baseName) <= MAX_TILE_BASE_NAMELEN);
+
+    // naming and element type
+    tile->baseName = baseName;
+    tile->dtype = dtype;
+    tile->storType = storType;
+
+    // geometry
+    tile->nrRows = nrRows;
+    tile->nrCols = nrCols;
+    tile->vecLen = umin(MAX_TILE_VECLEN, vecLen);
+
+    // layout
+    tile->trans = trans;
+    tile->packed = packed;
+}
+
+/*
+ * Initialize the A, B/X and C/Y tiles of 'gset' from the second level
+ * subproblem dimensions (gset->subdims[1]) and the kernel extra flags.
+ * A base name set beforehand is kept; otherwise "a", "b"/"x" and "c"/"y" are
+ * used, the second variants being for level 2 functions. The vector length
+ * of each tile is chosen with selectDefaultTileVecLen().
+ */
+void
+initDefaultTiles(
+    BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    TileCreationFlags flags,
+    PrivateStorageType storType)
+{
+    const SubproblemDim *sdim = &gset->subdims[1];
+    KernelExtraFlags kflags = gset->kextra->flags;
+    DataType dtype = gset->kextra->dtype;
+    bool isLevel2 = (funcBlasLevel(funcID) == 2);
+    bool packed = ((flags & TILE_PACKED) != 0);
+    unsigned int sizeY = (unsigned int)sdim->y;
+    unsigned int sizeX = (unsigned int)sdim->x;
+    unsigned int sizeK = (unsigned int)sdim->bwidth;
+    Tile *tile;
+
+    /* Tile A */
+    tile = &gset->tileA;
+    selectTileBaseName(tile, "a");
+    initTile(tile, tile->baseName, sizeY, sizeK, 1, dtype, storType,
+             false, packed);
+    tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_A);
+    if (!(gset->flags & BGF_WHOLE_A)) {
+        // without BGF_WHOLE_A the tile is collapsed to a single line
+        if (tile->trans) {
+            tile->nrCols = 1;
+        }
+        else {
+            tile->nrRows = 1;
+        }
+    }
+    selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_A);
+
+    /* Tile B or X */
+    tile = &gset->tileBX;
+    selectTileBaseName(tile, (isLevel2) ? "x" : "b");
+    initTile(tile, tile->baseName, sizeK, sizeX, 1, dtype, storType,
+             false, packed);
+    /*
+     * NOTE: Tiles for the level 2 functions are forced to be transposed
+     *       in order to allow user to fetch elements belonging to different
+     *       rows which is very useful in case of unit increment between
+     *       elements because provides faster access to the global memory.
+     */
+    tile->trans = (isLevel2 ||
+                   !isMatrixAccessColMaj(funcID, kflags, MATRIX_B));
+    selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_B);
+
+    /* Tile C or Y */
+    tile = &gset->tileCY;
+    selectTileBaseName(tile, (isLevel2) ? "y" : "c");
+    initTile(tile, tile->baseName, sizeY, sizeX, 1, dtype, storType,
+             false, packed);
+    if (isLevel2) {
+        tile->trans = true;
+    }
+    else if (!(flags & TILE_C_FORCE_NOTRANS)) {
+        tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_C);
+    }
+    selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_C);
+
+    // FIXME: remove the restriction
+    /*if (isComplexType(tile->dtype)) {
+        tile->vecLen = 1;
+    }*/
+}
+
+// number of vectors of tile->vecLen elements needed to store the whole tile
+unsigned int
+tileVectorsNum(const Tile *tile)
+{
+    size_t nrLines = (tile->trans) ? tile->nrCols : tile->nrRows;
+    size_t pitch = tilePitch(tile);
+
+    return (unsigned int)divRoundUp(nrLines * pitch, tile->vecLen);
+}
+
+// tile storage size in elements: a whole number of vectors
+unsigned int
+tileStorageSize(const Tile *tile)
+{
+    return tileVectorsNum(tile) * tile->vecLen;
+}
+
+/*
+ * Length of a line piece that can be accessed at once: the least of
+ * the pitch, the vector length and the line width
+ */
+unsigned int
+tileLineSegmentLen(const Tile *tile)
+{
+    unsigned int width = (tile->trans) ? tile->nrRows : tile->nrCols;
+    unsigned int pitch = tilePitch(tile);
+    unsigned int seg;
+
+    seg = umin(pitch, tile->vecLen);
+
+    return umin(seg, width);
+}
+
+/*
+ * Add the declaration of the private storage of 'tile' to the generated
+ * code: either one array of vectors or a list of separate vector variables
+ * named <baseName>0, <baseName>1, ..., depending on the storage type.
+ *
+ * Returns 0 on success, -EOVERFLOW if the declaration does not fit into the
+ * local buffer or the statement cannot be added to the context.
+ */
+int
+declareOneTileStorage(struct KgenContext *ctx, const Tile *tile)
+{
+    char tmp[1024];
+    const char *tname;
+    int r;
+    size_t size;
+    size_t nameLen;
+
+    getVectorTypeName(tile->dtype, tile->vecLen, &tname, NULL);
+    size = tileVectorsNum(tile);
+    nameLen = strlen(tile->baseName);
+
+    // the type, the name, a 20 digit number and punctuation must fit
+    if (strlen(tname) + nameLen + 32 > sizeof(tmp)) {
+        return -EOVERFLOW;
+    }
+
+    if (tile->storType == PRIV_STORAGE_ARRAY) {
+        sprintf(tmp, "%s %s[%lu];\n", tname, tile->baseName,
+                (unsigned long)size);
+    }
+    else {
+        const char *end = tmp + sizeof(tmp);
+        size_t i;
+        char *p;
+
+        p = tmp + sprintf(tmp, "%s %s0", tname, tile->baseName);
+        for (i = 1; i < size; i++) {
+            /*
+             * ", " + name + up to 20 digits for the index, leaving room
+             * for the closing ";\n" and the terminating null character
+             */
+            if ((size_t)(end - p) < nameLen + 2 + 20 + 3) {
+                return -EOVERFLOW;
+            }
+            p += sprintf(p, ", %s%lu", tile->baseName, (unsigned long)i);
+        }
+        strcpy(p, ";\n");
+    }
+
+    r = kgenAddStmt(ctx, tmp);
+
+    return (r) ? -EOVERFLOW : 0;
+}
+
+/*
+ * Declare the storages of the A, B/X and C/Y tiles of 'gset', in this order.
+ * Stops at the first failure and returns its error code, 0 on success.
+ */
+int
+declareTileStorages(struct KgenContext *ctx, const BlasGenSettings *gset)
+{
+    int ret;
+
+    ret = declareOneTileStorage(ctx, &gset->tileA);
+    if (!ret) {
+        ret = declareOneTileStorage(ctx, &gset->tileBX);
+    }
+    if (!ret) {
+        // the result must be kept, otherwise a tile C failure is lost
+        ret = declareOneTileStorage(ctx, &gset->tileCY);
+    }
+
+    return ret;
+}
+
+/*
+ * Put into 'str' the expression addressing 'len' consecutive elements of
+ * the tile starting at (row, col): the name or the array item of the vector
+ * holding them followed by the component selector produced by
+ * sprintfVecChunk(). Zero 'len' means the tile vector length. The elements
+ * run along a column for transposed tiles and along a row otherwise.
+ */
+void
+sprintfTileElement(
+    Kstring *str,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col,
+    unsigned int len)
+{
+    // complex elements take two vector components
+    unsigned int nrComps = isComplexType(tile->dtype) ? 2 : 1;
+    unsigned int pitch = tilePitch(tile);
+    unsigned int compsPerVec = tile->vecLen * nrComps;
+    unsigned int off;
+    char vchunk[24];
+
+    if (len == 0) {
+        len = tile->vecLen;
+    }
+
+    if (tile->trans) {
+        assert((row + len <= tile->nrRows) && (col < tile->nrCols));
+        off = (col * pitch + row) * nrComps;
+    }
+    else {
+        assert((row < tile->nrRows) && (col + len <= tile->nrCols));
+        off = (row * pitch + col) * nrComps;
+    }
+
+    sprintfVecChunk(vchunk, compsPerVec, len * nrComps, off % compsPerVec);
+
+    if (tile->storType != PRIV_STORAGE_ARRAY) {
+        sprintf(str->buf, "%s%u%s", tile->baseName, off / compsPerVec,
+                vchunk);
+    }
+    else {
+        sprintf(str->buf, "%s[%u]%s", tile->baseName, off / compsPerVec,
+                vchunk);
+    }
+}
+
+/*
+ * Put into 'str' the expression for one half of a complex tile element.
+ * The full element is printed first, then one of the two trailing
+ * characters of the resulting string is dropped.
+ */
+void
+sprintfTileElementHalf(
+    Kstring *str,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col,
+    TileElementHalf half)
+{
+    int last;
+
+    assert(isComplexType(tile->dtype));
+
+    // print the full element and then drop the half that is not needed
+    sprintfTileElement(str, tile, row, col, 1);
+    last = (int)strlen(str->buf) - 1;
+    if (half == TE_HALF_HIGH) {
+        // keep the last character in place of the one before it
+        str->buf[last - 1] = str->buf[last];
+    }
+    str->buf[last] = '\0';
+}
+
+/*
+ * For 'num' tiles ('first' plus num - 1 variadic Tile pointers) print into
+ * kstr[0..num-1] the element of each tile at the vector position (row, col).
+ * The position is measured in vectors of the least vector length among all
+ * the tiles along the line direction of the first tile. Tiles without a base
+ * name are skipped; 'kstr' may be NULL if only the return value is needed.
+ *
+ * Returns 0 if the position is outside of the first tile, otherwise the
+ * number of such vectors in a line of the first tile.
+ */
+int
+forEachTile(Kstring *kstr, unsigned int row, unsigned int col,
+            unsigned int num, Tile *first, ...)
+{
+   unsigned int step = first->vecLen;
+   unsigned int rowLimit = first->nrRows;
+   unsigned int colLimit = first->nrCols;
+   unsigned int idx;
+   va_list ap;
+
+   // the shortest vector among all the tiles defines the step
+   va_start(ap, first);
+   for (idx = 1; idx < num; idx++) {
+       const Tile *t = va_arg(ap, Tile *);
+
+       step = umin(step, t->vecLen);
+   }
+   va_end(ap);
+
+   if (!first->trans) {
+       colLimit /= step;
+   }
+   else {
+       rowLimit /= step;
+   }
+
+   if (row >= rowLimit || col >= colLimit) {
+       return 0;
+   }
+
+   if (kstr != NULL) {
+       va_start(ap, first);
+       for (idx = 0; idx < num; idx++) {
+           Tile *t = (idx == 0) ? first : va_arg(ap, Tile *);
+
+           if (t->baseName == NULL) {
+               continue;
+           }
+           if (t->trans) {
+               sprintfTileElement(&kstr[idx], t, row * step, col, step);
+           }
+           else {
+               sprintfTileElement(&kstr[idx], t, row, col * step, step);
+           }
+       }
+       va_end(ap);
+   }
+
+   return first->trans ? rowLimit : colLimit;
+}
+
+// generate zeroing of 'len' tile elements starting at (row, col)
+void
+genSetZeroInTile(
+    struct KgenContext *ctx,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col,
+    unsigned int len)
+{
+    Kstring target;
+    char stmt[1024];
+
+    sprintfTileElement(&target, tile, row, col, len);
+    sprintf(stmt, "%s = 0;\n", target.buf);
+
+    kgenAddStmt(ctx, stmt);
+}
+
+// generate assignment of the tile type's unit value to the (row, col) element
+void
+genSetUnitInTile(
+    struct KgenContext *ctx,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col)
+{
+    Kstring target;
+    char stmt[1024];
+
+    sprintfTileElement(&target, tile, row, col, 1);
+    sprintf(stmt, "%s = %s;\n", target.buf, strOne(tile->dtype));
+
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Generate zeroing of the whole tile, one line segment of
+ * tileLineSegmentLen() elements per statement, followed by a blank line
+ */
+void
+genZeroTile(struct KgenContext *ctx, const Tile *tile)
+{
+    unsigned int segLen = tileLineSegmentLen(tile);
+    // segments run along columns for transposed tiles, along rows otherwise
+    unsigned int rowStep = (tile->trans) ? segLen : 1;
+    unsigned int colStep = (tile->trans) ? 1 : segLen;
+    unsigned int row, col;
+    char stmt[1024];
+    Kstring target;
+
+    for (row = 0; row < tile->nrRows; row += rowStep) {
+        for (col = 0; col < tile->nrCols; col += colStep) {
+            sprintfTileElement(&target, tile, row, col, segLen);
+            sprintf(stmt, "%s = 0;\n", target.buf);
+            kgenAddStmt(ctx, stmt);
+        }
+    }
+
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Generate element-wise "dst <op> src" statements over the area common to
+ * both the tiles, followed by a blank line. If the tiles have the same
+ * transposition, the elements are processed by vectors of the least of the
+ * two vector lengths, otherwise one by one.
+ *
+ * Nothing is generated for an unknown 'op'.
+ */
+void
+genTileCopy(
+    struct KgenContext *ctx,
+    const Tile *dst,
+    const Tile *src,
+    TileCopyOps op)
+{
+    char tmp[1024];
+    Kstring el1, el2;
+    const char *opStr;
+    unsigned int nrRows, nrCols;
+    unsigned int incRows, incCols;
+    unsigned int vlen;
+    unsigned int i, j;
+
+    /*
+     * The operator does not depend on the element, so select it just once.
+     * An unknown operation must not reach the statement generation since
+     * there would be nothing valid to emit.
+     */
+    switch (op) {
+    case TILECOPY_ASSIGN:
+        opStr = "=";
+        break;
+    case TILECOPY_ADD_ASSIGN:
+        opStr = "+=";
+        break;
+    case TILECOPY_SUB_ASSIGN:
+        opStr = "-=";
+        break;
+    case TILECOPY_MUL_ASSIGN:
+        opStr = "*=";
+        break;
+    case TILECOPY_DIV_ASSIGN:
+        opStr = "/=";
+        break;
+    case TILECOPY_MOD_ASSIGN:
+        opStr = "%=";
+        break;
+    default:
+        return;
+    }
+
+    nrRows = umin(dst->nrRows, src->nrRows);
+    nrCols = umin(dst->nrCols, src->nrCols);
+    if (dst->trans != src->trans) {
+        vlen = 1;
+        incRows = incCols = 1;
+    }
+    else {
+        vlen = umin(dst->vecLen, src->vecLen);
+        if (!dst->trans) {
+            incRows = 1;
+            incCols = umin(dst->nrCols, src->nrCols);
+            incCols = umin(incCols, vlen);
+        }
+        else {
+            incRows = umin(dst->nrRows, src->nrRows);
+            incRows = umin(incRows, vlen);
+            incCols = 1;
+        }
+    }
+
+    for (i = 0; i < nrRows; i += incRows) {
+        for (j = 0; j < nrCols; j += incCols) {
+            sprintfTileElement(&el1, dst, i, j, vlen);
+            sprintfTileElement(&el2, src, i, j, vlen);
+            sprintf(tmp, "%s %s %s;\n", el1.buf, opStr, el2.buf);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+
+    kgenAddBlankLine(ctx);
+}
diff --git a/src/library/blas/gens/tile.h b/src/library/blas/gens/tile.h
new file mode 100644
index 0000000..1539934
--- /dev/null
+++ b/src/library/blas/gens/tile.h
@@ -0,0 +1,424 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * API to manipulate matrix tiles
+ */
+
+#ifndef TILE_H_
+#define TILE_H_
+
+#include <kerngen.h>
+#include <blas_funcs.h>
+
+#define tileLineElemNum forEachTile
+
+struct BlasGenSettings;
+
+enum {
+    MAX_TILE_BASE_NAMELEN = sizeof(Kstring) - 25,
+    /*
+     * There may be 16 vector components at maximum. Adding the lengths of the
+     * subscript and selector operators, a 2-digit index and the end-line
+     * symbol to the maximum base name length, we get the maximum tile element
+     * string length
+     */
+    MAX_TILE_ELEMENT_STRLEN = sizeof(Kstring) - 1,
+    MAX_TILE_VECLEN = 8     /* initTile() documents truncating vecLen to this */
+};
+
+/**
+ * @internal
+ * @brief Flags describing tile storage specifics
+ * @ingroup TILES
+ */
+typedef enum TileCreationFlags {
+    /** Tile C should be forced to non-transposed form */
+    TILE_C_FORCE_NOTRANS = 0x01,
+    /** Tile vector length is equal to the length of fetched vectors */
+    TILE_WITH_FETCH_VECLEN = 0x02,
+    /**
+     * If, depending on the transposition, the vector length is greater than
+     * the number of rows or columns, store several rows or columns
+     * respectively in each vector
+     */
+    TILE_PACKED = 0x04
+} TileCreationFlags;
+
+/**
+ * @internal
+ * @brief Type of tile storage in the private memory
+ * @ingroup TILES
+ */
+typedef enum PrivateStorageType {
+    /** Tile is stored in an array named after the tile's base name */
+    PRIV_STORAGE_ARRAY,
+    /** Tile is stored in a set of variables: base name plus an index */
+    PRIV_STORAGE_VARIABLE_SET
+} PrivateStorageType;
+
+typedef enum TileCopyOps {
+    TILECOPY_ASSIGN,        /* genTileCopy() generates "dst = src"  */
+    TILECOPY_ADD_ASSIGN,    /* genTileCopy() generates "dst += src" */
+    TILECOPY_SUB_ASSIGN,    /* genTileCopy() generates "dst -= src" */
+    TILECOPY_MUL_ASSIGN,    /* genTileCopy() generates "dst *= src" */
+    TILECOPY_DIV_ASSIGN,    /* genTileCopy() generates "dst /= src" */
+    TILECOPY_MOD_ASSIGN     /* genTileCopy() generates "dst %= src" */
+} TileCopyOps;
+
+/**
+ * @internal
+ * @brief Halves of a complex tile element, see sprintfTileElementHalf()
+ * @ingroup TILES
+ */
+typedef enum TileElementHalf {
+    TE_HALF_LOW,    /* presumably the real part -- TODO confirm */
+    TE_HALF_HIGH    /* presumably the imaginary part -- TODO confirm */
+} TileElementHalf;
+
+/**
+ * @internal
+ * @brief Matrix tile stored in a private area
+ * @ingroup TILES
+ */
+typedef struct Tile {
+    const char *baseName;           /* base name of the tile variable(s) */
+    unsigned int nrRows;            /* number of rows in the tile */
+    unsigned int nrCols;            /* number of columns in the tile */
+    unsigned int vecLen;            /* length of one native OpenCL element */
+    DataType dtype;                 /* data type of the tile */
+    PrivateStorageType storType;    /* array or set of variables */
+    /** Flag of storing tile in the transposed form */
+    bool trans;
+    /*
+     * Depending on the transposition, several rows or columns can fit
+     * into a single vector. It makes sense only when the number of rows or
+     * columns respectively is less than the vector length
+     */
+    bool packed;
+} Tile;
+
+/**
+ * @internal
+ * @brief Initialize tile
+ *
+ * @param[out] tile      Tile description structure to fill
+ * @param[in] baseName   Tile base name
+ * @param[in] nrRows     Number of rows in the tile
+ * @param[in] nrCols     Number of columns in the tile
+ * @param[in] vecLen     Length of one native OpenCL element being a part of
+ *                       the tile
+ * @param[in] dtype      Data type
+ * @param[in] storType   Tile storage type
+ * @param[in] trans      Shows if the tile is stored in the transposed form
+ *                       or in the direct one
+ * @param[in] packed     Tile is stored in packed form. Has no effect if
+ *                       a single line fits into a single vector.
+ *
+ * If the \b vecLen param is above MAX_TILE_VECLEN, it is truncated to
+ * MAX_TILE_VECLEN.
+ *
+ * @ingroup TILES
+ */
+void
+initTile(
+    Tile *tile,
+    const char *baseName,
+    unsigned int nrRows,
+    unsigned int nrCols,
+    unsigned int vecLen,
+    DataType dtype,
+    PrivateStorageType storType,
+    bool trans,
+    bool packed);
+
+/**
+ * @internal
+ * @brief Initialize matrix tile from generator settings
+ *
+ * @param[out] gset      Generator settings in which tiles are initialized
+ * @param[in] funcID     BLAS function ID
+ * @param[in] flags      Tile creation flags
+ * @param[in] storType   Storage type
+ *
+ * If \b baseName field of a tile structure in the generator settings is zero,
+ * it is initialized with the default value: "a" for the matrix A, "b" for
+ * the matrix B, "x" for the vector X, "c" for the matrix C, and "y" for the
+ * vector Y.
+ *
+ * As X and Y are column-vectors from the math point of view, tiles for them
+ * are always packed irrespective of whether the
+ * TileCreationFlags::TILE_PACKED flag is specified or not.
+ *
+ *
+ * Transposition of the C tile matches transposition of the C matrix by
+ * default, unless the TILE_C_FORCE_NOTRANS flag is set. If the flag is set,
+ * the tile is forced to be non-transposed and veclen must be verified.
+ *
+ */
+void
+initDefaultTiles(
+    struct BlasGenSettings *gset,
+    BlasFunctionID funcID,
+    TileCreationFlags flags,
+    PrivateStorageType storType);
+
+/**
+ * @internal
+ * @brief Get entire number of vectors in the tile
+ *
+ * @param[in] tile          Tile to get number of vectors of
+ */
+unsigned int
+tileVectorsNum(const Tile *tile);
+
+/**
+ * @internal
+ * @brief Size of entire tile storage in elements
+ *
+ * @param[in] tile          Tile to get size of
+ */
+unsigned int
+tileStorageSize(const Tile *tile);
+
+/**
+ * @brief Get length of tile line segment
+ *
+ * @param[in] tile       Source tile
+ *
+ * A segment is a part of a line that crosses neither a vector boundary nor
+ * a row/column boundary, depending on whether the tile is transposed or not.
+ * In other words, this is the piece of data providing the maximum possible
+ * vectorization without breaking correctness.
+ */
+unsigned int
+tileLineSegmentLen(const Tile *tile);
+
+/**
+ * @internal
+ * @brief Declare variables needed to store a tile
+ *
+ * @param[out] ctx        Generator context
+ * @param[in] gset        Generator settings containing descriptors of
+ *                        tiles to declare storages for
+ *
+ * If a tile fits into a single variable of the native type matching
+ * the tile's vector length, a single variable is declared with the name
+ * matching the \b baseName field of the @ref Tile structure.
+ * If not, the following rules are applied. If the tile is to be stored
+ * in a private array, the variable name matches the base name and the array
+ * size is sufficient to fit the tile. If the tile is to be stored in a set
+ * of variables, their names are composed of the base name followed by
+ * an integer index starting from zero and incremented by one for each
+ * subsequent variable.
+ *
+ * @return 0 on success, and -EOVERFLOW if the source buffer is overflowed
+ *
+ * @ingroup TILES
+ */
+int
+declareTileStorages(struct KgenContext *ctx, const struct BlasGenSettings *gset);
+
+/**
+ * @internal
+ * @brief Declare variable needed to store one tile
+ *
+ * @param[out] ctx        Generator context
+ * @param[in] tile        Tile settings containing descriptors of
+ *                        a tile to declare storages for
+ *
+ * If a tile fits into a single variable of the native type matching
+ * the tile's vector length, a single variable is declared with the name
+ * matching the \b baseName field of the @ref Tile structure.
+ * If not, the following rules are applied. If the tile is to be stored
+ * in a private array, the variable name matches the base name and the array
+ * size is sufficient to fit the tile. If the tile is to be stored in a set
+ * of variables, their names are composed of the base name followed by
+ * an integer index starting from zero and incremented by one for each
+ * subsequent variable.
+ *
+ * @return 0 on success, and -EOVERFLOW if the source buffer is overflowed
+ *
+ * @ingroup TILES
+ */
+int
+declareOneTileStorage(struct KgenContext *ctx, const Tile *tile);
+
+/**
+ * @internal
+ * @brief Sprintf element composed of one or several data elements
+ *        stored in the tile
+ *
+ * @param[out] str          Kernel string object to store tile element
+ *                          expression
+ * @param[in] tile          Tile description structure
+ * @param[in] row           Row of the starting element
+ * @param[in] col           Element column
+ * @param[in] len           Number of tile elements needed to be captured by
+ *                          the expression
+ *
+ * \b row must be less than the number of rows and \b col must be less than
+ * the number of columns in the tile. Crossing a tile line is not allowed.
+ * That means \b col plus \b len must not exceed the number of columns
+ * if the tile is stored in direct form, and \b row plus \b len must not
+ * exceed the number of rows if the tile is stored in transposed form.
+ * If this does not hold in debug mode, an assertion is triggered.
+ * In release mode wrong code may be produced, which may even fail to compile.
+ *
+ * @ingroup TILES
+ */
+void
+sprintfTileElement(
+    Kstring *str,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col,
+    unsigned int len);
+
+/**
+ * @internal
+ * @brief Sprintf half of a single complex data element stored in the tile
+ *
+ * @param[out] str          Kernel string object to store tile element
+ *                          expression
+ * @param[in] tile          Tile description structure
+ * @param[in] row           Row of the starting element
+ * @param[in] col           Element column
+ * @param[in] half          Half type
+ *
+ * The restrictions for \b row and \b col are the same as for
+ * sprintfTileElement(). This function is applicable only for tiles containing
+ * complex data and must not be used in case of real data.
+ *
+ * @ingroup TILES
+ */
+void
+sprintfTileElementHalf(
+    Kstring *str,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col,
+    TileElementHalf half);
+
+/**
+ * @internal
+ * @brief Sprintf element composed of one or several data elements
+ *        stored in each of the tiles
+ *
+ * @param[out] kstrs        Kernel string objects array to store element
+ *                          expression for each tile
+ * @param[in] row           Vectorizable element row
+ * @param[in] col           Vectorizable element column
+ * @param[in] num           Number of tile description structure
+ * @param[in] first         First tile description structure
+ *
+ * Decides how many vectorized accesses each line of each tile needs and
+ * calls sprintfTileElement() for each of the tiles. Any values of \b row and
+ * \b col are accepted. \b kstrs and \b tile->baseName may be NULL; in that
+ * case sprintfTileElement() is not executed.
+ *
+ * @return 0 if no tiles were printed, or the number of vectors in one line
+ *
+ * @ingroup TILES
+ */
+int
+forEachTile(Kstring *kstrs,
+            unsigned int row,
+            unsigned int col,
+            unsigned int num,
+            Tile *first,
+            ...);
+
+/**
+ * @internal
+ * @brief Generate assigning a tile element with zero
+ *
+ * @param[out] ctx      Generator context
+ * @param[in] tile      Tile description structure
+ * @param[in] row       Row of the starting element
+ * @param[in] col       Element column
+ * @param[in] len       Number of elements needed to be assigned with zero
+ *
+ * See description of sprintfTileElement() for more details about restrictions
+ * on \b row, \b col and \b len.
+ *
+ * @ingroup TILES
+ */
+void
+genSetZeroInTile(
+    struct KgenContext *ctx,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col,
+    unsigned int len);
+
+/**
+ * @internal
+ * @brief Generate assigning a tile element with unit
+ *
+ * The element is addressed by \b row and \b col only; unlike
+ * genSetZeroInTile() this function takes no \b len parameter.
+ *
+ * @param[out] ctx      Generator context
+ * @param[in] tile      Tile description structure
+ * @param[in] row       Row of the starting element
+ * @param[in] col       Element column
+ *
+ * \b row should be less than number of rows and \b col should be less than
+ * number of columns in the tile. If it is not hold true in debug mode,
+ * an assertion is triggered. In the release may produce a wrong code which
+ * can be even not compilable.
+ *
+ * @ingroup TILES
+ */
+void
+genSetUnitInTile(
+    struct KgenContext *ctx,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col);
+
+/**
+ * @internal
+ * @brief Generate zeroing an entire tile
+ *
+ * @param[out] ctx      Generator context
+ * @param[in] tile      Tile description structure
+ *
+ * @ingroup TILES
+ */
+void
+genZeroTile(struct KgenContext *ctx, const Tile *tile);
+
+/**
+ * @internal
+ * @brief Generate copying between 2 tiles
+ *
+ * @param[out] ctx      Generator context
+ * @param[in] dst       Destination tile
+ * @param[in] src       Source tile
+ * @param[in] op        Operation combining dst with src, see TileCopyOps
+ * @ingroup TILES
+ */
+void
+genTileCopy(
+    struct KgenContext *ctx,
+    const Tile *dst,
+    const Tile *src,
+    TileCopyOps op);
+
+#endif /* TILE_H_ */
diff --git a/src/library/blas/gens/tile_iter.c b/src/library/blas/gens/tile_iter.c
new file mode 100644
index 0000000..65247f2
--- /dev/null
+++ b/src/library/blas/gens/tile_iter.c
@@ -0,0 +1,296 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <errno.h>
+#include <assert.h>
+#include "tile_iter.h"
+
+/*
+ * Recompute the logical (row, col) position of an iterator from its physical
+ * (line, vec) position.
+ *
+ * A physical line is a logical row when the tile is logically row-major and
+ * a logical column otherwise; the other logical coordinate is the index of
+ * the first element of the current vector within that line.
+ *
+ * Returns 0 on success and -EINVAL if iter is NULL.
+ */
+static int
+iterCalcLogCoords( PhysTileIterator* iter)
+{
+    int firstElem;
+
+    if (iter == NULL) {
+        return -EINVAL;
+    }
+    firstElem = iter->vec * iter->vecLen;
+    iter->row = iter->isLogRowMaj ? iter->line : firstElem;
+    iter->col = iter->isLogRowMaj ? firstElem : iter->line;
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Set up an iterator over the vectors of a tile.
+ *
+ * The iterator is zeroed, then the logical TILE_ITER_BACKWARD_* flags are
+ * mapped to physical ones: for a non-transposed tile lines are rows and
+ * vectors run along columns, for a transposed tile it is the opposite.
+ * The start position is the first or the last line/vector depending on
+ * the resulting physical directions.
+ *
+ * Returns 0 on success, or -EINVAL if a pointer is NULL, vecLen is not
+ * positive, or the line length of the tile is not a multiple of vecLen.
+ */
+int
+iterInit(PhysTileIterator *iter,
+    const Tile *tile,
+    int vecLen,
+    unsigned int tileIterFlags)
+{
+    if( NULL == iter ||
+        NULL == tile ){
+        return -EINVAL;
+    }
+
+    memset(iter, 0, sizeof(PhysTileIterator));
+    iter->isLogRowMaj = tile->trans ? 0 : 1;
+    iter->vecLen = vecLen;
+
+    // vecLen is a divisor below: zero is undefined, negative makes no sense
+    if ( vecLen <= 0 ) {
+        return -EINVAL;
+    }
+
+    if ( iter->isLogRowMaj ) {
+        if ( tile->nrCols % vecLen ) {
+            return -EINVAL;
+        }
+        if ( tileIterFlags & TILE_ITER_BACKWARD_ROWS ) {
+            iter->phyIterFlags |= PHY_ITER_BACKWARD_LINES;
+        }
+        if ( tileIterFlags & TILE_ITER_BACKWARD_COLS ) {
+            iter->phyIterFlags |= PHY_ITER_BACKWARD_VECS;
+        }
+        iter->nrLines = tile->nrRows;
+        iter->nrVecs = tile->nrCols/vecLen;
+    }
+    else {
+        if ( tile->nrRows % vecLen ) {
+            return -EINVAL;
+        }
+        if ( tileIterFlags & TILE_ITER_BACKWARD_ROWS ) {
+            iter->phyIterFlags |= PHY_ITER_BACKWARD_VECS;
+        }
+        if ( tileIterFlags & TILE_ITER_BACKWARD_COLS ) {
+            iter->phyIterFlags |= PHY_ITER_BACKWARD_LINES;
+        }
+        iter->nrLines = tile->nrCols;
+        iter->nrVecs = tile->nrRows/vecLen;
+    }
+
+    switch( iter->phyIterFlags & (  PHY_ITER_BACKWARD_VECS |
+                                    PHY_ITER_BACKWARD_LINES ) ){
+        // lines - forward, vectors - forward
+        case 0:
+            iter->vec = 0;
+            iter->line = 0;
+            break;
+        // lines - forward, vectors - backward
+        case PHY_ITER_BACKWARD_VECS:
+            iter->vec = iter->nrVecs-1;
+            iter->line = 0;
+            break;
+        // lines - backward, vectors - forward
+        case PHY_ITER_BACKWARD_LINES:
+            iter->vec = 0;
+            iter->line = iter->nrLines-1;
+            break;
+        // lines - backward, vectors - backward
+        case PHY_ITER_BACKWARD_LINES | PHY_ITER_BACKWARD_VECS:
+            iter->vec = iter->nrVecs-1;
+            iter->line = iter->nrLines-1;
+            break;
+    }
+
+    iterCalcLogCoords(iter);
+
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Move the iterator to the next vector in its traversal order.
+ *
+ * Vectors within a line are visited first; when a line is exhausted the
+ * iterator wraps to the starting vector of the following line in the line
+ * direction. Logical coordinates are refreshed after the step.
+ *
+ * Returns 0 after a step, 1 if already at the end, -EINVAL if iter is NULL.
+ */
+int iterIterate(PhysTileIterator *iter)
+{
+    if( NULL == iter ){
+        return -EINVAL;
+    }
+
+    //tile end
+    if( iterIsEnd(iter) ){
+        return 1;
+    }
+
+    switch( iter->phyIterFlags & (  PHY_ITER_BACKWARD_LINES |
+                                    PHY_ITER_BACKWARD_VECS) ){
+        // lines - forward, vectors - forward
+        case 0:
+            if( iter->nrVecs-1 == iter->vec ){
+                iter->vec = 0;
+                iter->line++;
+            }
+            else{
+                iter->vec++;
+            }
+            break;
+
+        // lines - forward, vectors - backward
+        case PHY_ITER_BACKWARD_VECS:
+            if( 0 == iter->vec ){
+                iter->vec = iter->nrVecs-1;
+                iter->line++;
+            }
+            else{
+                iter->vec--;
+            }
+            break;
+
+        // lines - backward, vectors - forward
+        case PHY_ITER_BACKWARD_LINES:
+            if( iter->nrVecs-1 == iter->vec ){
+                iter->vec = 0;
+                iter->line--;
+            }
+            else{
+                iter->vec++;
+            }
+            break;
+
+        // lines - backward, vectors - backward
+        case ( PHY_ITER_BACKWARD_LINES | PHY_ITER_BACKWARD_VECS ):
+            if(  0 == iter->vec ){
+                iter->vec = iter->nrVecs-1;
+                iter->line--;
+            }
+            else{
+                iter->vec--;
+            }
+            break;
+    }
+
+    iterCalcLogCoords(iter);
+
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Position the iterator at logical (row, col): the coordinate that runs
+ * across lines becomes the physical line, the other one divided by vecLen
+ * gives the vector. Returns 0, or -EINVAL if iter is NULL.
+ */
+int
+iterSeek( PhysTileIterator *iter,
+    int row,
+    int col )
+{
+    int acrossLines, alongLine;
+
+    if (iter == NULL) {
+        return -EINVAL;
+    }
+
+    acrossLines = iter->isLogRowMaj ? row : col;
+    alongLine = iter->isLogRowMaj ? col : row;
+    iter->row = row;
+    iter->col = col;
+    iter->line = acrossLines;
+    iter->vec = alongLine / iter->vecLen;
+
+    assert( iter->line < iter->nrLines );
+    assert( iter->vec < iter->nrVecs );
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Position the iterator at physical (line, vec) and derive the logical
+ * coordinates: the line is the row of a row-major tile or the column
+ * otherwise. Returns 0, or -EINVAL if iter is NULL.
+ */
+int
+iterSeekPhys( PhysTileIterator *iter,
+    int line,
+    int vec )
+{
+    int firstElem;
+
+    if (iter == NULL) {
+        return -EINVAL;
+    }
+
+    firstElem = vec * iter->vecLen;
+    iter->line = line;
+    iter->vec = vec;
+    iter->row = iter->isLogRowMaj ? line : firstElem;
+    iter->col = iter->isLogRowMaj ? firstElem : line;
+
+    // debug-only sanity check of the upper bounds
+    assert( iter->line < iter->nrLines );
+    assert( iter->vec < iter->nrVecs );
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Check if the entire tile has been iterated. Return true if the iterator is
+ * at the next element beyond the last.
+ *
+ * Only the line index is examined. A NULL iterator yields -EINVAL, which
+ * callers testing for non-zero will also see as "end".
+ */
+int iterIsEnd(const PhysTileIterator *iter)
+{
+    int pastLast;
+
+    if (iter == NULL) {
+        return -EINVAL;
+    }
+
+    if (iter->phyIterFlags & PHY_ITER_BACKWARD_LINES) {
+        // walking lines downwards: the end is one step before line 0
+        pastLast = (iter->line < 0);
+    }
+    else {
+        // walking lines upwards: the end is one step after the last line
+        pastLast = (iter->line >= iter->nrLines);
+    }
+
+    return pastLast;
+}
diff --git a/src/library/blas/gens/tile_iter.h b/src/library/blas/gens/tile_iter.h
new file mode 100644
index 0000000..81028a3
--- /dev/null
+++ b/src/library/blas/gens/tile_iter.h
@@ -0,0 +1,79 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef TILE_ITER_H
+#define TILE_ITER_H
+
+#include "blas_kgen.h"
+
+typedef enum TileIterFlags {
+    // iterate logical rows backward; iterInit() maps it to lines or vectors
+    TILE_ITER_BACKWARD_ROWS = 0x01,
+    // iterate logical columns backward; iterInit() maps it to vectors or lines
+    TILE_ITER_BACKWARD_COLS = 0x02
+} TileIterFlags;
+
+typedef enum PhyIterFlags {
+    PHY_ITER_BACKWARD_LINES = 0x01, // walk physical lines from last to first
+    PHY_ITER_BACKWARD_VECS = 0x02,  // walk vectors of a line from last to first
+} PhyIterFlags;
+
+typedef struct PhysTileIterator {
+    int row;   // logical tile row
+    int col;   // logical tile column
+
+    int phyIterFlags;   // combination of PhyIterFlags
+    int isLogRowMaj;    // non-zero if the tile is not transposed
+
+    int vecLen;         // elements per vector, as passed to iterInit()
+
+    int line;     // physical line
+    int vec;      // vector in physical line
+
+    int nrLines;   // number of physical lines
+    int nrVecs;    // number of vectors in one physical line
+
+} PhysTileIterator;
+
+//-----------------------------------------------------------------------------
+
+int
+iterInit(PhysTileIterator *iter,
+    const Tile *tile,
+    int vecLen,
+    unsigned int tileIterFlags);
+
+int
+iterIterate(PhysTileIterator *iter);
+
+/*
+ * Check if the entire tile has been iterated. Return true if the iterator is
+ * at the next element beyond the last.
+ */
+int
+iterIsEnd(const PhysTileIterator *iter);
+
+int
+iterSeek( PhysTileIterator *iter,
+    int row,
+    int col );
+
+int
+iterSeekPhys( PhysTileIterator *iter,
+    int line,
+    int vec );
+
+#endif
diff --git a/src/library/blas/gens/tilemul.c b/src/library/blas/gens/tilemul.c
new file mode 100644
index 0000000..1ff00b7
--- /dev/null
+++ b/src/library/blas/gens/tilemul.c
@@ -0,0 +1,952 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include <defbool.h>
+#include <clblas_stddef.h>
+#include <sys/types.h>
+#include <kerngen.h>
+#include <matrix_dims.h>
+#include <dis_warning.h>
+
+#include "blas_kgen.h"
+
+#define MAX_LENGTH 4096
+#define BITS_INT (sizeof(int) * 8)
+
+typedef enum VectMulType {
+    VECT_MULT_REAL,             /* result goes to a whole element of C */
+    VECT_MULT_COMPLEX_REAL,     /* result goes to the TE_HALF_LOW half of C */
+    VECT_MULT_COMPLEX_IMAG      /* result goes to the TE_HALF_HIGH half of C */
+} VectMulType;
+
+static const char *vectComponents = "0123456789abcdef"; /* ".s" suffix digits */
+
+/*
+ * Fetch the vector lengths of A, B and C from the generator settings.
+ * Any output pointer may be NULL, in which case it is skipped. Per-matrix
+ * lengths are used only when BGF_DISTINCT_VECLEN is set.
+ */
+static void
+getVecLens(
+    const BlasGenSettings *gset,
+    unsigned int *vlenA,
+    unsigned int *vlenB,
+    unsigned int *vlenC)
+{
+    const CLBLASKernExtra *extra = gset->kextra;
+    // otherwise the single common vector length serves all three matrices
+    const bool distinct = ((gset->flags & BGF_DISTINCT_VECLEN) != 0);
+
+    if (vlenA != NULL) { *vlenA = distinct ? extra->vecLenA : extra->vecLen; }
+    if (vlenB != NULL) { *vlenB = distinct ? extra->vecLenB : extra->vecLen; }
+    if (vlenC != NULL) { *vlenC = distinct ? extra->vecLenC : extra->vecLen; }
+}
+
+/*
+ * Fall back from TILEMUL_DOT to TILEMUL_MULADD when 'dot' cannot be used:
+ *  - for complex data types;
+ *  - unless A is fetched by rows and B by columns (!tra && trb);
+ *  - when subdims[1].bwidth is not a multiple of the vector length of C.
+ *
+ * Any other core is returned unchanged.
+ */
+static TileMulCore
+checkReplaceCore(
+    const BlasGenSettings *gset,
+    TileMulCore core,
+    bool tra,
+    bool trb)
+{
+    const SubproblemDim *dims = gset->subdims;
+    bool complexData = isComplexType(gset->kextra->dtype);
+    unsigned int vlenC;
+
+    getVecLens(gset, NULL, NULL, &vlenC);
+    if (core != TILEMUL_DOT) {
+        return core;
+    }
+
+    // alignment is tested last, i.e. only for real data with !tra && trb
+    if (complexData || tra || !trb || (dims[1].bwidth % vlenC != 0)) {
+        core = TILEMUL_MULADD;
+    }
+    return core;
+}
+
+// Run the optional postFetch hook plus a blank line; 0 if there is no hook
+static int
+checkTriggerPostFetch(
+    struct KgenContext *ctx,
+    const TileMulOpts *mulOpts,
+    MatrixRole mrole)
+{
+    int status;
+    if (!mulOpts->postFetch) {
+        return 0;
+    }
+    status = mulOpts->postFetch(ctx, mrole, mulOpts->postFetchPriv);
+    kgenAddBlankLine(ctx);
+    return status;
+}
+
+/*
+ * In an expression of complex elements swap the real and imaginary parts
+ */
+static void
+swapComplexComponents(Kstring *expr, unsigned int vecLen)
+{
+    char *buf = expr->buf;
+    unsigned int pos;
+
+    if (strchr(buf, '.') == NULL) {
+        // no component suffix yet: build it already swapped, e.g. ".s1032"
+        char *tail;
+
+        strcat(buf, ".s");
+        tail = buf + strlen(buf);
+        for (pos = 0; pos < vecLen; pos++) {
+            tail[2 * pos] = vectComponents[2 * pos + 1];
+            tail[2 * pos + 1] = vectComponents[2 * pos];
+        }
+        tail[2 * vecLen] = '\0';
+        return;
+    }
+
+    // a '.' is present: exchange the trailing characters pairwise,
+    // one pair per element, walking from the end of the string
+    pos = (unsigned int)strlen(buf) - 1;
+    while (vecLen != 0) {
+        char last = buf[pos];
+
+        buf[pos] = buf[pos - 1];
+        buf[pos - 1] = last;
+        pos -= 2;
+        vecLen--;
+    }
+}
+
+/*
+ * Split a complex element expression into expressions of its two halves:
+ * ".s0" and ".s1" are appended when src has no '.', otherwise re drops
+ * the last character of src and im drops the last but one.
+ */
+static void
+takeComplexApart(Kstring *re, Kstring *im, const Kstring *src)
+{
+    const char *dot = strchr(src->buf, '.');
+    int last;
+
+    strcpy(re->buf, src->buf);
+    strcpy(im->buf, src->buf);
+    if (dot == NULL) {
+        strcat(re->buf, ".s0");
+        strcat(im->buf, ".s1");
+        return;
+    }
+    last = (int)strlen(src->buf) - 1;
+    re->buf[last] = '\0';
+    im->buf[last - 1] = im->buf[last];
+    im->buf[last] = '\0';
+}
+
+/*
+ * Pick the physical row in tile A for row m of tile C: m itself when A is
+ * stored transposed or as a whole, row 0 otherwise
+ */
+static __inline unsigned int
+selectRowA(const Tile *a, unsigned int m, bool wholeA)
+{
+    return (!a->trans && !wholeA) ? 0 : m;
+}
+
+/*
+ * Pick the physical column in tile A for index k: k itself when A is
+ * stored non-transposed or as a whole, column 0 otherwise
+ */
+static __inline unsigned int
+selectColA(const Tile *a, unsigned int k, bool wholeA)
+{
+    return (a->trans && !wholeA) ? 0 : k;
+}
+
+/*
+ * Length of the line segment usable with both tiles taking part in a tile
+ * multiplication, i.e. the smaller of their line segment lengths
+ */
+static unsigned int
+commonTileSegmentLen(const Tile *tile1, const Tile *tile2)
+{
+    const unsigned int len1 = tileLineSegmentLen(tile1);
+    const unsigned int len2 = tileLineSegmentLen(tile2);
+
+    // both lengths are evaluated once before being handed to umin()
+    return umin(len1, len2);
+}
+
+/*
+ * Generate "<ptr> += <step>;" advancing a block pointer. size_t operands are
+ * cast to unsigned long so that they match the "%lu" conversions everywhere.
+ */
+static void
+genPointerUpdate(
+    struct KgenContext *ctx,
+    const char *ptrName,
+    const char *ldName,
+    size_t bwidth,
+    size_t bheight,
+    unsigned int vecLen,
+    DataType dtype,
+    BlasGenFlags gflags,
+    bool rowMaj,
+    bool isLocal)
+{
+    const char *uptr;
+    const char *p = ptrName;
+    Kstring tmp;
+
+    if (gflags & BGF_UPTRS) {
+        getVectorTypeName(dtype, vecLen, NULL, &uptr);
+        ksprintf(&tmp, "%s.%s", ptrName, uptr);
+        p = tmp.buf;
+    }
+
+    if (rowMaj) {
+        kgenPrintf(ctx, "%s += %lu;\n", p, (unsigned long)(bwidth / vecLen));
+    }
+    else if (isLocal) {
+        kgenPrintf(ctx, "%s += %lu;\n",
+                   p, (unsigned long)(bwidth * (bheight / vecLen)));
+    }
+    else {
+        Kstring ld, bwStr, madExpr;
+        unsigned int scale;
+
+        kstrcpy(&ld, ldName);
+        ksprintf(&bwStr, "%lu", (unsigned long)bwidth);
+        scale = (gflags & BGF_LD_IN_VECTORS) ? 0 : vecLen;
+        sprintfFastScalarMad(&madExpr, &bwStr, &ld, scale, NULL);
+        kgenPrintf(ctx, "%s += %s;\n", p, madExpr.buf);
+    }
+}
+
+// Generate the real update "C += A * B", through 'mad' for the TILEMUL_MAD
+// core and through plain operators for any other one
+static void
+genRealMulUpdate(
+    struct KgenContext *ctx,
+    const Kstring *elA,
+    const Kstring *elB,
+    const Kstring *elC,
+    bool transC,
+    TileMulCore core)
+{
+    char stmt[MAX_LENGTH];
+    const char *first = elB->buf;
+    const char *second = elA->buf;
+
+    // the type of the 'mad' result is determined by the first operand
+    if (transC) {
+        first = elA->buf;
+        second = elB->buf;
+    }
+    if (core != TILEMUL_MAD) {
+        sprintf(stmt, "%s += %s * %s;\n", elC->buf, first, second);
+    }
+    else {
+        sprintf(stmt, "%s = mad(%s, %s, %s);\n",
+                elC->buf, first, second, elC->buf);
+    }
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Generate a complete vector-vector product accumulated into element (m, n)
+ * of tile C: a 'dot' call per segment for vectorized real data, otherwise one
+ * multiply-add statement per element. Row 0 of A is used unless wholeA is set.
+ */
+static void
+genVecMul(
+    struct KgenContext *ctx,
+    unsigned int m,
+    unsigned int n,
+    const Tile *a,
+    const Tile *b,
+    const Tile *c,
+    bool conjA,
+    bool conjB,
+    TileMulCore core,
+    bool wholeA)
+{
+    unsigned int k;
+    char tmp[MAX_LENGTH];
+    Kstring elA, elB, elC;
+    unsigned int vlen;
+    bool isComplex, isDouble;
+
+    isDouble = isDoubleBasedType(c->dtype);
+    isComplex = isComplexType(c->dtype);
+    // only 'dot' on real data consumes more than one element per statement
+    vlen = ((core == TILEMUL_DOT) && !isComplex) ?
+        commonTileSegmentLen(a, b) : 1;
+
+    sprintfTileElement(&elC, c, m, n, 1);
+    if (!wholeA) {
+        m = 0;
+    }
+
+    for (k = 0; k < a->nrCols; k += vlen) {
+        sprintfTileElement(&elA, a, m, k, vlen);
+        sprintfTileElement(&elB, b, k, n, vlen);
+
+        /*
+         * Using 'dot' is not valid for complex, and replaced with '*' operator
+         * for unvectorized real data
+         */
+        if ((core == TILEMUL_DOT) && (vlen > 1)) {
+            sprintf(tmp, "%s += dot(%s, %s);\n", elC.buf, elA.buf, elB.buf);
+            kgenAddStmt(ctx, tmp);
+        }
+        else if (isComplex) {
+            Kstring expr;
+
+            sprintfComplexMulUpdate(&expr, &elC, &elA, &elB, &elC, isDouble,
+                                    conjA, conjB, core);
+            kgenAddStmt(ctx, expr.buf);
+        }
+        else {
+            genRealMulUpdate(ctx, &elA, &elB, &elC, c->trans, core);
+        }
+    }
+}
+
+/*
+ * Generate complete vector-vector product using separate multiply-add
+ * operations and explicit vectorization.
+ *
+ * Two statements are emitted: the first one stores the sum of the
+ * element-wise products of vector segments into the kernel variable 'sum',
+ * the second one folds the components of 'sum' (with per-component signs)
+ * into the destination element of the tile C.
+ *
+ * 'type' selects between the real product and the real or the imaginary
+ * half of the complex product.
+ */
+static void
+genVectorizedVecMulAdd(
+    struct KgenContext *ctx,
+    unsigned int m,
+    unsigned int n,
+    const Tile *a,
+    const Tile *b,
+    const Tile *c,
+    bool conjA,
+    bool conjB,
+    VectMulType type,
+    bool wholeA)
+{
+    char stmt[MAX_LENGTH], folded[MAX_LENGTH];
+    char *pos = stmt;
+    Kstring elA, elB, elC;
+    unsigned int k;
+    unsigned int nrComps;
+    unsigned int vlen;
+    char op;
+    // non-zero entry: components with even ([0]) or odd ([1]) index get '-'
+    int negate[2] = {0, 0};
+
+    vlen = commonTileSegmentLen(a, b);
+
+    /*
+     * NOTE(review): 'm' is reset before the destination element is
+     * formatted, whereas genVecMul() formats it with the original 'm' -
+     * confirm that it is intended
+     */
+    if (!wholeA) {
+        m = 0;
+    }
+
+    switch (type) {
+    case VECT_MULT_REAL:
+        sprintfTileElement(&elC, c, m, n, 1);
+        nrComps = vlen;
+        break;
+    case VECT_MULT_COMPLEX_REAL:
+        sprintfTileElementHalf(&elC, c, m, n, TE_HALF_LOW);
+        nrComps = vlen * 2;
+        // odd components are negated when the conjugation flags are equal
+        if ((conjA && conjB) || !(conjA || conjB)) {
+            negate[1] = 1;
+        }
+        break;
+    default:
+        sprintfTileElementHalf(&elC, c, m, n, TE_HALF_HIGH);
+        nrComps = vlen * 2;
+        /*
+         * When both the matrices are conjugated no component is negated;
+         * the whole sum is subtracted from the destination instead
+         */
+        if (!conjA || !conjB) {
+            negate[0] = (int)conjB;
+            negate[1] = (int)conjA;
+        }
+        break;
+    }
+
+    // the initial product followed by the products of remaining segments
+    k = 0;
+    do {
+        sprintfTileElement(&elA, a, m, k, vlen);
+        sprintfTileElement(&elB, b, k, n, vlen);
+        if (type == VECT_MULT_COMPLEX_IMAG) {
+            swapComplexComponents(&elB, vlen);
+        }
+        if (k == 0) {
+            pos += sprintf(pos, "sum = %s * %s", elA.buf, elB.buf);
+        }
+        else {
+            pos += sprintf(pos, " + %s * %s", elA.buf, elB.buf);
+        }
+        k += vlen;
+    } while (k < a->nrCols);
+
+    // 'pos' points to the terminating zero of the statement
+    sprintf(pos, ";\n");
+    kgenAddStmt(ctx, stmt);
+
+    // sum components of the temporary result
+    pos = folded;
+    pos += sprintf(pos, "%ssum.s0", (negate[0]) ? "-" : "");
+    for (k = 1; k < nrComps; k++) {
+        pos += sprintf(pos, " %c sum.s%c",
+                       (negate[k & 1]) ? '-' : '+', vectComponents[k]);
+    }
+
+    op = ((type == VECT_MULT_COMPLEX_IMAG) && conjA && conjB) ? '-' : '+';
+    sprintf(stmt, "%s %c= %s;\n", elC.buf, op, folded);
+
+    kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Generate one stage of vector-vector product. Iterating over M and N having
+ * fixed coordinate over K.
+ *
+ * For a transposed tile A all its rows are walked, otherwise only the
+ * row 'lineA'. For real data, when the tile C has the same transposition as
+ * the tile A or the tile B, the elements are processed in vector segments.
+ */
+static void
+genStagedVecMul(
+    struct KgenContext *ctx,
+    unsigned int lineA,
+    unsigned int k,
+    const Tile *a,
+    const Tile *b,
+    const Tile *c,
+    bool conjA,
+    bool conjB,
+    TileMulCore core,
+    bool wholeA)
+{
+    Kstring elA, elB, elC;
+    unsigned int firstRow, endRow;
+    unsigned int stepM = 1, stepN = 1, vlenC = 1;
+    unsigned int row, col;
+    unsigned int rowA, colA;
+    bool isDouble;
+    bool isComplex;
+
+    firstRow = (a->trans) ? 0 : lineA;
+    endRow = (a->trans) ? a->nrRows : lineA + 1;
+
+    isDouble = isDoubleBasedType(c->dtype);
+    isComplex = isComplexType(c->dtype);
+
+    // vectorized update is possible along the direction C shares with A or B
+    if (!isComplex &&
+        ((c->trans == a->trans) || (c->trans == b->trans))) {
+
+        if (c->trans) {
+            vlenC = commonTileSegmentLen(a, c);
+            stepM = vlenC;
+        }
+        else {
+            vlenC = commonTileSegmentLen(b, c);
+            stepN = vlenC;
+        }
+    }
+
+    colA = selectColA(a, k, wholeA);
+
+    for (row = firstRow; row < endRow; row += stepM) {
+        rowA = selectRowA(a, row, wholeA);
+        sprintfTileElement(&elA, a, rowA, colA, stepM);
+
+        for (col = 0; col < b->nrCols; col += stepN) {
+            sprintfTileElement(&elB, b, k, col, stepN);
+            sprintfTileElement(&elC, c, row, col, vlenC);
+
+            if (!isComplex) {
+                genRealMulUpdate(ctx, &elA, &elB, &elC, c->trans, core);
+            }
+            else {
+                Kstring expr;
+
+                sprintfComplexMulUpdate(&expr, &elC, &elA, &elB, &elC,
+                                        isDouble, conjA, conjB, core);
+                kgenAddStmt(ctx, expr.buf);
+            }
+        }
+    }
+}
+
+/*
+ * Check generator input values.
+ *
+ * The only condition verified at present: the 'conjugated' flags are
+ * accepted for complex data types only. A former check rejecting cyclic
+ * global addressing combined with a matrix in the local memory is disabled,
+ * since it did not cover the case of the matrix B in the local memory and
+ * the matrix A in the global memory.
+ *
+ * Returns 0 on success, -EINVAL on wrong flags.
+ */
+static int
+checkInput(const BlasGenSettings *gset, const TileMulOpts *mulOpts)
+{
+    bool complexData = isComplexType(gset->kextra->dtype);
+    bool conjugated =
+        ((mulOpts->flags & (TILEMUL_CONJA | TILEMUL_CONJB)) != 0);
+
+    /* 'Conjugated' flag can be used for complex types only */
+    return (conjugated && !complexData) ? -EINVAL : 0;
+}
+
+/*
+ * Generate multiplication for the line 'lineOffset' of the tile A.
+ *
+ * If the tile A is transposed or the tile B is not, the product is built
+ * in stages with genStagedVecMul(). Otherwise a full product of the row
+ * of A by every column of B is generated, either with explicitly vectorized
+ * multiply-add statements or with genVecMul().
+ */
+static void
+genMulLineOnTile(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const TileMulOpts *mulOpts,
+    unsigned int lineOffset,
+    bool wholeA)
+{
+    const Tile *tileA = &gset->tileA;
+    const Tile *tileB = &gset->tileBX;
+    const Tile *tileC = &gset->tileCY;
+    const SubproblemDim *dims = gset->subdims;
+    TileMulFlags flags = mulOpts->flags;
+    TileMulCore core;
+    bool complexData;
+    bool conjA, conjB;
+    bool vectMulAdd;
+    unsigned int col, nrCols;
+    unsigned int kPos, kEnd;
+
+    nrCols = (unsigned int)dims[1].x;
+    core = checkReplaceCore(gset, mulOpts->core, tileA->trans, tileB->trans);
+
+    complexData = isComplexType(gset->kextra->dtype);
+    conjA = ((flags & TILEMUL_CONJA) != 0);
+    conjB = ((flags & TILEMUL_CONJB) != 0);
+
+    if (tileA->trans || !tileB->trans) {
+        // a single K position for transposed A, the whole block otherwise
+        if (tileA->trans) {
+            kPos = lineOffset;
+            kEnd = lineOffset + 1;
+        }
+        else {
+            kPos = 0;
+            kEnd = (unsigned int)dims[1].bwidth;
+        }
+
+        for (; kPos < kEnd; kPos++) {
+            genStagedVecMul(ctx, lineOffset, kPos, tileA, tileB, tileC,
+                            conjA, conjB, core, wholeA);
+        }
+
+        return;
+    }
+
+    // explicit vectorization must be both possible and requested
+    vectMulAdd = (commonTileSegmentLen(tileA, tileB) > 1) &&
+                 ((flags & TILEMUL_FORCE_VECTORIZATION) != 0) &&
+                 (core == TILEMUL_MULADD);
+
+    for (col = 0; col < nrCols; col++) {
+        /* full dot product of row of A by column of B */
+        if (!vectMulAdd) {
+            genVecMul(ctx, lineOffset, col, tileA, tileB, tileC,
+                      conjA, conjB, core, wholeA);
+        }
+        else if (!complexData) {
+            genVectorizedVecMulAdd(ctx, lineOffset, col, tileA, tileB, tileC,
+                                   false, false, VECT_MULT_REAL, wholeA);
+        }
+        else {
+            // real and imaginary parts are accumulated separately
+            genVectorizedVecMulAdd(ctx, lineOffset, col, tileA, tileB, tileC,
+                                   conjA, conjB, VECT_MULT_COMPLEX_REAL,
+                                   wholeA);
+            genVectorizedVecMulAdd(ctx, lineOffset, col, tileA, tileB, tileC,
+                                   conjA, conjB, VECT_MULT_COMPLEX_IMAG,
+                                   wholeA);
+        }
+    }
+}
+
+/*
+ * Print to 'expr' the statement(s) evaluating a complex product of 'a' and
+ * 'b' and storing or accumulating it to 'dst'.
+ *
+ * The product is composed of two vectorized terms. With (a1, a2) and
+ * (b1, b2) being complex components of 'a' and 'b', and asw and bsw -
+ * swapped elements of 'a' and 'b' respectively, the terms are:
+ *
+ * b * a1 + bsw * (-a2, a2)       if neither 'a' nor 'b' is conjugated
+ * b * a1 + bsw * (a2, -a2)       if only 'a' is conjugated
+ * a * b1 + asw * (b2, -b2)       if only 'b' is conjugated
+ * asw * (-b2) + a * (b1, -b1)    if both 'a' and 'b' are conjugated
+ *
+ * For the 'mad' core two chained mad() statements are printed, starting
+ * from 'c' (or from "0" if 'c' is NULL). For other cores a single
+ * statement is printed, using "+=" if 'c' is the same string object as
+ * 'dst', or "=" with 'c' appended as a summand if it is a different one.
+ */
+void
+sprintfComplexMulUpdate(
+    Kstring *expr,
+    const Kstring *dst,
+    const Kstring *a,
+    const Kstring *b,
+    const Kstring *c,
+    bool isDouble,
+    bool conjA,
+    bool conjB,
+    TileMulCore core)
+{
+    const char *vecType = (isDouble) ? "double2" : "float2";
+    // operand used as a whole and operand taken apart into components
+    const Kstring *whole = (conjB) ? a : b;
+    const Kstring *split = (conjB) ? b : a;
+    Kstring swapped;            // 'whole' with swapped components
+    Kstring rePart, imPart;     // components of 'split'
+    // multiplicands of the first and of the second term
+    const Kstring *vec1, *comp1, *vec2, *comp2;
+    // signs of 'comp1' and of low/high component of the second term
+    const char *neg1 = "", *negLow = "", *negHigh = "";
+
+    kstrcpy(&swapped, whole->buf);
+    swapComplexComponents(&swapped, 1);
+    takeComplexApart(&rePart, &imPart, split);
+
+    if (conjA && conjB) {
+        vec1 = &swapped;
+        comp1 = &imPart;
+        vec2 = whole;
+        comp2 = &rePart;
+        neg1 = "-";
+        negHigh = "-";
+    }
+    else {
+        vec1 = whole;
+        comp1 = &rePart;
+        vec2 = &swapped;
+        comp2 = &imPart;
+        if (conjA || conjB) {
+            negHigh = "-";
+        }
+        else {
+            negLow = "-";
+        }
+    }
+
+    if (core == TILEMUL_MAD) {
+        const char *initial = (c == NULL) ? "0" : c->buf;
+
+        ksprintf(expr, "%s = mad(%s, %s%s, %s);\n"
+                       "%s = mad(%s, (%s)(%s%s, %s%s), %s);\n",
+                 dst->buf, vec1->buf, neg1, comp1->buf, initial,
+                 dst->buf, vec2->buf, vecType, negLow, comp2->buf,
+                 negHigh, comp2->buf, dst->buf);
+        return;
+    }
+
+    ksprintf(expr, "%s %s %s * %s%s + %s * (%s)(%s%s, %s%s)",
+             dst->buf, (dst == c) ? "+=" : "=", vec1->buf, neg1,
+             comp1->buf, vec2->buf, vecType, negLow, comp2->buf,
+             negHigh, comp2->buf);
+    // a separate summand is appended explicitly
+    if ((c != NULL) && (c != dst)) {
+        kstrcatf(expr, " + %s", c->buf);
+    }
+    kstrcatf(expr, "%s", ";\n");
+}
+
+/*
+ * Generate multiplication of tiles without fetching them: after the input
+ * check, the multiplication is generated for every line of the tile A.
+ * The temporary variable 'sum' is declared beforehand when the
+ * non-transposed A is multiplied by the transposed B with the 'muladd'
+ * core or with complex data.
+ *
+ * Returns 0 on success, the error of the input check, or -EOVERFLOW if
+ * the generator context reports an error state.
+ */
+int
+genMulTiles(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const TileMulOpts *mulOpts)
+{
+    const CLBLASKernExtra *kextra = gset->kextra;
+    const bool tra = ((mulOpts->flags & TILEMUL_TRA) != 0);
+    const bool trb = ((mulOpts->flags & TILEMUL_TRB) != 0);
+    char decl[32];
+    const char *typeName;
+    TileMulCore core;
+    unsigned int line;
+    unsigned int nrLines;
+    int ret;
+
+    ret = checkInput(gset, mulOpts);
+    if (ret) {
+        return ret;
+    }
+
+    getVectorTypeName(kextra->dtype, kextra->vecLen, &typeName, NULL);
+    core = checkReplaceCore(gset, mulOpts->core, tra, trb);
+
+    if (!tra && trb &&
+        ((core == TILEMUL_MULADD) || isComplexType(kextra->dtype))) {
+
+        sprintf(decl,"%s sum;\n", typeName);
+        kgenAddStmt(ctx, decl);
+    }
+
+    nrLines = (unsigned int)((tra) ? gset->subdims[1].bwidth :
+                                     gset->subdims[1].y);
+    for (line = 0; line < nrLines; line++) {
+        genMulLineOnTile(ctx, gset, mulOpts, line, true);
+    }
+
+    // just to get state
+    if (kgenAddStmt(ctx, NULL)) {
+        return -EOVERFLOW;
+    }
+
+    return 0;
+}
+
+/*
+ * Generate the tile multiplier: the code fetching the tiles A and B
+ * (the fetch of B can be disabled with TILEMUL_NOT_FETCH_B), multiplying
+ * them line by line and advancing the K-related pointers or coordinate.
+ *
+ * If 'mulOpts->fctx' is NULL, a temporary fetch context is created and
+ * destroyed before returning. If the top level and the second level block
+ * widths differ and TILEMUL_BW_STRIDE is not set, the generated code is
+ * enclosed into a 'k1' loop.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EOVERFLOW if the
+ * final statement cannot be added to the context, or the error returned
+ * by the input check or the fetch generators.
+ */
+int
+tileMulGen(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const TileMulOpts *mulOpts)
+{
+    char s[MAX_LENGTH];
+    unsigned int vlenA, vlenB;
+    unsigned int i, iend; //counters
+    int ret = 0;
+    TileMulFlags mflags = mulOpts->flags;
+    bool tra = ((mflags & TILEMUL_TRA) != 0);
+    bool trb = ((mflags & TILEMUL_TRB) != 0);
+    bool localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
+    bool localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);
+    bool internalFetchB = ((mflags & TILEMUL_NOT_FETCH_B) == 0);
+    bool bwStride = ((mflags & TILEMUL_BW_STRIDE) != 0);
+    bool incK = ((mflags & TILEMUL_NOT_INC_K) == 0);
+    const SubproblemDim *subdims = gset->subdims;
+    size_t bwidth = bwStride ? subdims[0].bwidth : subdims[1].bwidth;
+    TileMulCore core = mulOpts->core;
+    DataType dtype = gset->kextra->dtype;
+    const KernelVarNames *varNames = &gset->varNames;
+    FetchOpts fetchOpts;
+    struct FetchContext *fctx = mulOpts->fctx;
+    FetchAddrMode addrMode;
+    FetchOptLevel foptlev;
+    struct StatementBatch *batch = NULL;
+    const Tile *tile;
+
+    memset(&fetchOpts, 0, sizeof(fetchOpts));
+    fetchOpts.memA = mulOpts->memA;
+    fetchOpts.memB = mulOpts->memB;
+
+    kgenAddStmt(ctx, "/* -- Tiles multiplier -- */\n");
+
+    getVecLens(gset, &vlenA, &vlenB, NULL);
+
+    /* check generator input values */
+    ret = checkInput(gset, mulOpts);
+    if (ret) {
+        return ret;
+    }
+
+    if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) {
+        // sizes are cast explicitly to match the '%lu' conversion
+        sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
+                (unsigned long)subdims[0].bwidth,
+                (unsigned long)subdims[1].bwidth);
+        kgenBeginBranch(ctx, s);
+    }
+
+    core = checkReplaceCore(gset, core, tra, trb);
+    if (((core == TILEMUL_MULADD || isComplexType(dtype)) &&
+            !tra && trb)) {
+
+        unsigned int n;
+        const char *tname;
+
+        // temporary variable used by the vector-vector product statements
+        n = commonTileSegmentLen(&gset->tileA, &gset->tileBX);
+        getVectorTypeName(gset->tileA.dtype, n, &tname, NULL);
+
+        sprintf(s,"%s sum;\n", tname);
+        kgenAddStmt(ctx, s);
+    }
+
+    // FIXME: remove this kludge for backward compatibility
+    if (fctx == NULL) {
+        fctx = createFetchContext();
+        if (fctx == NULL) {
+            return -ENOMEM;
+        }
+        fetchOpts.mulOpts = mulOpts;
+    }
+    //////////////////////////////////////////////////////
+
+    foptlev = getFetchOptLevels(fctx);
+
+    // fetches of both the tiles are merged through a statement batch
+    if ((gset->flags & BGF_WHOLE_A) && internalFetchB &&
+        (foptlev & FOPTLEV_MERGE_FETCHES)) {
+
+        batch = createStmtBatch();
+        if (batch == NULL) {
+            ret = -ENOMEM;
+            goto out;
+        }
+    }
+
+    /*
+     * First, disable sharing internal variables of the fetch code for
+     * the first call so that the fetch generator could declare them for the
+     * first matrix. And then re-enable it when invoking the fetch for
+     * the other matrix if it has been actually enabled.
+     */
+
+    disableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB);
+
+    /*
+     * fetch elements of the matrix B, by rows or by columns depending on
+     * the transposing flag
+     */
+    if (internalFetchB) {
+        tile = &gset->tileBX;
+        fetchOpts.mrole = MATRIX_B;
+        fetchOpts.linesNum = trb ? tile->nrCols : tile->nrRows;
+        if (batch == NULL) {
+            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
+            if (!ret) {
+                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B);
+            }
+        }
+        else {
+            genFetchInputTileBatch(batch, fctx, gset, &fetchOpts);
+        }
+    }
+
+    fetchOpts.mrole = MATRIX_A;
+
+    if (foptlev & FOPTLEV_CAN_SHARE_TMP_AB) {
+        enableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB);
+    }
+
+    if (ret) {
+        goto out;
+    }
+
+    if (gset->flags & BGF_WHOLE_A) {
+        tile = &gset->tileA;
+        iend = (tra) ? tile->nrCols : tile->nrRows;
+        fetchOpts.linesNum = iend;
+        if (batch == NULL) {
+            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
+        }
+        else {
+            genFetchInputTileBatch(batch, fctx, gset, &fetchOpts);
+            ret = flushStmtBatch(ctx, batch);
+            // the post-fetch for B is deferred until the batch is flushed
+            if (!ret) {
+                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B);
+            }
+        }
+
+        if (!ret) {
+            ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A);
+        }
+        if (ret) {
+            goto out;
+        }
+
+        // main multiplying loop
+        for (i = 0; i < iend; i++) {
+            if (i) {
+                kgenAddBlankLine(ctx);
+            }
+            genMulLineOnTile(ctx, gset, mulOpts, i, true);
+        }
+    }
+    else {
+        iend = (unsigned int)((tra) ? subdims[1].bwidth : subdims[1].y);
+        fetchOpts.linesNum = 1;
+
+        // main multiplying loop
+        for (i = 0; i < iend; i++) {
+            if (i) {
+                kgenAddBlankLine(ctx);
+                revalidateFetchContext(fctx, MATRIX_A);
+            }
+            // fetch elements of matrix A from single row
+            fetchOpts.lineOffset = i;
+            // a fetch failure is reported just like in the whole A path
+            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
+            if (!ret) {
+                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A);
+            }
+            if (ret) {
+                goto out;
+            }
+            genMulLineOnTile(ctx, gset, mulOpts, i, false);
+        }
+    }
+
+    /*
+     * increment K-related coordinates or pointers depending on addressing
+     * mode
+     */
+    addrMode = getFetchAddrMode(fctx);
+    if (addrMode & FETCH_ADDR_K_RELATIVE) {
+        kgenAddBlankLine(ctx);
+        genPointerUpdate(ctx, varNames->A, varNames->lda, bwidth,
+                         subdims[0].y, vlenA, dtype, gset->flags,
+                         !tra, localA);
+
+        genPointerUpdate(ctx, varNames->B, varNames->ldb, bwidth,
+                         subdims[0].x, vlenB, dtype, gset->flags,
+                         trb, localB);
+    }
+    else {
+        if (incK && (varNames->k != NULL) && !(localA && localB)) {
+            sprintf(s, "\n%s += %lu;\n", varNames->k,
+                    (unsigned long)bwidth);
+            kgenAddStmt(ctx, s);
+        }
+    }
+
+    if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) {
+        kgenEndBranch(ctx, NULL); // k1 loop
+    }
+    ret = kgenAddStmt(ctx, "/* ---------------------- */\n");
+    ret = (ret) ? -EOVERFLOW : 0;
+
+out:
+    if (batch != NULL) {
+        destroyStmtBatch(batch);
+    }
+    // destroy only the context created here
+    if (fctx != mulOpts->fctx) {
+        destroyFetchContext(fctx);
+    }
+
+    return ret;
+}
diff --git a/src/library/blas/gens/trmm.c b/src/library/blas/gens/trmm.c
new file mode 100644
index 0000000..7655af3
--- /dev/null
+++ b/src/library/blas/gens/trmm.c
@@ -0,0 +1,1423 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Cached global buffers based trmm generator
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+
+#include "init.h"
+#include "blas_kgen.h"
+#include "blas_subgroup.h"
+#include "trxm_common.h"
+
+/*
+ * Extra data structure holding a single 'staggered' size.
+ * NOTE(review): presumably the same quantity as the 'staggered' argument
+ * of genSubgLoopsK() - confirm against the code filling the structure
+ */
+typedef struct {
+    size_t staggered;
+} MAY_ALIAS extraData_t;
+
+// File scope storage of the memory pattern extra information
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Forward declarations of the functions referred to by the 'blockSops'
+ * and 'subgSops' solver operation tables
+ */
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static SolverFlags
+solverFlags(void);
+
+static void fixupArgs( void *args,
+    SubproblemDim *subdims,
+    void *extra );
+
+static int
+blockGetPerf( unsigned int kflags,
+    const void *args );
+
+static int subgGetPerf( unsigned int kflags,
+    const void *args );
+
+static void subgCalcThreads( size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra );
+
+static int trmmGetDefaultDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs);
+
+static int trmmSubgGetDefaultDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs );
+
+static bool subgCheckCalcDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check );
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static bool
+blockCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check);
+
+/*
+ * Solver operations table using the block specific callbacks
+ * (blockGetPerf, blockCheckCalcDecomp). The initializer is positional,
+ * NULL marks an operation that is not provided.
+ * NOTE(review): presumably the counterpart of the subgroup pattern table -
+ * confirm against the SolverOps declaration and the pattern registration
+ */
+static SolverOps blockSops = {
+    generator,
+    assignKargs,
+    isFitToLDS,
+    blockGetPerf,
+    NULL,
+    NULL,
+    NULL,
+    solverFlags,
+    fixupArgs,
+    trmmGetDefaultDecomp,   // getDefaultDecomp
+    blockCheckCalcDecomp,
+    NULL,
+    NULL};
+
+// Solver options for subgroup pattern.
+// The initializer is positional, NULL marks an operation that is not
+// provided; unlike 'blockSops' no LDS fitting check is set, and
+// a thread calculation callback is supplied
+static SolverOps subgSops = {
+    generator,
+    assignKargs,
+    NULL,
+    subgGetPerf,
+    NULL,
+    subgCalcThreads,
+    NULL,
+    solverFlags,
+    fixupArgs,
+    trmmSubgGetDefaultDecomp,
+    subgCheckCalcDecomp,
+    NULL,
+    NULL};
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Set names of the kernel variables used by the generated code.
+ * Only the fields listed below are assigned.
+ */
+static void
+initKernelVarNames(KernelVarNames *kvars)
+{
+    // matrices
+    kvars->A = "(Ag)";
+    kvars->B = "(Bg)";
+    kvars->C = "C";
+
+    // leading dimensions; 'ldc' refers to the same variable as 'ldb'
+    kvars->lda = "lda";
+    kvars->ldb = "ldb";
+    kvars->ldc = "ldb";
+
+    // problem sizes; both K and M sizes refer to the variable 'M'
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "M";
+
+    // current coordinates
+    kvars->coordA = "coord.y";
+    kvars->coordB = "coord.x";
+    kvars->k = "coord.z";
+
+    kvars->alpha = "alpha";
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate initialization of the 'currM' kernel variable: 0 for an upper
+ * triangular matrix, otherwise the value of (M - 1) rounded down to
+ * a multiple of 'dim->y'. A blank line is added after the statement.
+ */
+static void
+genInitCurrM(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags)
+{
+    char tmp[1024];
+
+    if (isMatrixUpper(kflags)) {
+        strcpy(tmp, "currM = 0;\n");
+    }
+    else {
+        // the size is cast explicitly to match the '%lu' conversion
+        sprintf(tmp, "currM = (M - 1) / %lu * %lu;\n",
+                (unsigned long)dim->y, (unsigned long)dim->y);
+    }
+
+    kgenAddStmt(ctx, tmp);
+    kgenAddBlankLine(ctx);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate declaration of the 'kBegin' kernel variable, the starting
+ * position of the K loop:
+ *  - 0 for a lower triangular matrix (K loop - from 0 till diagonal);
+ *  - 'currM' for an upper triangular matrix (K loop - from diagonal till M),
+ *    rounded down to a multiple of 'dim->bwidth' if the subgroup mode is not
+ *    used and the KEXTRA_TAILS_M flag is set.
+ */
+static void
+genStartPosK(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags,
+    bool subgMode)
+{
+    char tmp[1024];
+
+    if (!isMatrixUpper(kflags)) {
+        strcpy(tmp, "uint kBegin = 0;\n");
+    }
+    else if (subgMode || !(kflags & KEXTRA_TAILS_M)) {
+        strcpy(tmp, "uint kBegin = currM;\n");
+    }
+    else {
+        // the size is cast explicitly to match the '%lu' conversion
+        sprintf(tmp, "uint kBegin = currM / %lu * %lu;\n",
+                (unsigned long)dim->bwidth, (unsigned long)dim->bwidth);
+    }
+
+    kgenAddStmt(ctx, tmp);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Zero the 'fetchNumA' counter in the first two elements of the post-fetch
+ * private data attached to the multiplier options.
+ */
+static void
+resetFetchNumA(TileMulOpts *mulOpts)
+{
+    TilePostFetchPrivate *priv =
+        (TilePostFetchPrivate *) mulOpts->postFetchPriv;
+    unsigned int i;
+
+    for (i = 0; i < 2; i++) {
+        priv[i].fetchNumA = 0;
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+// Set the K dimension of the tiles A and B with respect to transposition
+static void
+subgSetTilesDimK(BlasGenSettings *gset, size_t dimA, size_t dimB)
+{
+    if ( gset->tileA.trans ) {
+        gset->tileA.nrRows = dimA;
+    }
+    else {
+        gset->tileA.nrCols = dimA;
+    }
+    if ( gset->tileBX.trans ) {
+        gset->tileBX.nrRows = dimB;
+    }
+    else {
+        gset->tileBX.nrCols = dimB;
+    }
+}
+
+// Adjust tile and kextra settings for processing the diagonal block;
+// 'diagKExtra' must stay alive until subgLeaveDiagSettings() is called
+static void
+subgEnterDiagSettings(
+    BlasGenSettings *gset,
+    CLBLASKernExtra *diagKExtra,
+    size_t diagBw1)
+{
+    const CLBLASKernExtra *kextra = gset->kextra;
+
+    gset->subdims[1].bwidth = diagBw1;
+    subgSetTilesDimK(gset, diagBw1, diagBw1);
+
+    memcpy( diagKExtra, kextra, sizeof(CLBLASKernExtra) );
+    diagKExtra->vecLenA = diagBw1 < kextra->vecLenA?
+        diagBw1:
+        kextra->vecLenA;
+    diagKExtra->vecLenB = diagBw1 < kextra->vecLenB?
+        diagBw1:
+        kextra->vecLenB;
+    gset->kextra = (const CLBLASKernExtra*)diagKExtra;
+}
+
+// Restore tile and kextra settings to the original parameters
+static void
+subgLeaveDiagSettings(
+    BlasGenSettings *gset,
+    const CLBLASKernExtra *kextra,
+    size_t bw1,
+    size_t dimA,
+    size_t dimB)
+{
+    gset->subdims[1].bwidth = bw1;
+    subgSetTilesDimK(gset, dimA, dimB);
+    gset->kextra = kextra;
+}
+
+/*
+ * Generate K loops for the subgroup mode.
+ *
+ * For an upper triangular matrix the diagonal block is generated first
+ * (processed by item 0 of each subgroup with zeroing post-fetch), then
+ * the rectangle part with strided access and, if KEXTRA_TAILS_K_LOWER is
+ * set, the lower K tail. For a lower triangular matrix the rectangle part
+ * precedes the diagonal block.
+ *
+ * While the diagonal block is generated, 'gset' temporarily refers to
+ * narrowed tiles and to a local copy of the kernel extra information; the
+ * original settings are restored on every return path, since the copy does
+ * not outlive this function. For the same reason 'mulOpts->postFetchPriv'
+ * is restored after it has been pointed to local data. 'mulOpts->postFetch'
+ * is left as set by the last generated part.
+ *
+ * Returns 0 on success or the error reported by tileMulGen().
+ */
+static int
+genSubgLoopsK(
+    struct KgenContext *ctx,
+    BlasGenSettings *gset,
+    TileMulOpts *mulOpts,
+    SubgVarNames* pSubgVNames,
+    size_t staggered)
+{
+    char tmp[1024];
+    KernelExtraFlags kflags = gset->kextra->flags;
+    const size_t y0 = gset->subdims[0].y;
+    const size_t bw1 = gset->subdims[1].bwidth;
+    const size_t bw0 = gset->subdims[0].bwidth;
+
+    // bw, that will be used for diagonal block evaluation
+    size_t diagBw1 = getVecLen( gset, CLBLAS_TRMM, MATRIX_A );
+
+    // saving dimensions of tile A, that will be changed for
+    // diagonal block
+    size_t sDimA = gset->tileA.trans ?
+        gset->tileA.nrRows:
+        gset->tileA.nrCols;
+
+    size_t sDimB = gset->tileBX.trans ?
+        gset->tileBX.nrRows:
+        gset->tileBX.nrCols;
+
+    const CLBLASKernExtra* psKExtra = gset->kextra;
+    CLBLASKernExtra diagKExtra;
+    TilePostFetchPrivate postFPriv;
+    // private data set by the caller, restored after the K tail part
+    void *savedPriv = mulOpts->postFetchPriv;
+    int ret = 0;
+
+    kgenPrintf( ctx, "uint k0;\n" );
+    kgenPrintf( ctx, "uint kMax;\n" );
+
+    // upper triangle case
+    if (isMatrixUpper(kflags)) {
+
+        // diagonal part ------------------------------------------------------
+
+        subgEnterDiagSettings( gset, &diagKExtra, diagBw1 );
+
+        // Process the triangle block by the 0 item
+        // of each subgroup
+        kgenPrintf( ctx, "// k-coordinate of the end of diagonal block\n" );
+        kgenPrintf( ctx, "// calculated to be aligned to bw1\n");
+        kgenPrintf( ctx,
+            "kMax = kBegin + %lu + (%lu - %lu%%(kBegin+%lu));\n",
+            (unsigned long)y0,
+            (unsigned long)bw1,
+            (unsigned long)bw1,
+            (unsigned long)y0);
+
+        sprintf( tmp, "if( %s.x == 0 )", pSubgVNames->itemId );
+        kgenBeginBranch( ctx, tmp );
+
+        sprintf( tmp,
+            "for( k0=kBegin; (k0<kMax)&&(k0<M); k0+=%lu )",
+            (unsigned long)diagBw1 );
+        kgenBeginBranch( ctx, tmp );
+
+        kgenPrintf( ctx, "%s=k0;\n", gset->varNames.k );
+        mulOpts->postFetch = genTrxmPostFetchZero;
+        ret = tileMulGen( ctx, gset, mulOpts );
+        if( 0 != ret ){
+            // do not leave 'gset' referring to the local kextra copy
+            subgLeaveDiagSettings( gset, psKExtra, bw1, sDimA, sDimB );
+            return ret;
+        }
+
+        kgenEndBranch(ctx, NULL);// for()
+        kgenEndBranch(ctx, NULL);// if( itemId.x == 0 )
+
+        subgLeaveDiagSettings( gset, psKExtra, bw1, sDimA, sDimB );
+
+        // rectangle part -----------------------------------------------------
+        kgenAddBlankLine( ctx );
+        kgenPrintf( ctx, "k0 = kMax;\n" );
+        if ( kflags & KEXTRA_TAILS_K_LOWER ) {
+
+            kgenPrintf( ctx, "uint alignedK = M-(M%%%lu);\n",
+                (unsigned long)bw1 );
+        }
+        // strided access
+        sprintf(tmp,
+            "for ( k0 = k0+%s.x*%lu; k0 < %s; k0 += %lu )",
+            pSubgVNames->itemId,
+            (unsigned long)bw1,
+            ( kflags & KEXTRA_TAILS_K_LOWER )? "alignedK" : "M",
+            (unsigned long)bw0);
+
+        kgenBeginBranch(ctx, tmp);
+        // TODO: make staggered access operational with lower-K tails
+        /*kgenPrintf( ctx,
+            "%s = (kBegin+%d) + ( m0*64*(gid%%2) + k0 )%%(M-(kBegin+%d));\n",
+            gset->varNames.k,
+            diagW,
+            diagW); */
+        kgenPrintf( ctx, "%s = k0;\n", gset->varNames.k );
+
+        mulOpts->postFetch = NULL;
+        ret = tileMulGen(ctx, gset, mulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+        kgenEndBranch(ctx, NULL);
+
+        // rectangle tail part ------------------------------------------------
+
+        if ( kflags & KEXTRA_TAILS_K_LOWER ) {
+
+            kgenAddBlankLine( ctx );
+            kgenPrintf( ctx,
+                "// lower K tail is handled by item 0 of each subgroup\n");
+
+            sprintf(tmp, "if( (%s.x == 0)&&(kMax < M) )", pSubgVNames->itemId);
+            kgenBeginBranch( ctx, tmp );
+
+            kgenPrintf( ctx, "%s = alignedK;\n", gset->varNames.k );
+            postFPriv.fetchNumA = 0;
+            postFPriv.gset = gset;
+            mulOpts->postFetch = defaultTilePostFetch;
+            mulOpts->postFetchPriv = &postFPriv;
+
+            ret = tileMulGen( ctx, gset, mulOpts );
+            // 'postFPriv' is local, the options must not refer to it
+            // after returning
+            mulOpts->postFetchPriv = savedPriv;
+            if ( ret != 0 ) {
+                return ret;
+            }
+            kgenEndBranch( ctx, NULL );
+        }
+    }
+    // lower triangle case
+    else {
+
+        // rectangle part -----------------------------------------------------
+
+        kgenPrintf( ctx, "kMax = currM - currM%%%lu;\n", (unsigned long)bw1 );
+        // strided access, staggered access
+        sprintf( tmp,
+            "for( k0 = 0; k0 < kMax; k0 += %lu )",
+            (unsigned long)bw0 );
+        kgenBeginBranch( ctx, tmp );
+
+        // the sizes are printed as unsigned long; passing size_t for '%d'
+        // is undefined
+        kgenPrintf( ctx, "%s=(k0+%s.x*%lu+%lu*gid)%%kMax;\n",
+            gset->varNames.k,
+            pSubgVNames->itemId,
+            (unsigned long)bw1,
+            (unsigned long)(staggered/bw1*bw1) );
+
+        mulOpts->postFetch = NULL;
+        // part without diagonal elements post fetch zeroing
+        ret = tileMulGen(ctx, gset, mulOpts);
+        if (ret != 0) {
+            return ret;
+        }
+        kgenEndBranch( ctx, NULL );
+
+        // diagonal part ------------------------------------------------------
+
+        subgEnterDiagSettings( gset, &diagKExtra, diagBw1 );
+
+        // process the triangle block by the 0 item
+        // of each subgroup
+        sprintf( tmp, "if( %s.x == 0 )", pSubgVNames->itemId );
+        kgenBeginBranch( ctx, tmp );
+
+        sprintf( tmp,
+            "for( k0 = kMax; (k0 < currM+%lu)&&(k0 < M); k0 += %lu )",
+            (unsigned long)y0,
+            (unsigned long)diagBw1 );
+        kgenBeginBranch( ctx, tmp );
+
+        kgenPrintf( ctx, "%s=k0;\n", gset->varNames.k );
+        mulOpts->postFetch = genTrxmPostFetchZero;
+        resetFetchNumA(mulOpts);
+        ret = tileMulGen(ctx, gset, mulOpts);
+        if (ret != 0) {
+            // do not leave 'gset' referring to the local kextra copy
+            subgLeaveDiagSettings( gset, psKExtra, bw1, sDimA, sDimB );
+            return ret;
+        }
+        kgenEndBranch( ctx, NULL );// for()
+        kgenEndBranch( ctx, NULL );// if( itemId.x == 0 )
+
+        subgLeaveDiagSettings( gset, psKExtra, bw1, sDimA, sDimB );
+
+    }
+
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Generate the loops along K for the block (non-subgroup) TRMM kernel.
+ *
+ * The function emits the declaration of the loop counter k0, sets up the
+ * way matrices A and B are addressed, and then emits a sequence of for-loops,
+ * each wrapping one tileMulGen() body:
+ *   - upper-triangular case: a leading loop starting at kBegin with post-fetch
+ *     zeroing, the main loop without zeroing, and a trailing loop up to M
+ *     with post-fetch zeroing again;
+ *   - lower-triangular case: the main loop without zeroing followed by the
+ *     loop over the diagonal blocks with post-fetch zeroing.
+ *
+ * When none of the *_TAILS_*_LOWER flags is set, a fetch context with
+ * relative addressing for A, B and K is created and the matrices are
+ * accessed through the generated pointers "pA" and "pB"; otherwise
+ * BGF_UPTRS is added to gset->flags and the "Ag"/"Bg" unified pointers
+ * are declared instead.
+ *
+ * ctx      - generator context receiving the code
+ * gset     - generator settings; varNames.A/B or flags are updated here
+ * mulOpts  - tile multiplication options; postFetch and fctx are modified
+ * tmp      - caller-provided scratch buffer used to format statements
+ *
+ * Returns 0 on success, -ENOMEM if the fetch context cannot be created,
+ * or the non-zero code returned by a failed tileMulGen() call.
+ *
+ * The fetch context is owned by this function: it is released and
+ * mulOpts->fctx is reset to NULL on every path that leaves the function
+ * after the context has been created, including the error paths.
+ */
+static int
+genLoopsK(
+    struct KgenContext *ctx,
+    BlasGenSettings *gset,
+    TileMulOpts *mulOpts,
+    char *tmp)
+{
+    KernelExtraFlags kflags = gset->kextra->flags;
+    const size_t y0 = gset->subdims[0].y;
+    const size_t bwidth = gset->subdims[1].bwidth;
+    int ret = 0;
+    bool isRel = false;
+    const char *inTypeNameA, *inPtrNameA, *inTypeNameB, *inPtrNameB;
+
+    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenA, &inTypeNameA, &inPtrNameA);
+    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenB, &inTypeNameB, &inPtrNameB);
+
+    sprintf(tmp, "uint k0;\n");
+    kgenAddStmt(ctx, tmp);
+
+    if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER |
+                    KEXTRA_TAILS_K_LOWER))) {
+
+        FetchAddrMode addrMode = FETCH_ADDR_A_RELATIVE | FETCH_ADDR_B_RELATIVE |
+                                 FETCH_ADDR_K_RELATIVE;
+
+        mulOpts->fctx = createFetchContext();
+        if (mulOpts->fctx == NULL) {
+            // nothing has been acquired yet, so a plain return is safe
+            return -ENOMEM;
+        }
+        // from here on the context must be released before returning
+        isRel = true;
+        setFetchAddrMode(mulOpts->fctx, addrMode);
+
+        gset->varNames.A = "pA";
+        gset->varNames.B = "pB";
+    }
+    else {
+        gset->flags |= BGF_UPTRS;
+        kgenPrintf(ctx, "GPtr Ag, Bg;\n"
+                        "\n"
+                        "Ag.%s = A;\n"
+                        "Bg.%s = B;\n\n",
+                   inPtrNameA, inPtrNameB);
+    }
+
+    if (isMatrixUpper(kflags)) {
+        if (isRel) {
+            // bit 1: A is transposed; bit 0: UPPER_TRIANG xor COLUMN_MAJOR
+            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) |
+                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
+                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))
+                   ) {
+            case 0:
+                kgenPrintf(ctx,
+                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
+                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
+                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
+                break;
+            case 1:
+                kgenPrintf(ctx,
+                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
+                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
+                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
+                break;
+            case 2:
+                kgenPrintf(ctx,
+                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
+                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
+                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
+                break;
+            case 3:
+                kgenPrintf(ctx,
+                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
+                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
+                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
+                break;
+            }
+        }
+
+        // leading part starting at kBegin, fetched tiles are zeroed
+        // by the post-fetch callback
+        sprintf(tmp,
+            "for (k0 = kBegin; "
+                "(k0 <= (kBegin + %luu))&&(k0 < M); "
+                "k0 += %lu)",
+            y0,
+            bwidth);
+        kgenBeginBranch(ctx, tmp);
+
+        kgenPrintf( ctx,
+            "coord.z = k0;\n");
+
+        mulOpts->postFetch = genTrxmPostFetchZero;
+        ret = tileMulGen(ctx, gset, mulOpts);
+        if (ret != 0) {
+            goto out;
+        }
+        kgenEndBranch(ctx, NULL);
+
+        //main triangle part
+        sprintf(tmp,
+            "for (; k0 <= max(0, (int)M - %lu); k0 += %lu)",
+            y0,
+            gset->subdims[1].bwidth);
+
+        kgenBeginBranch(ctx, tmp);
+
+        mulOpts->postFetch = NULL;
+        ret = tileMulGen(ctx, gset, mulOpts);
+        if (ret != 0) {
+            goto out;
+        }
+        kgenEndBranch(ctx, NULL);
+
+        // matrix side part
+        // should be calculated by item0 of each subgroup
+        sprintf(tmp, "for (; k0 < M; k0 += %lu)", bwidth);
+        kgenBeginBranch(ctx, tmp);
+
+        kgenPrintf( ctx,
+            "coord.z = k0;\n");
+
+        resetFetchNumA(mulOpts);
+        mulOpts->postFetch = genTrxmPostFetchZero;
+        ret = tileMulGen(ctx, gset, mulOpts);
+        if (ret != 0) {
+            goto out;
+        }
+        kgenEndBranch(ctx, NULL);
+
+    }
+    else {
+        // lower
+        size_t diagBlocks; //Number of bw *y blocks that fit in y*y square
+
+        if (isRel) {
+            // bit 1: A is transposed; bit 0: UPPER_TRIANG xor COLUMN_MAJOR
+            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) |
+                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
+                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))
+                   ) {
+            case 0:
+                kgenPrintf(ctx,
+                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
+                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
+                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
+                break;
+            case 1:
+                kgenPrintf(ctx,
+                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
+                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
+                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
+                break;
+            case 2:
+                kgenPrintf(ctx,
+                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
+                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
+                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
+                break;
+            case 3:
+                kgenPrintf(ctx,
+                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
+                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
+                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
+                break;
+            }
+        }
+
+        diagBlocks = divRoundUp(y0, bwidth);
+        sprintf(tmp, "uint iterK = min(currM + %luu, M);\n", y0);
+        kgenAddStmt(ctx, tmp);
+        sprintf(tmp, "iterK = (iterK + %lu) / %lu;\n", bwidth - 1, bwidth);
+        kgenAddStmt(ctx, tmp);
+
+        // main triangle part
+        sprintf(tmp, "for (k0 = 0; k0 < max(0, (int)iterK - %lu); k0++)",
+                diagBlocks);
+        kgenBeginBranch(ctx, tmp);
+        mulOpts->postFetch = NULL;
+        // part without diagonal elements post fetch zeroing
+        ret = tileMulGen(ctx, gset, mulOpts);
+        if (ret != 0) {
+            goto out;
+        }
+        kgenEndBranch(ctx, NULL);
+
+        // diagonal part
+        sprintf(tmp, "for (; k0 < iterK; k0++)");
+        kgenBeginBranch(ctx, tmp);
+
+        kgenPrintf( ctx,
+            "coord.z = k0 * %lu;\n",
+            bwidth);
+
+        // diagonal blocks part
+        mulOpts->postFetch = genTrxmPostFetchZero;
+        resetFetchNumA(mulOpts);
+        ret = tileMulGen(ctx, gset, mulOpts);
+        if (ret != 0) {
+            goto out;
+        }
+        kgenEndBranch(ctx, NULL);
+    }
+
+out:
+    // single exit: the fetch context must not outlive this function,
+    // whether generation succeeded (ret == 0) or failed
+    if (isRel) {
+        destroyFetchContext(mulOpts->fctx);
+        mulOpts->fctx = NULL;
+    }
+
+    return ret;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Kernel source generator shared by the block and the subgroup TRMM
+ * patterns.
+ *
+ * The subgroup variant is selected when the block widths of the two
+ * decomposition levels differ (subdims[0].bwidth != subdims[1].bwidth).
+ * The generated kernel iterates over M in steps of subdims[0].y; for every
+ * step it zeroes the result tile, runs the loops along K (genSubgLoopsK()
+ * or genLoopsK()) and then updates the result matrix.
+ *
+ * buf, buflen - destination buffer for the kernel source and its size
+ * subdims     - two-level problem decomposition
+ * pgran       - work-group granularity
+ * extra       - CLBLASKernExtra describing the kernel to generate
+ *
+ * Returns the size of the generated source plus one on success,
+ * -ENOMEM if the generator context cannot be created, the error code of
+ * the K-loop generator if it fails, or -EOVERFLOW if the final statement
+ * could not be added to the context.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    char tmp[4096];
+    struct KgenContext *ctx;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    DataType dtype = kextra->dtype;
+    bool doubleBased = isDoubleBasedType(dtype);
+    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
+    int ret;
+    BlasGenSettings gset;
+    TileMulOpts mulOpts;
+    int tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A);
+    int trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B);
+    unsigned int l1Pans;
+    TilePostFetchPrivate pfPriv[2];
+    UpdateResultFlags upResFlags;
+    TailStatus tailStatus;
+    bool subgMode = false;
+    SubgVarNames subgVNames;
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        return -ENOMEM;
+    }
+
+    // mismatching subdims define case with subgroup decomposition
+    subgMode = ( subdims[0].bwidth != subdims[1].bwidth );
+
+    memset(&gset, 0, sizeof(gset));
+    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
+    gset.flags = BGF_DISTINCT_VECLEN;
+
+    gset.flags |= BGF_WHOLE_A;
+
+    /*FIXME: This used to be a workaround for compilation issues with dtrmm on
+     * cpu. Normally BGF_WHOLE_A should be enabled always. But for now,
+     * there are wrong results for non-aligned cases on CPU and there is
+     * no workaround yet.
+    if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K)) {
+        gset.flags &= ~BGF_WHOLE_A;
+    }*/
+    gset.kextra = kextra;
+    gset.pgran = pgran;
+    //avoid [0].bw loop
+    //gset.subdims[0].bwidth = gset.subdims[1].bwidth;
+
+    memset(pfPriv, 0, sizeof(pfPriv));
+    pfPriv[0].funcID = CLBLAS_TRMM;
+    pfPriv[0].gset = &gset;
+    if ((gset.flags & BGF_WHOLE_A) != 0) {
+        pfPriv[0].wholeA = 1;
+    }
+
+    // at first, generate needed declarations
+    kgenDeclareUptrs(ctx, doubleBased);
+
+    // For inner callback, because both callbacks use own fetchNumA
+    memcpy(&pfPriv[1], &pfPriv[0], sizeof(pfPriv[0]));
+
+    // if both matrices are accessed row-major - using subgroup pattern
+    if ( subgMode ) {
+
+        declareTrxmKernel(ctx,
+            dtype,
+            pgran,
+            kflags,
+            CLBLAS_TRMM,
+            "Subgroup",
+            true,
+            true);
+        gset.flags |= BGF_UPTRS;
+    }
+    else {
+
+        declareTrxmKernel(ctx,
+            dtype,
+            pgran,
+            kflags,
+            CLBLAS_TRMM,
+            "Block",
+            true,
+            true);
+
+    }
+    kgenBeginFuncBody(ctx);
+
+    initDefaultTiles(&gset, CLBLAS_TRMM, 0, PRIV_STORAGE_VARIABLE_SET);
+    declareTileStorages(ctx, &gset);
+
+    kgenAddStmt(ctx,
+                "uint currM, currN;\n"
+                "uint4 coord = 0; /* contains coordB, coordA, k */\n");
+
+    kgenDeclareLocalID(ctx, "lid", pgran);
+    kgenDeclareGroupID(ctx, "gid", pgran);
+
+    if ( subgMode ) {
+
+        gset.varNames.LDS = "scratch";
+
+        // declaring variables used by subgroup mode
+        subgVNames.itemId = "itemId";
+        subgVNames.subgCoord = "subgCoord";
+
+        kgenAddBlankLine( ctx );
+        kgenAddBlankLine(ctx);
+
+        kgenPrintf(ctx, "int2 %s;\n", subgVNames.itemId );
+        kgenPrintf(ctx, "int2 %s;\n", subgVNames.subgCoord);
+
+        // The ratios below are computed from the size_t fields of subdims;
+        // they are cast to int so that the argument type matches the
+        // "%d" conversion in the format strings.
+
+        // item ID
+        kgenPrintf( ctx,
+            "%s.x = get_local_id(0)%%%d;\n",
+            subgVNames.itemId,
+            (int)(subdims[0].bwidth/subdims[1].bwidth) );
+
+        // subgroup ID
+        kgenPrintf( ctx,
+            "%s.y = get_local_id(0)/%d;\n",
+            subgVNames.itemId,
+            (int)(subdims[0].bwidth/subdims[1].bwidth) );
+
+        // subgroup coordX
+        kgenPrintf( ctx,
+            "%s.x = %s.y/%d;\n",
+            subgVNames.subgCoord,
+            subgVNames.itemId,
+            (int)(subdims[0].y/subdims[1].y) );
+
+        // subgroup coordY
+        kgenPrintf( ctx,
+            "%s.y = %s.y%%%d;\n",
+            subgVNames.subgCoord,
+            subgVNames.itemId,
+            (int)(subdims[0].y/subdims[1].y) );
+    }
+
+    kgenAddBlankLine(ctx);
+
+    sprintf(tmp, "currN = gid * %lu;\n", subdims->x);
+    kgenAddStmt(ctx, tmp);
+    genInitCurrM(ctx, subdims, kflags);
+
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        kgenAddStmt(ctx, "A += offA;\n");
+    }
+    genTrxmBMatrShift(ctx, kflags, true);
+
+    if ( subgMode ) {
+        kgenAddStmt(ctx,
+            "GPtr Ag = {A};\n"
+            "GPtr Bg = {B};\n");
+    }
+
+    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
+
+    memset(&mulOpts, 0, sizeof(mulOpts));
+    mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0)
+            ? TILEMUL_MAD
+            : TILEMUL_MULADD;
+    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
+    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
+    mulOpts.postFetch = NULL;
+    mulOpts.postFetchPriv = &pfPriv;
+    mulOpts.flags = TILEMUL_NO_FLAGS;
+    mulOpts.flags |= TILEMUL_EXTERN_RDECL;
+
+    if ( subgMode ) {
+
+        mulOpts.flags |= TILEMUL_NOT_INC_K;
+        mulOpts.flags |= TILEMUL_BW_STRIDE;
+    }
+
+    if (kflags & KEXTRA_TAILS_M_LOWER) {
+        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
+    }
+    if (kflags & KEXTRA_TAILS_N_LOWER) {
+        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B;
+    }
+    if (kflags & KEXTRA_TAILS_K_LOWER) {
+        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
+        mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL;
+    }
+
+    if (tra) {
+        mulOpts.flags |= TILEMUL_TRA;
+    }
+    if (!trb) {
+        mulOpts.flags |= TILEMUL_TRB;
+    }
+    if (isMatrixConj(kflags, MATRIX_A)) {
+        mulOpts.flags |= TILEMUL_CONJA;
+    }
+    if (isMatrixConj(kflags, MATRIX_B)) {
+        mulOpts.flags |= TILEMUL_CONJB;
+    }
+
+    initKernelVarNames(&gset.varNames);
+
+    if ( subgMode ) {
+
+        kgenPrintf( ctx,
+            "coord.x = currN + %s.x*%d;\n",
+            subgVNames.subgCoord,
+            (int)subdims[1].x );
+    }
+    else {
+
+        sprintf(tmp, "coord.x = currN + lid %% %u * %lu;\n", l1Pans, subdims[1].x);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    // loop over M
+    sprintf(tmp, "for (uint m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
+    kgenBeginBranch(ctx, tmp);
+
+    genStartPosK( ctx, subdims, kflags, subgMode );
+
+    sprintf(tmp, "coord.z = kBegin;\n");
+    kgenAddStmt(ctx, tmp);
+
+    if ( subgMode ) {
+
+        kgenPrintf(ctx,
+            "coord.y = currM + %s.y*%d;\n",
+            subgVNames.subgCoord,
+            (int)subdims[1].y);
+    }
+    else {
+
+        sprintf( tmp,
+            "coord.y = currM + lid / %u * %lu;\n",
+            l1Pans,
+            subdims[1].y );
+        kgenAddStmt(ctx, tmp);
+    }
+
+    genZeroTile(ctx, &gset.tileCY);
+
+    checkGenBeginHitMatrixBlock(ctx, kflags);
+    tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRMM, &gset, NULL);
+
+    // loops along 'K'
+    if ( subgMode ) {
+        ret = genSubgLoopsK( ctx, &gset, &mulOpts, &subgVNames, staggered);
+    }
+    else {
+        ret = genLoopsK( ctx, &gset, &mulOpts, tmp );
+    }
+
+    if (ret != 0) {
+        // dump what has been generated so far for diagnostics, then
+        // release the context so that the failure path does not leak it
+        printf("%s", buf);
+        destroyKgenContext(ctx);
+        return ret;
+    }
+
+    checkGenEndHitMatrixBlock(ctx, kflags);
+    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
+
+    // store results
+    // for result update - x coordinate is in elements, not in vectors
+
+    checkGenRestoreTailCoords(ctx, &gset, tailStatus);
+    upResFlags = kextraToUpresFlags(CLBLAS_TRMM, kflags);
+    upResFlags |= tailStatusToUpresFlags(tailStatus);
+    upResFlags |= UPRES_INDEXING_WITH_CONSTANTS;
+    upResFlags |= UPRES_TRIANG_WRITE_C;
+    upResFlags |= UPRES_EXCEED_PROBLEM_CONDITION;
+
+    if ( subgMode ) {
+
+        mergeUpdateResult( ctx,
+            CLBLAS_TRMM,
+            &gset,
+            &subgVNames,
+            upResFlags,
+            genResultUpdateWithFlags );
+    }
+    else {
+
+        //checkGenBeginHitMatrixBlock(ctx, kflags);
+        genResultUpdateWithFlags( ctx,
+            CLBLAS_TRMM,
+            &gset,
+            upResFlags,
+            NULL,
+            NULL,
+            NULL );
+        //checkGenEndHitMatrixBlock(ctx, kflags);
+    }
+
+    // advance to the next panel along M; the direction depends on
+    // the triangle being processed
+    if (isMatrixUpper(kflags)) {
+        sprintf(tmp, "currM += %lu;\n", subdims[0].y);
+    }
+    else {
+        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    kgenEndBranch(ctx, NULL);
+
+    kgenEndFuncBody(ctx);
+    ret = kgenAddBlankLine(ctx);
+
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Fill the kernel argument list for the TRMM kernel.
+ *
+ * Slots 0..7 are always populated: M, N, alpha, A, lda, B, B again (the
+ * kernel sees it as C) and ldb. The offsets of A and B follow, each one
+ * only when the corresponding KEXTRA_*_OFF_NOT_ZERO flag is set.
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)params;
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    KernelArg *next;
+
+    initSizeKarg(&args[0], kargs->M);
+    initSizeKarg(&args[1], kargs->N);
+    assignScalarKarg(&args[2], &(kargs->alpha), kargs->dtype);
+    initMemobjKarg(&args[3], kargs->A, NULL, 0, 0);
+    initSizeKarg(&args[4], kargs->lda.matrix);
+    initMemobjKarg(&args[5], kargs->B, NULL, 0, 0);
+    // the output matrix C of the kernel aliases B
+    initMemobjKarg(&args[6], kargs->B, NULL, 0, 0);
+    initSizeKarg(&args[7], kargs->ldb.matrix);
+
+    // optional trailing arguments start right after the fixed ones
+    next = &args[8];
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        initSizeKarg(next, kargs->offA);
+        next++;
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        initSizeKarg(next, kargs->offBX);
+    }
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * LDS fitting callback of the pattern.
+ *
+ * The kernels of this pattern do not use LDS, so any decomposition fits
+ * and none of the arguments is inspected.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    /* silence "unused parameter" diagnostics */
+    (void)kernelArgs;
+    (void)ldsSize;
+    (void)dtype;
+    (void)dim;
+
+    return true;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Report the solver flags of the pattern: only SF_WSPACE_1D is set.
+ */
+static SolverFlags
+solverFlags(void)
+{
+    const SolverFlags flags = SF_WSPACE_1D;
+
+    return flags;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Adjust kernel arguments before the kernel is launched.
+ *
+ * Stores the "staggered" value in the solver private area of the kernel
+ * extra data and then applies the common TRxM argument fixup.
+ *
+ * The staggered value is fixed at 64 divided by the per-type factor below.
+ * An earlier heuristic derived it from K and the level-1 block width, but
+ * its result was always overwritten by the fixed value, so that dead
+ * computation has been dropped; the value that is stored is unchanged.
+ *
+ * args    - CLBlasKargs to fix up
+ * subdims - problem decomposition (currently not used)
+ * extra   - CLBLASKernExtra whose solverPriv receives the staggered value
+ */
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBlasKargs *kargs = (CLBlasKargs*)args;
+    extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv;
+
+    // per-dtype factor, indexed by kargs->dtype
+    // NOTE(review): presumably the element size in 4-byte units for
+    // float, double, complex float and complex double - confirm
+    const size_t sizeType[] = {1,2,2,4};
+
+    (void)subdims;
+
+    // fixed, not calculated; 64 is !!!DEVICE DEPENDENT!!!
+    extraData->staggered = 64 / sizeType[kargs->dtype];
+
+    fixupTrxmKargs(kargs);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Check or calculate the decomposition for the block pattern.
+ *
+ * With check == PGRAN_CHECK the passed decomposition is validated: it must
+ * pass decompSanityCheck() with type dependent size limits, both levels
+ * must have the same block width, and the work group must have 64 items.
+ * Otherwise the granularity is calculated with calcPgranDedicated() and
+ * true is returned.
+ */
+static bool
+blockCheckCalcDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check)
+{
+    unsigned int lowest, highest;
+
+    DUMMY_ARG_USAGE(subdimsNum);
+
+    if (check != PGRAN_CHECK) {
+        calcPgranDedicated(pgran, subdims, -1, 3);
+        return true;
+    }
+
+    // complex double gets the tighter size limits
+    if (dtype == TYPE_COMPLEX_DOUBLE) {
+        highest = 4;
+        lowest = 1;
+    }
+    else {
+        highest = 8;
+        lowest = 2;
+    }
+
+    return decompSanityCheck(subdims, lowest, highest, 24, dtype, true) &&
+           (subdims[0].bwidth == subdims[1].bwidth) &&
+           (pgran->wgSize[0] == 64);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Initialize the memory pattern descriptor of the block TRMM pattern.
+ *
+ * The descriptor refers to the file-level mpatExtra object, which is
+ * filled in here as well.
+ */
+void
+initTrmmCachedBlockPattern(MemoryPattern *mempat)
+{
+    // extra information: both matrices are buffers cached in L1 and L2
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_L2;
+
+    mempat->name = "Cached global memory based trmm";
+    mempat->sops = &blockSops;
+    mempat->extra = &mpatExtra;
+
+    // two decomposition levels
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Initialize the memory pattern descriptor of the subgroup TRMM pattern.
+ *
+ * The descriptor refers to the file-level mpatExtra object, which is
+ * filled in here as well.
+ */
+void
+initTrmmCachedSubgroupPattern(MemoryPattern *mempat)
+{
+    // extra information: both matrices are buffers cached in L1 and L2
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_L2;
+
+    mempat->name = "Cached global memory based subgroup trmm";
+    mempat->sops = &subgSops;
+    mempat->extra = &mpatExtra;
+
+    // two decomposition levels
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Performance estimate of the block pattern.
+ *
+ * Returns PPERF_AVERAGE when neither A nor B is accessed column-major,
+ * PPERF_GOOD otherwise.
+ */
+static int
+blockGetPerf( unsigned int kflags,
+    const void *args )
+{
+    DUMMY_ARG_USAGE(args);
+
+    if( isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_A ) ||
+        isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_B ) ){
+
+        return PPERF_GOOD;
+    }
+
+    return PPERF_AVERAGE;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Performance estimate of the subgroup pattern.
+ *
+ * Returns PPERF_GOOD when neither A nor B is accessed column-major;
+ * every other case is reported as PPERF_NOT_SUPPORTED.
+ */
+static int
+subgGetPerf( unsigned int kflags,
+    const void *args )
+{
+    DUMMY_ARG_USAGE(args);
+
+    if( isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_A ) ||
+        isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_B ) ){
+
+        return PPERF_NOT_SUPPORTED;
+    }
+
+    return PPERF_GOOD;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Calculate the global work size for the subgroup pattern.
+ *
+ * One group of 64 work items is launched for every subdims[0].x wide
+ * panel of B, plus one more group when a tail panel remains.
+ * threads[1] is set to 0.
+ *
+ * If any of the pointer arguments subdims, pgran, args or extra is NULL,
+ * the function returns without touching threads.
+ */
+static void
+subgCalcThreads( size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra )
+{
+    const CLBLASKernExtra *kextra;
+    const CLBlasKargs *kargs;
+    size_t groups;
+
+    //EINVAL
+    if ( NULL == subdims || NULL == pgran || NULL == args || NULL == extra ) {
+        return;
+    }
+
+    kextra = (const CLBLASKernExtra*)extra;
+    kargs = (const CLBlasKargs*)args;
+
+    // if side is right the dimensions outside kernel are swapped
+    // A is NxN and B is MxN
+    // inside kernel A is still MxM
+    if ( kextra->flags & KEXTRA_SIDE_RIGHT ) {
+        groups = kargs->M/subdims[0].x;
+        // B tail group
+        if ( kargs->M%subdims[0].x ) {
+            groups++;
+        }
+    }
+    else {
+        groups = kargs->N/subdims[0].x;
+        // B tail group
+        if ( kargs->N%subdims[0].x ) {
+            groups++;
+        }
+    }
+
+    // 64 work items per group, pgran->wgSize[0] is not consulted
+    threads[0] = groups * 64;
+    threads[1] = 0;
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+ * Provide the default decomposition for the block TRMM pattern.
+ *
+ * Level 1 is an 8x8 tile with block width 2; level 0 is 32 wide and
+ * 128 high with the same block width. The work group is one-dimensional
+ * with 64 items.
+ *
+ * pgran      - receives the work group granularity
+ * subdims    - receives the two decomposition levels
+ * subdimsNum - not used
+ * pArgs      - kernel arguments; only checked against NULL
+ *
+ * Returns 0 on success or -EINVAL when pArgs is NULL.
+ */
+static int trmmGetDefaultDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs)
+{
+    // discard the unused argument; the former "(void*)subdimsNum" cast
+    // converted the integer to a pointer instead of discarding it
+    (void)subdimsNum;
+
+    if ( NULL == pArgs ) {
+        return -EINVAL;
+    }
+
+    subdims[1].bwidth = 2;
+    subdims[1].x = subdims[1].itemX = 8;
+    subdims[1].y = subdims[1].itemY = 8;
+
+    subdims[0].bwidth = 2;
+    subdims[0].x = subdims[0].itemX = 32;
+    subdims[0].y = 128;
+    // NOTE(review): -1 presumably marks the dimension as unused - confirm
+    subdims[0].itemY = -1;
+
+    pgran->wgDim = 1;
+    pgran->wgSize[0] = 64;
+    pgran->wgSize[1] = 1;
+
+    return 0;
+}
+
+/*
+ * Provide the default decomposition for the subgroup TRMM pattern.
+ *
+ * Every work item handles a 4x4 tile. The item block width starts at 8
+ * and is halved for complex types and halved again for double based
+ * types. A subgroup consists of 4 items, and a work group holds 8
+ * subgroups along A and 2 along B, i.e. 64 work items in one dimension.
+ *
+ * Returns 0 on success or -EINVAL when pArgs is NULL.
+ */
+static int trmmSubgGetDefaultDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void *pArgs)
+{
+    const int itemsInSubg = 4;
+    const int subgsAlongA = 8;
+    const int subgsAlongB = 2;
+    const int tileX = 4;
+    const int tileY = 4;
+    int itemBw = 8;
+    CLBlasKargs *kargs;
+
+    DUMMY_ARG_USAGE(subdimsNum);
+
+    if ( NULL == pArgs ) {
+        return -EINVAL;
+    }
+    kargs = (CLBlasKargs *)pArgs;
+
+    // wider element types get a narrower block
+    if( isComplexType(kargs->dtype) ){
+        itemBw = itemBw / 2;
+    }
+    if( isDoubleBasedType(kargs->dtype) ){
+        itemBw = itemBw / 2;
+    }
+
+    // work item level
+    subdims[1].bwidth = itemBw;
+    subdims[1].x = subdims[1].itemX = tileX;
+    subdims[1].y = subdims[1].itemY = tileY;
+
+    // work group level
+    subdims[0].bwidth = itemBw * itemsInSubg;
+    subdims[0].itemX = tileX * subgsAlongB;
+    subdims[0].x = tileX * subgsAlongB;
+    subdims[0].itemY = tileY * subgsAlongA;
+    subdims[0].y = tileY * subgsAlongA;
+
+    pgran->wgDim = 1;
+    pgran->wgSize[0] = 64;
+    pgran->wgSize[1] = 1;
+
+    return 0;
+}
+
+//-----------------------------------------------------------------------------
+// TODO: reimplement via new validation API
+/*
+ * Check or calculate the decomposition for the subgroup pattern.
+ *
+ * The decomposition is rejected (false is returned) when
+ *   - any dimension of either level is zero;
+ *   - a subgroup has fewer than 4 items;
+ *   - an item tile dimension or the item block width is below 4 or
+ *     above 16 (for complex types the doubled block width must not
+ *     exceed 16 either);
+ *   - x/y of the item level differ from itemX/itemY;
+ *   - the group block is not a whole number of item blocks;
+ *   - the estimated register usage reaches 64 registers of 16 bytes.
+ *
+ * With check == PGRAN_CHECK the passed granularity must additionally be
+ * one-dimensional with exactly 64 work items matching the decomposition.
+ * Otherwise pgran->wgDim and pgran->wgSize[0] are calculated from it.
+ */
+static bool
+subgCheckCalcDecomp( PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check )
+{
+    const SubproblemDim *grp = &subdims[0];
+    const SubproblemDim *itm = &subdims[1];
+    unsigned int subgsAlongA;
+    unsigned int subgsAlongB;
+    unsigned int itemsInSubg;
+    unsigned int regs;
+
+    DUMMY_ARG_USAGE(subdimsNum);
+
+    // zero sizes would make the ratios below meaningless
+    if( 0 == grp->x || 0 == grp->y || 0 == grp->bwidth ||
+        0 == itm->x || 0 == itm->y || 0 == itm->bwidth ){
+        return false;
+    }
+
+    subgsAlongA = (unsigned int)(grp->y/itm->y);
+    subgsAlongB = (unsigned int)(grp->x/itm->x);
+    itemsInSubg = (unsigned int)(grp->bwidth/itm->bwidth);
+
+    if( itemsInSubg < 4 ){
+        return false;
+    }
+
+    // lower bounds of the item block
+    if( itm->y < 4 || itm->x < 4 || itm->bwidth < 4 ){
+        return false;
+    }
+
+    if( itm->x != itm->itemX || itm->y != itm->itemY ){
+        return false;
+    }
+
+    // the group block must consist of integer number of subgroup blocks
+    if( (grp->x % itm->x) || (grp->y % itm->y) ||
+        (grp->bwidth % itm->bwidth) ){
+        return false;
+    }
+
+    //check fitting of bw to common vector sizes
+    if( isComplexType(dtype) && (2*itm->bwidth > 16) ){
+        return false;
+    }
+
+    // upper bounds of the item block
+    if( itm->bwidth > 16 || itm->x > 16 || itm->y > 16 ){
+        return false;
+    }
+
+    // estimate register usage, drop
+    // inevitably slowed decompositions
+    regs = (unsigned int)(
+        (   itm->bwidth * itm->x +
+            itm->bwidth * itm->y +
+            itm->x * itm->y ) *
+        dtypeSize(dtype) );
+    regs = regs / 16; // 16 bytes per register
+    if( regs >= 64 ){
+        return false;
+    }
+
+    if( PGRAN_CHECK != check ){
+        // PGranularity should be calculated
+        pgran->wgDim = 1;
+        pgran->wgSize[0] = subgsAlongA * subgsAlongB * itemsInSubg;
+        return true;
+    }
+
+    // passed PGranularity should be checked
+    if( pgran->wgDim != 1 ){
+        return false;
+    }
+    if( pgran->wgSize[0] != 64 ){
+        return false;
+    }
+    if( pgran->wgSize[0] != subgsAlongA*subgsAlongB*itemsInSubg ){
+        return false;
+    }
+
+    return true;
+}
diff --git a/src/library/blas/gens/trmv_reg.cpp b/src/library/blas/gens/trmv_reg.cpp
new file mode 100644
index 0000000..28ee1f2
--- /dev/null
+++ b/src/library/blas/gens/trmv_reg.cpp
@@ -0,0 +1,490 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * trmv generator
+ */
+//#define DEBUG_TRMV
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include "blas_kgen.h"
+#include <kprintf.hpp>
+#include <trmv.clT>
+#include <solution_seq.h>
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4];
+
+/*
+ * Solver-flags callback of the TRMV register pattern.
+ * Always reports SF_WSPACE_1D and nothing else.
+ */
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_TRMV
+    printf("solverFlags callen......\n");
+#endif
+
+    return SF_WSPACE_1D;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void* extra );
+
+extern "C"
+void initTrmvRegisterPattern(MemoryPattern *mempat);
+
+static  KernelExtraFlags
+selectVectorization(
+    void *kargs,
+    unsigned int vlen );
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+/*
+ * Callback table registered for the TRMV register pattern
+ * (installed into MemoryPattern::sops by initTrmvRegisterPattern()).
+ * The initializer is positional; NULL entries are callbacks this
+ * solver does not provide.
+ */
+static SolverOps trmvOps = {
+    generator,              // kernel source generator
+    assignKargs,            // kernel argument setup
+    isFitToLDS,             // local memory fit check
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,          // global work size calculation
+    NULL,
+    solverFlags,
+	NULL,                   // (trsm.c fills this slot with fixupArgs)
+	NULL,                   // (trsm.c labels this slot getDefaultDecomp)
+	NULL,                   // (trsm.c fills this slot with checkCalcDecompDedicated)
+	setBuildOpts,           // extra compiler options
+	selectVectorization     // vectorization related kernel flags
+};
+
+/*
+ * Pick the extra kernel flags that control vectorized access to matrix A.
+ *
+ * args - kernel arguments, interpreted as CLBlasKargs
+ * vlen - vector length to test the matrix order N against
+ *
+ * Returns KEXTRA_NO_COPY_VEC_A when
+ *   - the matrix is lower/column-major or upper/row-major and N is not
+ *     a multiple of vlen, or
+ *   - the call originates from a packed routine (TPMV, HPMV, SPMV);
+ * returns KEXTRA_NO_FLAGS otherwise.
+ */
+static  KernelExtraFlags
+selectVectorization(
+    void *args,
+    unsigned int vlen )
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const bool lowerColMajor =
+        (kargs->uplo == clblasLower) && (kargs->order == clblasColumnMajor);
+    const bool upperRowMajor =
+        (kargs->uplo == clblasUpper) && (kargs->order == clblasRowMajor);
+    KernelExtraFlags kflags = KEXTRA_NO_FLAGS;
+
+    // The remainder is evaluated only for the two layouts above
+    if ((lowerColMajor || upperRowMajor) && ((kargs->N) % vlen)) {
+        kflags = KEXTRA_NO_COPY_VEC_A;
+    }
+
+    // Packed-case never do aligned access
+    if (kargs->pigFuncID == CLBLAS_TPMV ||
+        kargs->pigFuncID == CLBLAS_HPMV ||
+        kargs->pigFuncID == CLBLAS_SPMV) {
+
+        kflags = KEXTRA_NO_COPY_VEC_A;
+    }
+
+    return kflags;
+}
+
+/*
+ * Append the preprocessor definitions needed by the TRMV kernel template
+ * to the build option string.
+ *
+ * buildOptStr - NUL-terminated option string, extended in place with
+ *               strcat(); the caller must provide enough room
+ * args        - really a SolutionStep; its 'args' member holds the
+ *               CLBlasKargs of the call
+ *
+ * Definitions are appended in this order:
+ *   -DDOUBLE_PRECISION  for double and double-complex data
+ *   -DHEMV_ONLY         for HEMV steps and for HPMV/SPMV callers
+ *   -DSPMV_ONLY         for SPMV callers
+ *   -DPACKED            for TPMV/HPMV/SPMV callers
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *step = (const SolutionStep *)args;
+    const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
+
+    const bool isDouble = (kargs->dtype == TYPE_DOUBLE) ||
+                          (kargs->dtype == TYPE_COMPLEX_DOUBLE);
+    const bool fromTpmv = (kargs->pigFuncID == CLBLAS_TPMV);
+    const bool fromHpmv = (kargs->pigFuncID == CLBLAS_HPMV);
+    const bool fromSpmv = (kargs->pigFuncID == CLBLAS_SPMV);
+
+    if (isDouble) {
+        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_TRMV
+        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if ((step->funcID == CLBLAS_HEMV) || fromHpmv || fromSpmv) {
+        strcat( buildOptStr, " -DHEMV_ONLY ");
+    }
+
+    if (fromSpmv) {
+        strcat( buildOptStr, " -DSPMV_ONLY ");
+    }
+
+    if (fromTpmv || fromHpmv || fromSpmv) {
+        strcat( buildOptStr, " -DPACKED ");
+    }
+}
+
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Fill in the memory pattern descriptor of the register based TRMV
+ * solver: name, decomposition levels, callback table and the
+ * pattern specific extra information. Also initializes the
+ * data type -> BLAS prefix letter table used by the generator.
+ */
+extern "C"
+void initTrmvRegisterPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_TRMV
+    printf("initTRMVREgPattern called with mempat = 0x%p\n", mempat);
+#endif
+    fflush(stdout);
+
+    // Prefix letter for each data type
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    // Memory levels and object kinds used for the operands
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; // For "x" vector
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+
+    // The pattern descriptor itself
+    mempat->name = "Register accumulation based trmv";
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->sops = &trmvOps;
+    mempat->extra = &mpatExtra;
+}
+
+/*
+ * Compute the global work size of the TRMV kernel.
+ *
+ * threads - output; threads[0] receives the total number of work items,
+ *           threads[1] is always set to 1
+ * subdims - subproblem dimensions; only subdims->y is used
+ * pgran   - work group granularity
+ * args    - really a CLBlasKargs; supplies the matrix order N
+ * _extra  - really a CLBLASKernExtra; supplies flags and vecLenA
+ *
+ * Every work group handles TARGETROWS rows, so ceil(N / TARGETROWS)
+ * groups of BLOCKSIZE work items are requested.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    // NOTE(review): generator() takes the block size from wgSize[0] alone;
+    // the two agree only while wgSize[1] == 1 -- confirm against callers.
+    int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const CLBLASKernExtra *extra = (const CLBLASKernExtra *)_extra;
+
+#ifdef DEBUG_TRMV
+    printf("calcNrThreads called from TRMV_Reg.c\n");
+#endif
+
+    clblasTranspose trans = ( extra->flags & KEXTRA_TRANS_A) ? clblasTrans :
+                                (( extra->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
+
+    // unity and doConj handled in setKernelArgs
+    // A row-major matrix is processed as a column-major one with the
+    // opposite transposition; both transposed kinds map to "no transpose".
+    if ( !(extra->flags & KEXTRA_COLUMN_MAJOR) )
+    {
+        trans = ( trans == clblasNoTrans ) ? clblasTrans : clblasNoTrans;
+    }
+
+    size_t TARGETROWS = (trans == clblasNoTrans) ? subdims->y : BLOCKSIZE/(subdims->y/extra->vecLenA);
+
+#ifdef DEBUG_TRMV
+    printf("kargs-> N : %lu, TARGETROWS: %lu\n",
+           (unsigned long)kargs->N, (unsigned long)TARGETROWS);
+#endif
+
+    // Number of work groups: N / TARGETROWS rounded up
+    size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1;
+#ifdef DEBUG_TRMV
+    printf("blocks : %lu\n", (unsigned long)blocks);
+#endif
+
+    threads[0] = blocks * BLOCKSIZE;
+#ifdef DEBUG_TRMV
+    printf("pgran-wgSize[0] : %u, globalthreads[0]  : %lu\n",
+           (unsigned int)pgran->wgSize[0], (unsigned long)threads[0]);
+#endif
+    threads[1] = 1;
+}
+
+//
+// FIXME: Report correct return value - Needs change in KPRINTF
+//
+/*
+ * Generate the OpenCL source of a TRMV kernel.
+ *
+ * buf     - destination buffer; when NULL nothing is generated and the
+ *           required buffer size is returned
+ * buflen  - size of 'buf'; not consulted when 'buf' is non-NULL
+ * subdims - subproblem dimensions; subdims->y selects the rows per group
+ * pgran   - work group granularity; wgSize[0] is the block size
+ * extra   - really a CLBLASKernExtra (flags, dtype, vecLenA)
+ *
+ * Returns the fixed kernel buffer size (64 KiB) on success and 0 when the
+ * decomposition is rejected (a warning is printed in that case).
+ *
+ * One of four templates is selected by (transposition, uplo) after a
+ * row-major matrix has been mapped onto the column-major case; the
+ * %TARGET_ROWS and %BLOCKSIZE placeholders are then substituted.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    const size_t kernelBufSize = 64 * 1024 * sizeof(char);
+    size_t BLOCKSIZE  = pgran->wgSize[0];
+    size_t TARGETROWS = 0;
+    char tempTemplate[32*1024];
+    // Room for any unsigned long in decimal notation plus the terminator
+    char targetRows[24], blockSize[24];
+    const char *kernelTemplate = NULL;
+
+    (void)buflen;
+
+    if ( buf == NULL) // return buffer size
+    {
+        return (ssize_t)kernelBufSize;
+    }
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+
+#ifdef DEBUG_TRMV
+    printf("TRMV GENERATOR called....\n");
+    if ((extraFlags->flags & KEXTRA_TRANS_A) || (extraFlags->flags & KEXTRA_CONJUGATE_A))
+    {
+        printf("A is trans or CONJ-TRANS\n");
+    }
+    else
+    {
+        printf("A is noTrans...\n");
+    }
+#endif
+
+    clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
+    clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans :
+                                (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
+
+    // unity and doConj handled in setKernelArgs
+    // Row-major input is handled as column-major with the opposite
+    // transposition and the opposite triangle.
+    if ( !(extraFlags->flags & KEXTRA_COLUMN_MAJOR) )
+    {
+        trans = ( trans == clblasNoTrans ) ? clblasTrans : clblasNoTrans;
+        uplo  = ( uplo == clblasUpper ) ? clblasLower : clblasUpper;
+    }
+
+    if ((subdims->y % extraFlags->vecLenA) != 0)
+    {
+        printf("WARNING: TRMV: generator: TARGETROWS must be divisible by Vector Length\n");
+        return 0;
+    }
+
+    if ( trans == clblasNoTrans)
+    {
+#ifdef DEBUG_TRMV
+        printf("clblasNoTrans....%s\n", ( uplo == clblasLower )?"LOWER":"UPPER");
+#endif
+
+        kernelTemplate = ( uplo == clblasLower ) ? (const char*)trmv_CL_kernel
+                                                 : (const char*)trmv_CU_kernel;
+
+        TARGETROWS = subdims->y;
+        if ((BLOCKSIZE % TARGETROWS) != 0)
+        {
+            printf("WARNING: TRMV: generator: Invalid Block Size\n");
+            return 0;
+        }
+    }
+    else // Transpose cases...
+    {
+#ifdef DEBUG_TRMV
+        printf("clblasTrans....%s\n", ( uplo == clblasLower )?"LOWER":"UPPER");
+#endif
+
+        kernelTemplate = ( uplo == clblasLower ) ? (const char*)trmv_CLT_kernel
+                                                 : (const char*)trmv_CUT_kernel;
+
+        if ((BLOCKSIZE % (subdims->y / extraFlags->vecLenA)) != 0)
+        {
+            printf("WARNING: TRMV: generator: Invalid Block Size\n");
+            return 0;
+        }
+        TARGETROWS = BLOCKSIZE/(subdims->y / extraFlags->vecLenA);
+    }
+
+    // The template is staged in a fixed size stack buffer; refuse to
+    // overflow it instead of corrupting the stack.
+    if (strlen(kernelTemplate) >= sizeof(tempTemplate))
+    {
+        printf("WARNING: TRMV: generator: kernel template does not fit the staging buffer\n");
+        return 0;
+    }
+    strcpy(tempTemplate, kernelTemplate);
+
+#ifdef DEBUG_TRMV
+    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+#endif
+
+    // FIXME: VECTORSIZE HARD CODED
+    // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
+    unsigned int vecLenA = extraFlags->vecLenA;
+
+#ifdef DEBUG_TRMV
+    printf("Vector length used : %u\n\n", vecLenA);
+#endif
+
+    // Without the aligned-copy guarantee the kernel has to use vload
+    bool doVLOAD = false;
+    if( extraFlags->flags &  KEXTRA_NO_COPY_VEC_A )
+    {
+        doVLOAD = true;
+#ifdef DEBUG_TRMV
+        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+#endif
+    }
+    else
+    {
+#ifdef DEBUG_TRMV
+        printf("Using Aligned Data Pointer .........................\n");
+#endif
+    }
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
+
+    // size_t values are printed through an explicit unsigned long cast so
+    // that the conversion specifier matches the argument on every ABI
+    sprintf( targetRows, "%lu", (unsigned long)TARGETROWS );
+    sprintf( blockSize, "%lu", (unsigned long)BLOCKSIZE );
+
+#ifdef DEBUG_TRMV
+    printf("TARGET ROWS = %s\n", targetRows);
+    printf("BLOCK SIZE = %s\n", blockSize);
+#endif
+
+    kobj.put("%TARGET_ROWS", (const char *)targetRows);
+    kobj.put("%BLOCKSIZE", (const char *) blockSize);
+    kobj.spit((char*)buf, tempTemplate);
+
+    return (ssize_t)kernelBufSize;
+    // return 0;//(ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+		(__global %TYPE const* restrict A, __global %TYPE * _xnew, __global %TYPE const* restrict _x_vector, uint N,
+		int incx, int isUnity, uint lda, int doConj, uint offa, uint offx)
+*/
+/*
+ * Fill the kernel argument array for the TRMV kernel (and for the
+ * HEMV-like kernels built from the same template).
+ *
+ * args   - kernel argument array to fill; entries 0..9 are always set,
+ *          entries 10..13 only for HEMV / HPMV / SPMV
+ * params - really the CLBlasKargs embedded in a SolutionStep
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void* )
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+
+    //NOTE: This will not work if SolutionStep->args is not passed in const void *params.
+    SolutionStep *step = container_of(blasArgs, args, SolutionStep);
+
+    const bool hemvLike = (step->funcID == CLBLAS_HEMV) ||
+                          (blasArgs->pigFuncID == CLBLAS_HPMV) ||
+                          (blasArgs->pigFuncID == CLBLAS_SPMV);
+    cl_int inc;
+    cl_int unity, doConj;
+
+    // 0: A - input matrix
+    INIT_KARG(&args[0], blasArgs->A);
+
+    // 1, 2: result buffer followed by the source vector
+    if (!hemvLike) {
+        INIT_KARG(&args[1], blasArgs->B);   //x - result buffer = _xnew argument
+        INIT_KARG(&args[2], blasArgs->C);   //y - scratch == _x_vector argument
+    }
+    else {
+        INIT_KARG(&args[1], blasArgs->C);   //y - since the 2nd argument is the result buffer, we should send y for HEMV
+        INIT_KARG(&args[2], blasArgs->B);   //x - actual x vector argument
+    }
+
+    // 3: N, 4: incx
+    initSizeKarg(&args[3], blasArgs->N);
+    inc = blasArgs->ldb.vector;
+    INIT_KARG(&args[4], inc);
+
+    // 5: isUnity, 6: lda
+    unity = (blasArgs->diag == clblasUnit);
+    INIT_KARG(&args[5], unity);
+    initSizeKarg(&args[6], blasArgs->lda.matrix);
+
+    // 7: doConj
+    doConj = (blasArgs->transA == clblasConjTrans);
+#ifdef DEBUG_TRMV
+    printf("doConj is : %d, unity is : %d, incx is : %d\n", doConj, unity, inc);
+#endif
+    INIT_KARG(&args[7], doConj);
+
+    // 8: offa, 9: offx
+    initSizeKarg(&args[8], blasArgs->offa);
+    initSizeKarg(&args[9], blasArgs->offBX);
+
+    // For HEMV both alpha and beta has to be passed.
+    if (hemvLike) {
+        inc = blasArgs->ldc.vector;
+        INIT_KARG(&args[10], inc);
+        initSizeKarg(&args[11], blasArgs->offCY);
+        assignScalarKarg(&args[12], &(blasArgs->alpha), blasArgs->dtype);
+        assignScalarKarg(&args[13], &(blasArgs->beta), blasArgs->dtype);
+    }
+}
+
+/*
+ * Check that the local memory needed by the TRMV kernel fits the device.
+ *
+ * dim        - unused; the estimate does not depend on the decomposition
+ * dtype      - element data type
+ * ldsSize    - local memory available, in bytes
+ * kernelArgs - really a CLBlasKargs; only transA is consulted
+ *
+ * The estimate assumes a block of at most 256 work items arranged as
+ * 16 x 16, each holding one cl_float4 sized accumulator, plus an extra
+ * vector of elements whose length depends on the transposition.
+ *
+ * The element size is taken from dtypeSize(); sizeof(dtype) would only
+ * give the size of the DataType enumeration itself.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)kernelArgs;
+    const size_t maxBlockSize = 256; // PENDING: Query MAX_WORKGROUP_SIZE from OpenCL
+    const size_t elemSize = dtypeSize(dtype);
+    // Number of elements of this type in one 16-byte vector
+    const size_t naturalVecLength = sizeof(cl_float4) / elemSize;
+    size_t x, y;
+    size_t extra;
+    cl_ulong maxSize;
+
+    (void)dim;
+
+    //
+    // TRMV is colMajor always...
+    //
+    y = 16; // Optimized for 16 float4 type reads by a quarter wavefront
+    x = maxBlockSize / y;
+
+    maxSize = x*y*sizeof(cl_float4); // PENDING: Implementing %REDUCE_SUM can bring this down to sizeof(cl_float) for non-transpose cases
+    extra = ((blasArgs->transA == clblasNoTrans) ? x : (y*naturalVecLength)) * elemSize;
+
+    return ((maxSize + extra) <= ldsSize);
+}
diff --git a/src/library/blas/gens/trsm.c b/src/library/blas/gens/trsm.c
new file mode 100644
index 0000000..a5f4d88
--- /dev/null
+++ b/src/library/blas/gens/trsm.c
@@ -0,0 +1,1649 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * TRSM generator with support of cached reads from the global memory
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <matrix_props.h>
+#include <matrix_dims.h>
+
+#include "dblock_kgen.h"
+#include "kerngen.h"
+#include "blas_kgen.h"
+#include "gen_helper.h"
+#include "trxm_common.h"
+#include "trsm_kgen.h"
+#include "legacy/blas_kgen_legacy.h"
+
+/*
+ * Ways the generated TRSM kernel may use local memory.
+ * The non-zero values are distinct bits and are tested with '&'.
+ */
+typedef enum LdsUseFlags {
+    LDS_NO_USE = 0,
+    // tested by useSkewedFetchB() to decide on skewed fetching of B
+    LDS_USE_LARGE = 0x1,
+    // makes declareLocalVariables() size the local buffer for at least
+    // dims[1].y elements along its variable dimension
+    LDS_USE_DIAGONAL = 0x2
+} LdsUseFlags;
+
+/*
+ * Solver private parameters; reached through CLBLASKernExtra::solverPriv.
+ */
+typedef struct TrsmExtraParams {
+    // multiplies dims[1].bwidth when the local buffer width is computed
+    int unrollingFactor;
+    unsigned int unrolledTail;
+    // combination of LdsUseFlags bits; zero disables the local buffer
+    LdsUseFlags ldsUse;
+} TrsmExtraParams;
+
+/*
+ * Stages of the generated kernel; genSetupCoords() requires them to be
+ * passed in the order BLOCK_UPDATE -> TILE_UPDATE.
+ */
+enum TrsmStage {
+    BLOCK_UPDATE,
+    TILE_UPDATE
+};
+
+static CLBLASMpatExtra mpatExtra;
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static SolverFlags
+solverFlags(void);
+
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra);
+
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra);
+
+static bool
+checkCalcDecompDedicated(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check);
+
+#if 0
+static int
+getDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void * pArgs);
+#endif
+
+/*
+ * Callback table of the TRSM solver. The initializer is positional;
+ * NULL entries are callbacks this solver does not provide.
+ */
+static SolverOps trsmSops = {
+    generator,                  // kernel source generator
+    assignKargs,                // kernel argument setup
+    isFitToLDS,                 // local memory fit check
+    NULL,                       // (trmv_reg.cpp labels this slot "Prepare Translate Dims")
+    NULL,                       // (trmv_reg.cpp labels this slot "Inner Decomposition Axis")
+    NULL,                       // (trmv_reg.cpp fills this slot with calcNrThreads)
+    NULL,
+    solverFlags,
+    fixupArgs,
+    NULL,//getDefaultDecomp
+    checkCalcDecompDedicated,
+    NULL,                       // (trmv_reg.cpp fills this slot with setBuildOpts)
+    NULL                        // (trmv_reg.cpp fills this slot with selectVectorization)
+};
+
+// The set of tile descriptors used by the generator.
+// Several descriptors are different views of the same two storages
+// ('A' and 'B') that are declared in the kernel.
+typedef struct TileSet
+{
+    Tile rectA;     // The rectangular tile A for the update loop at stage 1
+    Tile squareA;   // The square tile for stage 2
+    Tile origB;     // The rectangular tile B for the update loop at stage 1
+    Tile bStage2;   // The rectangular tile B for the update loop at stage 2
+    Tile bAsSqA;    // Descriptor for holding square tile A in the storage of B
+    Tile bAsC;      // Descriptor for holding tile C in the storage of B
+    // the entire tile A matching the storage declared in the kernel
+    Tile A;
+    // the entire tile B matching the storage declared in the kernel
+    Tile B;
+} TileSet;
+
+
+/*
+ * Tell whether matrix B is to be fetched in the skewed way: only when
+ * the solver uses the "large" local memory mode and B is not accessed
+ * in the column-major way.
+ */
+static bool
+useSkewedFetchB(const BlasGenSettings *gset)
+{
+    const TrsmExtraParams *priv =
+        (const TrsmExtraParams*)gset->kextra->solverPriv;
+
+    if (!(priv->ldsUse & LDS_USE_LARGE)) {
+        return false;
+    }
+
+    return !isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags, MATRIX_B);
+}
+
+/*
+ * Copy the storage describing fields (base name, vector length and
+ * storage type) from 'src' to 'dst'; every other field of 'dst'
+ * is left as it is.
+ */
+static void
+restoreTile(Tile* dst, const Tile* src)
+{
+    dst->storType = src->storType;
+    dst->vecLen = src->vecLen;
+    dst->baseName = src->baseName;
+}
+
+/*
+ * Make 'dst' use the storage of 'src' (base name, vector length and
+ * storage type) and return the previous storage description of 'dst'
+ * so that it can be put back later with restoreTile().
+ *
+ * Only the three storage fields of the returned tile are meaningful;
+ * the remaining fields are zeroed rather than left indeterminate.
+ */
+static Tile
+substituteTile(Tile* dst, const Tile* src)
+{
+    Tile saved;
+
+    memset(&saved, 0, sizeof(saved));
+    restoreTile(&saved, dst);
+    restoreTile(dst, src);
+
+    return saved;
+}
+
+/*
+ * Print a tile element reference like sprintfTileElement() does, but
+ * for upper triangular matrices ('isU' set) address the tile mirrored
+ * in both dimensions: the row is counted from the last row and the
+ * 'len' wide segment from the last column.
+ */
+static void
+sprintfInvertedElement(
+    Kstring *elem,
+    const Tile *tile,
+    unsigned int row,
+    unsigned int col,
+    unsigned int len,
+    bool isU)
+{
+    unsigned int effRow = row;
+    unsigned int effCol = col;
+
+    if (isU) {
+        effRow = tile->nrRows - row - 1;
+        effCol = tile->nrCols - col - len;
+    }
+
+    sprintfTileElement(elem, tile, effRow, effCol, len);
+}
+
+/*
+ * Generate the code inverting a square tile of the matrix A.
+ *
+ * The source is the square tile held in the storage of B
+ * (tileSet->bAsSqA), the destination is tileSet->squareA.
+ * Unless explicit inlining is requested (BGF_EXPLICIT_INLINE) the code
+ * is emitted as a separate function 'invertTile(a, b)'; otherwise the
+ * statements are emitted in place.
+ *
+ * The generated code first makes the destination a unit tile and then,
+ * row by row, subtracts the contribution of the previously processed
+ * row and divides the current row by the source diagonal element.
+ * For upper triangular matrices all element references are mirrored
+ * by sprintfInvertedElement().
+ */
+static void
+genTileInverting(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    const TileSet *tileSet)
+{
+    char tmp[1024];
+    const CLBLASKernExtra *kextra = gset->kextra;
+    KernelExtraFlags kflags = kextra->flags;
+    DataType dtype = kextra->dtype;
+    const SubproblemDim *dim = &gset->subdims[1];
+    unsigned int accLen;
+    unsigned int i, j, k;
+    Tile srcTile;
+    Tile dstTile;
+    bool isU, isComplex;
+    bool isInlined = gset->flags & BGF_EXPLICIT_INLINE;
+    const char* typeNameA;
+    const char* typeNameB;
+
+    // Work on private copies since the base names may be replaced below
+    memcpy(&srcTile, &tileSet->bAsSqA, sizeof(srcTile));
+    memcpy(&dstTile, &tileSet->squareA, sizeof(dstTile));
+
+    getVectorTypeName(kextra->dtype, dstTile.vecLen, &typeNameA, NULL);
+    getVectorTypeName(kextra->dtype, srcTile.vecLen, &typeNameB, NULL);
+    isU = isMatrixUpper(kflags);
+    isComplex = isComplexType(dtype);
+
+    // Length of the destination segment updated by one statement:
+    // single elements for complex types and transposed destination,
+    // otherwise the widest segment both tiles allow
+    if (isComplex || dstTile.trans) {
+        accLen = 1;
+    }
+    else {
+        accLen = umin(srcTile.vecLen, dstTile.vecLen);
+        accLen = umin(accLen, srcTile.nrCols);
+    }
+
+    if (!isInlined) {
+        // within the function the tiles are reached via its parameters
+        dstTile.baseName = "a";
+        srcTile.baseName = "b";
+        sprintf(tmp, "void\n"
+                     "invertTile(%s *a, %s *b)\n",
+                typeNameA, typeNameB);
+        kgenDeclareFunction(ctx, tmp);
+        kgenBeginFuncBody(ctx);
+    }
+    else {
+        kgenAddStmt(ctx, "// Invert tile\n");
+    }
+
+    // make the destination tile a unit one
+    genZeroTile(ctx, &dstTile);
+    for (i = 0; i < dim->y; i++) {
+        genSetUnitInTile(ctx, &dstTile, i, i);
+    }
+    kgenAddBlankLine(ctx);
+
+    // i - step of the elimination, j - destination row being updated,
+    // k - first column of the destination segment being updated
+    for (i = 0; i < dim->y; i++) {
+        Kstring src, srcDiag, dst, dstLast;
+
+        // current source diagonal element
+        sprintfInvertedElement(&srcDiag, &srcTile, i, i, 1, isU);
+        for (j = i; j < dim->y; j++) {
+            // current source non diagonal element
+            // ('src' is set and used only for i > 0)
+            if (i) {
+                sprintfInvertedElement(&src, &srcTile, j, i - 1, 1, isU);
+            }
+
+            for (k = 0; k < dim->y; k += accLen) {
+                // current updated vectorized element
+                sprintfInvertedElement(&dst, &dstTile, j, k, accLen, isU);
+
+                // update
+                if (i) {
+                    // last updated vectorized element
+                    sprintfInvertedElement(&dstLast, &dstTile, i - 1, k,
+                                           accLen, isU);
+                    if (isComplex) {
+                        sprintf(tmp, "%s -= mul(%s, %s);\n",
+                                dst.buf, dstLast.buf, src.buf);
+                    }
+                    else {
+                        sprintf(tmp, "%s -= %s * %s;\n",
+                                dst.buf, dstLast.buf, src.buf);
+                    }
+                    kgenAddStmt(ctx, tmp);
+                }
+
+                // divide on the diagonal element
+                if (j == i) {
+                    if (isComplex) {
+                        sprintf(tmp, "%s = div(%s, %s);\n",
+                                dst.buf, dst.buf, srcDiag.buf);
+                    }
+                    else {
+                        sprintf(tmp, "%s /= %s;\n", dst.buf, srcDiag.buf);
+                    }
+                    kgenAddStmt(ctx, tmp);
+                }
+            }
+        }
+        // separate the steps with a blank line, except after the last one
+        if (i != dim->y - 1) {
+            kgenAddBlankLine(ctx);
+        }
+    }
+
+    if (!isInlined) {
+        kgenEndFuncBody(ctx);
+    }
+    kgenAddBlankLine(ctx);
+
+}
+
+/*
+ * Generate declarations of the variables used throughout the kernel:
+ * work item/group ids, pointers and coordinates, the private tile
+ * storages and, when local memory is used, the local buffer 'tmpB'
+ * with the pointers into it.
+ *
+ * ctx         - generator context
+ * gset        - generator settings
+ * parTile     - output; initialized from gset->tileBX and, when local
+ *               memory is used, adjusted to describe the local buffer
+ * extraParams - solver private parameters (LDS usage, unrolling factor)
+ *
+ * size_t values are printed through explicit casts so that every
+ * conversion specifier matches its argument on all supported ABIs.
+ */
+static void
+declareLocalVariables(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    Tile* parTile,
+    TrsmExtraParams * extraParams)
+{
+    char tmp[1024];
+    const SubproblemDim *dims = gset->subdims;
+    const char* parTileTypeName = NULL;
+    bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags,
+                                   MATRIX_B);
+    unsigned int locWidth;
+    unsigned int tsize;
+    unsigned int parTileSize;
+    unsigned int l1Pans;
+    unsigned int step;
+
+    kgenAddStmt(ctx,
+                 "const int lid = get_local_id(0);\n"
+                 "const int gid = get_group_id(0);\n"
+                 "GPtr uA, uB;\n"
+                 "uint coordA, coordB;\n"
+                 "uint m0 = 0, k0, m1;\n");
+
+    // For upper triangular matrices the generated code starts from the
+    // last block row: currM is (M - 1) rounded down to the block height
+    if (isMatrixUpper(gset->kextra->flags)) {
+        sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n",
+                (unsigned long)dims[0].y, (unsigned long)dims[0].y);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    /*
+     * Declare private blocks.
+     * The region 'b' stores in different time tiles of both
+     * the input matrices and the result
+     */
+
+    declareTileStorages(ctx, gset);
+
+    *parTile = gset->tileBX;
+
+    if (extraParams->ldsUse) {
+        tsize = dtypeSize(gset->kextra->dtype);
+        // number of subgroup panels along X within a work group block
+        l1Pans = (unsigned int)(dims[0].x / dims[1].x);
+
+        // vectorize along the contiguous dimension, at most 16 bytes wide
+        parTile->vecLen = (trb) ? (unsigned int)dims[1].x
+                                : (unsigned int)dims[1].bwidth;
+        parTile->vecLen = umin(parTile->vecLen,
+                               (unsigned int)(sizeof(cl_float4) / tsize));
+        parTile->trans = trb;
+
+       /*
+        * Allocate enough space in the local area to fit several tiles
+        * at the stage1 (according to the unrolled factor) and one tile
+        * at the stage2
+        */
+
+        locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor;
+        if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
+            locWidth = umax(locWidth, (unsigned int)dims[1].y);
+        }
+        if (trb) {
+            parTile->nrRows = locWidth;
+            parTile->nrCols = (unsigned int)dims[0].x;
+            step = (unsigned int)dims[1].x / parTile->vecLen;
+        }
+        else {
+            parTile->nrRows = (unsigned int)dims[0].x;
+            parTile->nrCols = locWidth;
+            step = (unsigned int)dims[1].x * locWidth / parTile->vecLen;
+        }
+
+        parTileSize = tileVectorsNum(parTile);
+
+        getVectorTypeName(gset->kextra->dtype, parTile->vecLen,
+                          &parTileTypeName, NULL);
+
+        // 'step' is the distance, in vectors, between the areas of
+        // adjacent panels within the local buffer
+        sprintf(tmp, "__local %s tmpB[%u];\n"
+                     "LPtr lB;\n"
+                     "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n",
+                parTileTypeName, parTileSize, l1Pans, step);
+        kgenAddStmt(ctx, tmp);
+
+        if (useSkewedFetchB(gset)) {
+            kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n",
+                       l1Pans, (unsigned long)gset->subdims[1].x);
+        }
+    }
+
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Generate cyclical tile shifting so as to convert the skewed
+ * storing to "one-to-one", i.e. the first element in the tile
+ * matches the first element of the respective tile in the
+ * output matrix.
+ *
+ * The emitted code is a loop executed 'skewX' times; every iteration
+ * rotates the contents of tile C by one column: the last column is
+ * saved in a temporary, each column receives the contents of its left
+ * neighbour, and the saved value goes to column 0.
+ */
+static void
+genTileCyclicalShift(struct KgenContext *ctx, BlasGenSettings *gset)
+{
+    const char *tname;
+    Kstring k1, k2, *src, *dst, *ktmp;
+    unsigned int row, col;
+    unsigned int seglen;
+    Tile *tileC = &gset->tileCY;
+
+    // rows are processed in segments of this length
+    seglen = tileLineSegmentLen(tileC);
+    getVectorTypeName(gset->kextra->dtype, seglen, &tname, NULL);
+
+    kgenAddStmt(ctx, "\n// deliver from skewing in the result\n");
+    kgenBeginBranch(ctx, "for (uint i = 0; i < skewX; i++)");
+    kgenPrintf(ctx, "%s tmp;\n\n", tname);
+
+    // two string buffers are used in turn for the source and the
+    // destination element names
+    src = &k1;
+    dst = &k2;
+
+    // Skewing may be used only in case of transposed C
+    for (row = 0; row < tileC->nrRows; row += seglen) {
+        // save the last column
+        sprintfTileElement(dst, tileC, row, tileC->nrCols - 1, seglen);
+        kgenPrintf(ctx, "tmp = %s;\n", dst->buf);
+        for (col = tileC->nrCols - 1; col > 0; col--) {
+            sprintfTileElement(src, tileC, row, col - 1, seglen);
+            kgenPrintf(ctx, "%s = %s;\n", dst->buf, src->buf);
+            // swap pointer: the current source is the next destination
+            ktmp = src;
+            src = dst;
+            dst = ktmp;
+        }
+        // here 'dst' names column 0
+        kgenPrintf(ctx, "%s = tmp;\n", dst->buf);
+    }
+
+    kgenEndBranch(ctx, NULL);
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Setup coordinates before beginning a trsm stage
+ * A caller must ensure the strict stage sequence:
+ * BLOCK_UPDATE -> TILE_UPDATE
+ */
+/*
+ * Emit the assignments of 'coordA', 'k0' and 'coordB' for the given
+ * stage.
+ *
+ * coordA is the base row ('currM' for upper triangular matrices, 'm0'
+ * otherwise) plus the row offset of the work item's panel; coordB is
+ * the column offset of the work group plus the one of the panel.
+ * The start value of k0 depends on the stage and on the triangle.
+ *
+ * size_t values are printed through explicit unsigned long casts so that
+ * the "%lu" specifiers match their arguments on all supported ABIs.
+ */
+static void
+genSetupCoords(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    enum TrsmStage stage)
+{
+    char tmp[1024];
+    KernelExtraFlags kflags = gset->kextra->flags;
+    const SubproblemDim *dims = gset->subdims;
+    // number of subgroup panels along X within a work group block
+    unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
+    const char *s;
+
+    s = isMatrixUpper(kflags) ? "currM" : "m0";
+    sprintf(tmp, "coordA = %s + (lid / %u * %lu);\n",
+            s, l1Pans, (unsigned long)dims[1].y);
+    kgenAddStmt(ctx, tmp);
+
+    switch (stage) {
+    case BLOCK_UPDATE:
+        if (isMatrixUpper(kflags)) {
+            sprintf(tmp, "k0 = currM + %lu;\n", (unsigned long)dims[0].y);
+        }
+        else {
+            sprintf(tmp, "k0 = 0;\n");
+        }
+        break;
+    case TILE_UPDATE:
+        if (isMatrixUpper(kflags)) {
+            sprintf(tmp, "k0 = currM + %lu - m1 * %lu;\n",
+                    (unsigned long)(dims[0].y - dims[1].y),
+                    (unsigned long)dims[1].y);
+        }
+        else {
+            sprintf(tmp, "k0 = m0 + m1 * %lu;\n", (unsigned long)dims[1].y);
+        }
+        break;
+    }
+
+    kgenAddStmt(ctx, tmp);
+
+    sprintf(tmp, "coordB = gid * %lu + (lid %% %u * %lu);\n",
+            (unsigned long)dims[0].x, l1Pans, (unsigned long)dims[1].x);
+
+    kgenAddStmt(ctx, tmp);
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Generate control block of the loop over K and open its body.
+ *
+ * dim        - subproblem dimensions; dim[0].y is the block height
+ * kflags     - kernel flags (triangle kind, presence of tails in M)
+ * stepK      - loop increment
+ * boundAlign - with tails in M and an upper triangular matrix the loop
+ *              bound is M rounded down to a multiple of this value
+ *
+ * Upper triangle: k0 runs from 'currM + block height' up to the bound;
+ * lower triangle: k0 runs from 0 up to m0.
+ *
+ * size_t values are printed through explicit unsigned long casts so that
+ * the "%lu" specifiers match their arguments on all supported ABIs.
+ */
+static void
+genInternalLoopCtl(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    KernelExtraFlags kflags,
+    size_t stepK,
+    size_t boundAlign)
+{
+    char tmp[1024];
+    const unsigned long step = (unsigned long)stepK;
+    const unsigned long align = (unsigned long)boundAlign;
+    const unsigned long blockHeight = (unsigned long)dim[0].y;
+
+    if (isMatrixUpper(kflags)) {
+        if (kflags & KEXTRA_TAILS_M) {
+            sprintf(tmp, "for (k0 = currM + %lu; k0 < M / %lu * %lu; "
+                               "k0 += %lu)",
+                    blockHeight, align, align, step);
+        }
+        else {
+            sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)",
+                    blockHeight, step);
+        }
+    }
+    else {
+        sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)",
+                step);
+    }
+
+    kgenBeginBranch(ctx, tmp);
+}
+
+/*
+ * Set the names of the kernel variables the common generator code
+ * should refer to. TRSM has no separate output matrix and no K
+ * dimension of its own: C is named "B", ldc is "ldb" and the K size
+ * is "M".
+ */
+static void
+initKernelVarNames(KernelVarNames *kvars)
+{
+    // matrices
+    kvars->A = "uA";
+    kvars->B = "uB";
+    kvars->C = "B";
+
+    // leading dimensions
+    kvars->lda = "lda";
+    kvars->ldb = "ldb";
+    kvars->ldc = "ldb";
+
+    // problem sizes
+    kvars->sizeM = "M";
+    kvars->sizeN = "N";
+    kvars->sizeK = "M";
+
+    // coordinates and the loop counter
+    kvars->coordA = "coordA";
+    kvars->coordB = "coordB";
+    kvars->k = "k0";
+
+    // scalars
+    kvars->alpha = "alpha";
+    kvars->beta = "beta";
+}
+
+/*
+ * Install a post-fetch handler into the tile multiplication options and
+ * initialize its private data.
+ *
+ * For defaultTilePostFetch one private structure is initialized and no
+ * register name is set; for any other handler two structures are
+ * initialized and the register name is "b". 'priv' must therefore point
+ * to at least that many elements.
+ */
+static void
+setFetchHandler(
+    TileMulOpts *mulOpts,
+    const BlasGenSettings *gset,
+    int handler(struct KgenContext *ctx, MatrixRole mrole, void *priv),
+    TilePostFetchPrivate *priv)
+{
+    const bool isDefault = (handler == defaultTilePostFetch);
+    const int nrPrivs = isDefault ? 1 : 2;
+    const char *regName = isDefault ? NULL : "b";
+    TilePostFetchPrivate *p;
+
+    mulOpts->postFetch = handler;
+    mulOpts->postFetchPriv = priv;
+
+    for (p = priv; p != priv + nrPrivs; p++) {
+        p->fetchNumA = 0;
+        p->wholeA = 1;
+        p->funcID = CLBLAS_TRSM;
+        p->gset = gset;
+        p->regName = regName;
+    }
+}
+
+/*
+ * Generate adjusting or restoring of the tail coordinates with the
+ * M-tail flags masked out, so that matrix A is not subject to the shift.
+ *
+ * adjustRestore - zero: generate the adjustment and store its status in
+ *                 *tailStatus; non-zero: generate the restoring for the
+ *                 status passed in *tailStatus
+ */
+static void
+genCheckShiftTailB(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    int adjustRestore,
+    TailStatus *tailStatus)
+{
+    // private copies: the caller's settings must stay untouched
+    CLBLASKernExtra kextraNew = *gset->kextra;
+    BlasGenSettings gsetNew = *gset;
+
+    // avoid tail shift for the matrix A
+    kextraNew.flags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER);
+    gsetNew.kextra = &kextraNew;
+
+    if (!adjustRestore) {
+        *tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRSM, &gsetNew,
+                                               NULL);
+        return;
+    }
+
+    checkGenRestoreTailCoords(ctx, &gsetNew, *tailStatus);
+}
+
+/*
+ * Print the condition of a coordinate hitting the matrix:
+ * "<prefix>coordA < M<suffix>" for matrix A and
+ * "<prefix>coordB < N<suffix>" for any other role.
+ * A NULL 'suffix' stands for the empty string; 'prefix' must not be NULL.
+ */
+static void
+sprintfHitMatrixCond(
+    char *buf,
+    MatrixRole mrole,
+    const char *prefix,
+    const char *suffix)
+{
+    const char *coordName = "coordB";
+    char bound = 'N';
+
+    if (mrole == MATRIX_A) {
+        coordName = "coordA";
+        bound = 'M';
+    }
+
+    sprintf(buf, "%s%s < %c%s", prefix, coordName, bound,
+            (suffix != NULL) ? suffix : "");
+}
+
+/*
+ * 'mulUpd' arguments mean what action is being done: multiplication on
+ * an inverted tile or subsequent update
+ */
+/*
+ * Print the 'if' header selecting the work items that take part in the
+ * current step of stage 2.
+ *
+ * mulUpd == 0: items owning the tile row being multiplied on the
+ *              inverted tile ("==" comparison with m1);
+ * mulUpd != 0: items owning the rows updated afterwards
+ *              ("<" for upper, ">" for lower triangular matrices).
+ * With tails in M and/or N the matrix hit conditions are appended.
+ */
+static void
+sprintfStage2Condition(
+    char *buf,
+    const BlasGenSettings *gset,
+    int mulUpd)
+{
+    const KernelExtraFlags kflags = gset->kextra->flags;
+    const unsigned int xPans =
+        (unsigned int)(gset->subdims[0].x / gset->subdims[1].x);
+    const unsigned int yPans =
+        (unsigned int)(gset->subdims[0].y / gset->subdims[1].y);
+    char hitCond[1024];
+
+    // collect " && coordA < M" and " && coordB < N" as needed
+    hitCond[0] = '\0';
+    if (kflags & KEXTRA_TAILS_M) {
+        sprintfHitMatrixCond(hitCond, MATRIX_A, " && ", NULL);
+    }
+    if (kflags & KEXTRA_TAILS_N) {
+        sprintfHitMatrixCond(hitCond + strlen(hitCond), MATRIX_B,
+                             " && ", NULL);
+    }
+
+    if (isMatrixUpper(kflags)) {
+        if (mulUpd) {
+            sprintf(buf, "if (lid / %u + m1 < %u%s)",
+                    xPans, yPans - 1, hitCond);
+        }
+        else {
+            sprintf(buf, "if (lid / %u + m1 == %u%s)",
+                    xPans, yPans - 1, hitCond);
+        }
+    }
+    else {
+        if (mulUpd) {
+            sprintf(buf, "if (lid / %u > m1%s)", xPans, hitCond);
+        }
+        else {
+            sprintf(buf, "if (lid / %u == m1%s)", xPans, hitCond);
+        }
+    }
+}
+
+/*
+ * Generate code clearing the rows of 'tile' which lie beyond the bound of
+ * the matrix along M (the "trash" rows of a tail tile).
+ *
+ * The emitted code declares 'bound', the number of valid rows starting at
+ * coordA and limited with the tile height subdims[1].y, and rewrites every
+ * element of the tile row 'j' as '(bound <= j) ? 0 : <element>'.
+ * For MATRIX_A the diagonal elements of the trash rows are additionally
+ * set to one.
+ *
+ * For MATRIX_A the statements are surrounded with blank lines; for any
+ * other role they are put into a branch opened without a header, so the
+ * declaration of 'bound' is enclosed into its own block.
+ */
+static void
+genZeroTileTrash(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    MatrixRole mrole,
+    Tile* tile)
+{
+    char tmp[1024];
+    const SubproblemDim *dim = &gset->subdims[1];
+    const CLBLASKernExtra *kextra = gset->kextra;
+    /*
+     * "%lu" requires exactly 'unsigned long'. The dimension is converted
+     * explicitly since its own type may have another width (e. g. 64-bit
+     * size_t on the platforms where 'unsigned long' is 32-bit)
+     */
+    const unsigned long tileRows = (unsigned long)dim->y;
+    const bool isA = (mrole == MATRIX_A);
+    unsigned int i, j;
+    unsigned int step;
+    Kstring elem;
+
+    if (isA) {
+        kgenAddBlankLine(ctx);
+    }
+    else {
+        kgenBeginBranch(ctx, NULL);
+    }
+
+    sprintf(tmp, "const int bound = (coordA + %lu > M) ? (M - coordA) : %lu;\n",
+            tileRows, tileRows);
+    kgenAddStmt(ctx, tmp);
+
+    // transposed tiles are processed element by element
+    step = tileLineSegmentLen(tile);
+    step = (tile->trans) ? 1 : step;
+
+    for (j = 0; j < tile->nrRows; ++j) {
+        for (i = 0; i < tile->nrCols; i+=step) {
+            sprintfTileElement(&elem, tile, j, i, step);
+            sprintf(tmp, "%s = (bound <= %u) ? 0 : %s;\n", elem.buf, j, elem.buf);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+
+    if (isA) {
+        // Set units in the trash diagonal elements for a tile of A
+        for (i = 0; i < (unsigned int)dim->y; i++) {
+            sprintfTileElement(&elem, tile, i, i, 1);
+            sprintf(tmp, "%s = (bound <= %d) ? %s : %s;\n",
+                    elem.buf, (int)i, strOne(kextra->dtype), elem.buf);
+            kgenAddStmt(ctx, tmp);
+        }
+
+        kgenAddBlankLine(ctx);
+    }
+    else {
+        kgenEndBranch(ctx, NULL);
+    }
+}
+
+/*
+ * Generate the code fetching the square tile of A located on the diagonal,
+ * inverting it and multiplying the intermediate result on the inverted
+ * tile.
+ *
+ * While generating, the function temporarily changes 'gset': the tile
+ * descriptors 'tileA' and 'tileBX' are substituted, and 'kextra' may be
+ * redirected to a local copy with KEXTRA_TAILS_K_LOWER cleared. 'kextra' is
+ * restored before return; 'tileBX' is left set to 'tileSet->bAsC'.
+ * 'mulOpts' itself is not modified, a local copy without TILEMUL_TRB is
+ * used for the multiplication.
+ *
+ * NOTE: Before invoking this function 'tileA' must be initialized accordingly
+ *       so as it stores a square tile of the matrix A.
+ */
+static void
+genMulOnDiagonalTile(
+    struct KgenContext *ctx,
+    BlasGenSettings *gset,
+    TileSet *tileSet,
+    const TileMulOpts *mulOpts)
+{
+    char tmp[1024];
+    FetchOpts fetchOpts;
+    const SubproblemDim *dim = &gset->subdims[1];
+    TilePostFetchPrivate pfPriv[2];
+    TileMulOpts optsNew;
+    const CLBLASKernExtra *extra = gset->kextra;
+    CLBLASKernExtra extraNew;
+    KernelExtraFlags kflags = extra->flags;
+    Tile t;
+    bool isTail;
+
+    // fetch dim->y lines of the matrix A starting from the line 0
+    memset(&fetchOpts, 0, sizeof(fetchOpts));
+    fetchOpts.regName = "b";
+    fetchOpts.mrole = MATRIX_A;
+    fetchOpts.lineOffset = 0;
+    fetchOpts.linesNum = (unsigned int)dim->y;
+
+    // setup options to multiply on the inverted tile
+    memcpy(&optsNew, mulOpts, sizeof(TileMulOpts));
+    optsNew.flags &= ~TILEMUL_TRB;
+
+    kgenAddStmt(ctx, "// Fetch and invert the square tile located on the "
+                     "diagonal\n");
+
+    // The matrix B play the role of A
+    // ('t' keeps what is needed to undo the substitution with restoreTile())
+    t = substituteTile(&gset->tileA, &tileSet->bAsSqA);
+
+    // the tail handling below depends only on the tail along M
+    isTail = ((kflags & KEXTRA_TAILS_M) != 0);
+    genFetchInputTile(ctx, mulOpts->fctx, gset, &fetchOpts);
+    setFetchHandler(&optsNew, gset, genTrxmPostFetchZero, pfPriv);
+
+    /*
+     * There is no needs in zeroing tail along K in case of the lower
+     * triangular matrix because it is in the "other" triangle which is
+     * never accessed
+     */
+    if (isTail && !isMatrixUpper(kflags)) {
+        // 'extraNew' is a local object, 'gset->kextra' is restored at the end
+        memcpy(&extraNew, extra, sizeof(extraNew));
+        extraNew.flags &= ~KEXTRA_TAILS_K_LOWER;
+        gset->kextra = &extraNew;
+    }
+    genTrxmPostFetchZero(ctx, MATRIX_A, pfPriv);
+
+    /*
+     * One must zero the tail part of a fetched square tile
+     * in order to avoid influence of the trailing trash on the resulting
+     * inverted tile (evaluating proceeds from the bottom towards the top
+     *                of the tile)
+     */
+    if (isTail) {
+        genZeroTileTrash(ctx, gset, MATRIX_A, &gset->tileA);
+    }
+
+    // undo the tile substitution done before the fetch
+    restoreTile(&gset->tileA, &t);
+
+    // invert the tile either in place or with a call to the generated function
+    if(gset->flags & BGF_EXPLICIT_INLINE) {
+        genTileInverting(ctx, gset, tileSet);
+    }
+    else {
+        sprintf(tmp, "invertTile(%s, %s);\n\n",
+                tileSet->squareA.baseName, tileSet->bAsSqA.baseName);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    // copy the accumulated result from the tile C to the storage of B
+    gset->tileBX = tileSet->bAsC;
+    genTileCopy(ctx, &gset->tileBX, &gset->tileCY, TILECOPY_ASSIGN);
+
+    /*
+     * For the lower diagonal not integrally decomposed matrix A
+     * it's enough to zero the tail part of the result in order to
+     * clear trash accumulated over the update loop
+     */
+    if (isTail && !isMatrixUpper(kflags)) {
+        genZeroTileTrash(ctx, gset, MATRIX_B, &gset->tileBX);
+    }
+
+    // the tile C is cleared before it accumulates the product
+    genZeroTile(ctx, &gset->tileCY);
+
+    genMulTiles(ctx, gset, &optsNew);
+    kgenAddBlankLine(ctx);
+
+    // restore original extra
+    gset->kextra = extra;
+}
+
+/*
+ * Generate the code updating the intermediate result with the content of
+ * the matrix B, i. e. an inlined result update with 'beta' declared as
+ * minus one divided by the kernel 'alpha'.
+ *
+ * ctx          - generator context
+ * gset         - generator settings; the tile size is taken from subdims[1],
+ *                the coordinate and size names from 'varNames'
+ * withMhitCond - if true, in the tailed case the update is guarded with the
+ *                check that the coordinates hit into the matrix
+ * flags        - update flags; UPRES_WITH_BETA selects UPRES_SUM instead of
+ *                UPRES_SET, UPRES_INLINE is always added
+ *
+ * Without tails along M and N the only optimized update is generated.
+ * Otherwise the optimized update is generated for the full tiles and the
+ * generic one (UPRES_GENERIC) for the partial tiles.
+ */
+static void
+genUpdateIntermResult(
+    struct KgenContext *ctx,
+    const BlasGenSettings *gset,
+    bool withMhitCond,
+    UpdateResultFlags flags)
+{
+    char tmp[1024];
+    const char *coordY, *coordX;
+    // these ones point to string literals, hence they must be const
+    const char *revAlp, *alp;
+    DataType dtype = gset->kextra->dtype;
+    KernelExtraFlags kflags = gset->kextra->flags;
+    const SubproblemDim *dim = &gset->subdims[1];
+    const KernelVarNames *kvarNames = &gset->varNames;
+    /*
+     * "%lu" requires exactly 'unsigned long'. The dimensions are converted
+     * explicitly since their own type may have another width (e. g. 64-bit
+     * size_t on the platforms where 'unsigned long' is 32-bit)
+     */
+    const unsigned long tileRows = (unsigned long)dim->y;
+    const unsigned long tileCols = (unsigned long)dim->x;
+    UpdateResultOp op;
+    UpresVarNames uvars;
+    const char* ctype;
+
+    memset(&uvars, 0, sizeof(uvars));
+
+    op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;
+
+    uvars.startRow = kvarNames->coordA;
+    uvars.startCol = kvarNames->coordB;
+    uvars.nrRows = "y";
+    uvars.nrCols = "x";
+    uvars.result = "B";
+    uvars.ld = "ldb";
+
+    // 'revAlp' is the expression for -1 / alpha, 'alp' is the unit value
+    ctype = dtypeBuiltinType(dtype);
+    if (isComplexType(dtype)) {
+        if (dtype == TYPE_COMPLEX_FLOAT) {
+            revAlp = "div((float2)(-1.f, 0), alpha)";
+            alp = "(float2)(1.f, 0)";
+        }
+        else {
+            revAlp = "div((double2)(-1., 0), alpha)";
+            alp = "(double2)(1., 0)";
+        }
+    }
+    else {
+        revAlp = "-1. / alpha";
+        alp = "1.";
+    }
+
+    // inline result update
+    flags |= UPRES_INLINE;
+
+    coordY = kvarNames->coordA;
+    coordX = kvarNames->coordB;
+
+    /*
+     * We should be careful here.
+     *
+     * The non tailed case of updateResult() is rewritten.
+     * Now update result for tailed and non tailed cases have a bit
+     * different semantics.
+     *
+     * The first one produces expressions like
+     * 'dst = dst * beta + src * alpha'.
+     *
+     * Here 'dst' and 'src' may be private result stored in registers or
+     * result to be updated in the global memory. Let the first one to be
+     * designated as tileC and the second one as matC.
+     *
+     * The non tailed case produces expressions like
+     * 'dst = matC * beta + tileC * alpha'.
+     *
+     * The second variant is more clear and native for the new implementation.
+     * But as the difference is not eliminated, both the variants are
+     * maintained here.
+     */
+
+    if (!(kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N))) {
+        kgenBeginBranch(ctx, "");
+
+        // the new 'alpha' declared in the branch is initialized with 'beta'
+        sprintf(tmp, "%s %s = %s;\n"
+                     "%s alpha = beta;\n",
+                ctype, "beta", revAlp, ctype);
+        kgenAddStmt(ctx, tmp);
+
+        updateResultGen(ctx,
+            gset,
+            CLBLAS_TRSM,
+            op,
+            flags & ~UPRES_WITH_BETA,
+            &uvars);
+
+        kgenEndBranch(ctx, NULL);
+    }
+    else {
+        if (withMhitCond) {
+            sprintf(tmp, "if ((%s < %s) && (%s < %s))",
+                    coordY, kvarNames->sizeM, coordX, kvarNames->sizeN);
+            kgenBeginBranch(ctx, tmp);
+        }
+        else {
+            /* for x, y variables scope */
+            kgenBeginBranch(ctx, NULL);
+        }
+
+        // 'y' and 'x' are the numbers of rows and columns really updated
+        sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
+                     "uint x = min(%luu, %s - (uint)%s);\n",
+                tileRows, kvarNames->sizeM, coordY,
+                tileCols, kvarNames->sizeN, coordX);
+        kgenAddStmt(ctx, tmp);
+
+        sprintf(tmp, "if ((y == %lu) && (x == %lu))",
+                tileRows, tileCols);
+        kgenBeginBranch(ctx, tmp);
+
+        sprintf(tmp, "%s %s = %s;\n"
+                     "%s alpha = beta;\n",
+                ctype, "beta", revAlp, ctype);
+        kgenAddStmt(ctx, tmp);
+
+        // optimized update
+        updateResultGen(ctx,
+            gset,
+            CLBLAS_TRSM,
+            op,
+            flags & ~UPRES_WITH_BETA,
+            &uvars);
+
+        kgenEndBranch(ctx, NULL);
+
+        flags |= UPRES_GENERIC;
+        kgenBeginBranch(ctx, "else ");
+
+        // here 'alpha' is the unit value and 'beta' is passed to the update
+        sprintf(tmp, "%s %s = %s;\n"
+                     "%s %s = %s;\n",
+                ctype, "beta", revAlp,
+                ctype, "alpha", alp);
+        kgenAddStmt(ctx, tmp);
+
+        // not optimized update
+        updateResultGen(ctx,
+            gset,
+            CLBLAS_TRSM,
+            op,
+            flags,
+            &uvars);
+
+        kgenEndBranch(ctx, NULL);
+        kgenEndBranch(ctx, NULL);
+    }
+}
+
+/*
+ * Generate one step of the tile multiplication with the block of the
+ * matrix B preloaded to the local memory.
+ *
+ * The emitted code points 'lB' to 'tmpB', calls the copying function
+ * 'copy2LDSFuncName' between two local memory barriers, resets 'lB' to
+ * 'lBMain' and then performs the tile multiplication reading B from the
+ * local memory.
+ *
+ * During the tilemul generation 'mulOpts->memB', 'gset->varNames.B' and
+ * 'gset->subdims[0].bwidth' are changed; the latter two are restored
+ * before return and 'mulOpts->memB' is set to CLMEM_GLOBAL_MEMORY.
+ *
+ * parTile - tile whose vector length selects the pointer field of 'lB',
+ *           and whose size (rows if transposed, columns otherwise) is used
+ *           as the temporary block width
+ */
+static void
+genPreloadedTileMul(
+    struct KgenContext *ctx,
+    BlasGenSettings *gset,
+    TileMulOpts *mulOpts,
+    const Tile *parTile,
+    const char* copy2LDSFuncName)
+{
+    char tmp[1024];
+    KernelExtraFlags kflags = gset->kextra->flags;
+    /*
+     * "%lu" requires exactly 'unsigned long'. The dimension is converted
+     * explicitly since its own type may have another width (e. g. 64-bit
+     * size_t on the platforms where 'unsigned long' is 32-bit)
+     */
+    const unsigned long blockCols = (unsigned long)gset->subdims[0].x;
+    unsigned int bwidthOld;
+    const char *oldNameB;
+    const char *ptrName;
+
+    getVectorTypeName(gset->kextra->dtype, parTile->vecLen, NULL, &ptrName);
+    kgenPrintf(ctx, "lB.%s = tmpB;\n", ptrName);
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+
+    // the order of the coordinates depends on the way of access to B
+    if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
+        sprintf(tmp, "%s(lB, uB, gid * %lu, k0, ldb);\n",
+            copy2LDSFuncName, blockCols);
+    }
+    else {
+        sprintf(tmp, "%s(lB, uB, k0, gid * %lu, ldb);\n",
+            copy2LDSFuncName, blockCols);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
+    kgenAddBlankLine(ctx);
+
+    kgenAddStmt(ctx, "lB = lBMain;\n\n");
+
+    // temporarily switch the settings to the block stored in the local memory
+    mulOpts->memB = CLMEM_LOCAL_MEMORY;
+    oldNameB = gset->varNames.B;
+    bwidthOld = (unsigned int)gset->subdims[0].bwidth;
+    gset->varNames.B = "lB";
+    gset->subdims[0].bwidth = (parTile->trans) ? parTile->nrRows :
+                                                 parTile->nrCols;
+
+    tileMulGen(ctx, gset, mulOpts);
+
+    gset->varNames.B = oldNameB;
+    gset->subdims[0].bwidth = bwidthOld;
+    mulOpts->memB = CLMEM_GLOBAL_MEMORY;
+}
+
+/*
+ * Initialize all the tile descriptors used by the generator: the members
+ * of 'tileSet' and the tiles A, BX and CY of 'gset'.
+ *
+ * The sizes are taken from subdims[1]. The storages "a" and "b" of 'gset'
+ * are sized so as to be able to hold both the rectangular and the square
+ * variants (the maximum of the involved dimensions is used). The vector
+ * lengths are limited with MAX_TILE_VECLEN. At the end the tiles A and BX
+ * of 'gset' are saved to 'tileSet->A' and 'tileSet->B'.
+ *
+ * NOTE(review): 'tileSet->bAsC' takes its 'trans' attribute from
+ *               'gset->tileCY' before the latter is initialized here, so
+ *               the value depends on what the caller has left in 'gset' -
+ *               confirm this is intended.
+ */
+static void
+initTiles(
+    BlasGenSettings* gset,
+    TileSet* tileSet,
+    const struct SubproblemDim *subdims,
+    KernelExtraFlags kflags,
+    DataType dtype,
+    PrivateStorageType storType)
+{
+    const struct SubproblemDim *inner = &subdims[1];
+    const unsigned int tileY = (unsigned int)inner->y;
+    const unsigned int tileX = (unsigned int)inner->x;
+    const unsigned int tileK = (unsigned int)inner->bwidth;
+    // the largest sizes a storage must be able to keep
+    const unsigned int maxYK = (unsigned int)szmax(inner->y, inner->bwidth);
+    const unsigned int maxXY = (unsigned int)szmax(inner->x, inner->y);
+    const bool transA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
+    const bool transB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
+    unsigned int vecLenA;
+    unsigned int vecLenB;
+    unsigned int vecLenC;
+
+    vecLenA = umin((transA) ? tileY : tileK, MAX_TILE_VECLEN);
+    vecLenB = umin((transB) ? tileX : tileK, MAX_TILE_VECLEN);
+    vecLenC = (transB) ? vecLenB : vecLenA;
+
+    // tiles of the matrix A: rectangular (stage 1) and square (stage 2)
+    initTile(&tileSet->rectA, "a", tileY, tileK, vecLenA, dtype,
+             storType, transA, false);
+    initTile(&tileSet->squareA, "a", tileY, tileY, vecLenA, dtype,
+             storType, transA, false);
+
+    // different views of the storage "b"
+    initTile(&tileSet->origB, "b", tileK, tileX, vecLenB, dtype,
+             storType, !transB, false);
+    initTile(&tileSet->bStage2, "b", tileY, tileX, vecLenB, dtype,
+             storType, !transB, false);
+    initTile(&tileSet->bAsSqA, "b", tileY, tileY, vecLenB, dtype,
+             storType, transA, false);
+    initTile(&tileSet->bAsC, "b", tileY, tileX, vecLenB, dtype,
+             storType, gset->tileCY.trans, false);
+
+    // tiles of the generator settings
+    initTile(&gset->tileA, "a", tileY, maxYK, vecLenA, dtype,
+             storType, transA, false);
+    initTile(&gset->tileBX, "b", maxYK, maxXY, vecLenB, dtype,
+             storType, !transB, false);
+    initTile(&gset->tileCY, "c", tileY, tileX, vecLenC, dtype,
+             storType, !transB, false);
+
+    tileSet->A = gset->tileA;
+    tileSet->B = gset->tileBX;
+}
+
+/*
+ * Generate the source of the TRSM kernel into 'buf'.
+ *
+ * The kernel body consists of the loop over M containing two stages:
+ * stage 1 multiplies and updates with large blocks (optionally preloading
+ * B to the local memory), stage 2 multiplies on the inverted diagonal
+ * tiles, writes the result back and continues the tile update.
+ *
+ * buf, buflen - destination buffer and its size, passed to
+ *               createKgenContext()
+ * subdims     - subproblem dimensions; the first two levels are copied
+ *               into the local generator settings
+ * pgran       - data parallelism granularity; only 'wgDim' equal to 1 is
+ *               supported
+ * extra       - pointer to CLBLASKernExtra; its 'solverPriv' is used as
+ *               TrsmExtraParams
+ *
+ * Returns the value of kgenSourceSize() plus 1 on success, -EINVAL if
+ * 'pgran->wgDim' is not 1, -ENOMEM if the fetch or the generator context
+ * cannot be created, -EOVERFLOW if kgenEndFuncBody() returns a negative
+ * value.
+ *
+ * NOTE(review): the results of the most of the generating calls are not
+ *               checked; only the result of kgenEndFuncBody() affects the
+ *               returned value.
+ */
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    char tmp[1024];
+    struct KgenContext *ctx;
+    ssize_t ret;
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    DataType dtype = kextra->dtype;
+    KernelExtraFlags kflags = kextra->flags;
+    CLBLASKernExtra extraNew;
+    BlasGenSettings gset;
+    TileMulOpts mulOpts;
+    const char *ptrName;
+    UpdateResultFlags upFlags = 0;
+    TilePostFetchPrivate pfPriv;
+    unsigned int l1Pans;
+    bool b;
+    Tile parTile;
+    TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
+    int ldsLarge, lds_diagonal;
+    bool isInline;
+    TileSet tileSet;
+    char copy2LDSFuncName[FUNC_NAME_MAXLEN];
+    TailStatus tailStatus = 0;
+    FetchAddrMode addrMode = 0;
+    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
+    bool tailN = ((kflags & KEXTRA_TAILS_N) != 0);
+    size_t alignK;
+
+    // only one-dimensional work groups are supported
+    if (pgran->wgDim != 1) {
+        return -EINVAL;
+    }
+
+    // number of the tile panels along X within the block
+    l1Pans = (unsigned int)(subdims[0].x / subdims[1].x);
+
+    memset(&gset, 0, sizeof(gset));
+    gset.flags = BGF_WHOLE_A | BGF_EXPLICIT_INLINE | BGF_UPTRS;
+    memcpy(gset.subdims, subdims, sizeof(SubproblemDim) * 2);
+    // there is not need in block structure along K
+    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
+    // from here 'subdims' refers to the local modifiable copy
+    subdims = gset.subdims;
+
+    /*
+     * Since tiles are changed dynamically, e. g. in the main tilemul
+     * loop they are rectangular, but at the second stage both A and B
+     * tile storages are used for square tiles. One must adjust physical
+     * vectorization accordindly, so as vector length might not be
+     * greater than linear size of any tile
+     */
+    memcpy(&extraNew, kextra, sizeof(extraNew));
+    extraNew.vecLenA = umin(kextra->vecLenA, (unsigned int)subdims[1].y);
+    extraNew.vecLenB = umin(kextra->vecLenB, (unsigned int)subdims[1].y);
+
+    gset.pgran = pgran;
+    gset.kextra = &extraNew;
+    initKernelVarNames(&gset.varNames);
+
+    // multiplication options
+    // NOTE(review): 'mulOpts' is not zeroed, only the fields listed below
+    //               are set here - confirm the structure has no other
+    //               fields read before being assigned
+    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
+    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
+    mulOpts.core = (kextra->flags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD :
+                                                         TILEMUL_MULADD;
+    mulOpts.postFetch = NULL;
+    mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags);
+    mulOpts.flags |= TILEMUL_EXTERN_RDECL | TILEMUL_NOT_INC_K;
+    mulOpts.fctx = createFetchContext();
+    if (mulOpts.fctx == NULL) {
+        return -ENOMEM;
+    }
+
+    disableFetchOptLevels(mulOpts.fctx, FOPTLEV_TMP_COORD_PRECOMPUTING);
+
+    // always true here since BGF_EXPLICIT_INLINE is set above
+    isInline = (gset.flags & BGF_EXPLICIT_INLINE);
+
+    initTiles(&gset, &tileSet, subdims, kflags, dtype,
+              PRIV_STORAGE_VARIABLE_SET);
+
+    ctx = createKgenContext(buf, buflen, true);
+    if (ctx == NULL) {
+        destroyFetchContext(mulOpts.fctx);
+        return -ENOMEM;
+    }
+
+    kgenAddStmt(ctx, "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n");
+
+    // declarations preceding the kernel
+    b = isDoubleBasedType(dtype);
+    kgenDeclareUptrs(ctx, b);
+    if (isComplexType(dtype)) {
+        genComplexMathOperators(ctx, dtype);
+    }
+    if(!isInline) {
+        genTileInverting(ctx, &gset, &tileSet);
+    }
+
+    // function copying a block of B from the global to the local memory
+    if ( extraParams->ldsUse != LDS_NO_USE ) {
+        SubproblemDim sdims;
+        DBlockCopyFlags flags;
+        unsigned int vecLen;
+
+        if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
+            sdims.x = gset.subdims[1].bwidth * extraParams->unrollingFactor;
+            sdims.y = gset.subdims[0].x;
+        }
+        else {
+            sdims.x = gset.subdims[0].x;
+            sdims.y = gset.subdims[1].bwidth * extraParams->unrollingFactor;
+        }
+
+        vecLen = getVecLen(&gset, CLBLAS_TRSM, MATRIX_B);
+        flags = (vecLen < 4) ? DBLOCK_COPY_NOT_VECTORIZE : 0;
+        copyDataBlockGen(ctx, &sdims, gset.pgran, dtype,
+                         DBLOCK_GLOBAL_TO_LOCAL, flags);
+        kgenAddBlankLine(ctx);
+        // remember the name of the just generated function
+        kgenGetLastFuncName(copy2LDSFuncName, FUNC_NAME_MAXLEN, ctx);
+    }
+
+    declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false,
+                      true);
+    kgenBeginFuncBody(ctx);
+
+    declareLocalVariables(ctx, &gset, &parTile, extraParams);
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        kgenAddStmt(ctx, "A += offA;\n");
+    }
+    genTrxmBMatrShift(ctx, kflags, false);
+
+    ptrName = dtypeUPtrField(dtype);
+
+    sprintf(tmp, "uB.%s = B;\n\n", ptrName);
+    kgenAddStmt(ctx, tmp);
+
+    // external loop
+    sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
+    kgenBeginBranch(ctx, tmp);
+    genZeroTile(ctx, &gset.tileCY);
+    genSetupCoords(ctx, &gset, BLOCK_UPDATE);
+
+    kgenAddStmt(ctx, "// Stage 1. Multiply and update with large blocks\n");
+
+    // at the first stage the tiles are rectangular
+    gset.tileA = tileSet.rectA;
+    gset.tileBX = tileSet.origB;
+
+    if (!isMatrixUpper(kflags) && tailM) {
+        addrMode |= FETCH_ADDR_A_CYCLICAL;
+        setFetchAddrMode(mulOpts.fctx, addrMode);
+    }
+
+    // step along K: one tile width, or the whole unrolled batch with LDS
+    ldsLarge = ((extraParams->ldsUse & LDS_USE_LARGE) != 0);
+    alignK = subdims[1].bwidth;
+    if (ldsLarge) {
+        alignK *= extraParams->unrollingFactor;
+    }
+
+    // stage 1 path with B preloaded to the local memory
+    if (ldsLarge) {
+        const char *oldCoordB;
+        FetchAddrMode bamode = addrMode | FETCH_ADDR_K_RELATIVE;
+        bool withSkew;
+
+        withSkew = useSkewedFetchB(&gset);
+        if (!withSkew) {
+            bamode |= FETCH_ADDR_B_RELATIVE;
+        }
+        else {
+            bamode |= FETCH_ADDR_B_CYCLICAL;
+        }
+
+        setFetchAddrMode(mulOpts.fctx, bamode);
+
+        if (tailN) {
+            /*
+             * Conditional branch for those items which hit into
+             * matrix B with their matrix coordinates
+             */
+            sprintf(tmp, "if ((gid + 1) * %lu < N)", subdims[0].x);
+            kgenBeginBranch(ctx, tmp);
+        }
+
+        if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
+            kgenPrintf(ctx, "uA.%s = A + k0 * lda;\n", ptrName);
+        }
+        else {
+            kgenPrintf(ctx, "uA.%s = A + k0;\n", ptrName);
+        }
+
+        if (withSkew) {
+            unsigned int bwidthOld;
+
+            oldCoordB = gset.varNames.coordB;
+            gset.varNames.coordB = "skewX";
+            // NOTE(review): the block width is changed and restored at
+            //               once, so these three statements have no net
+            //               effect - confirm nothing is missing in between
+            bwidthOld = gset.subdims[0].bwidth;
+            gset.subdims[0].bwidth = (parTile.trans) ? parTile.nrRows :
+                                                       parTile.nrCols;
+            gset.subdims[0].bwidth = bwidthOld;
+        }
+
+        genInternalLoopCtl(ctx, subdims, kflags, alignK, alignK);
+        genPreloadedTileMul(ctx, &gset, &mulOpts, &parTile, copy2LDSFuncName);
+        genInternalLoopEnd(ctx);                             // loop over K
+
+        if (withSkew) {
+            // 'oldCoordB' is assigned in the 'withSkew' block above
+            gset.varNames.coordB = oldCoordB;
+            setFetchAddrMode(mulOpts.fctx, bamode & ~FETCH_ADDR_B_CYCLICAL);
+            // deliver from skew in the result before proceed to the next stage
+            genTileCyclicalShift(ctx, &gset);
+        }
+
+        if (tailN) {
+            // this "else" branch is closed at the end of the next block
+            kgenEndBranch(ctx, NULL);
+            kgenBeginBranch(ctx, "else");
+        }
+
+        setFetchAddrMode(mulOpts.fctx, addrMode);
+    }
+
+    // stage 1 path reading B directly, without the local memory
+    if (!ldsLarge || tailN) {
+        genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
+        if ((kflags & KEXTRA_TAILS_N_LOWER) && !tailStatus) {
+            addrMode |= FETCH_ADDR_B_CYCLICAL;
+            setFetchAddrMode(mulOpts.fctx, addrMode);
+        }
+
+        if (tailN) {
+            sprintfHitMatrixCond(tmp, MATRIX_B, "if (", ")");
+            kgenBeginBranch(ctx, tmp);
+        }
+
+        genInternalLoopCtl(ctx, subdims, kflags, subdims[1].bwidth, alignK);
+        tileMulGen(ctx, &gset, &mulOpts);
+        genInternalLoopEnd(ctx);                             // loop over K
+
+        if (tailN) {
+            kgenEndBranch(ctx, NULL);
+        }
+
+        // close the "else" branch opened in the LDS path
+        if (extraParams->ldsUse & LDS_USE_LARGE) {
+            kgenEndBranch(ctx, NULL);
+        }
+    }
+
+    sprintf(tmp, "uA.%s = A;\n\n", ptrName);
+    kgenAddStmt(ctx, tmp);
+
+    // processing tails along update dimension
+    if (isMatrixUpper(kflags) &&
+        ((kflags & KEXTRA_TAILS_K_LOWER) ||
+          (ldsLarge && extraParams->unrolledTail))) {
+
+        unsigned int tailChunks;
+
+        tailChunks = (extraParams->ldsUse & LDS_USE_LARGE) ?
+            extraParams->unrolledTail : 1;
+
+        if (tailN) {
+            char hitCond[1024];
+
+            sprintfHitMatrixCond(hitCond, MATRIX_B, "(", ")");
+            sprintf(tmp, "if ((currM + %lu < M) && %s)",
+                    subdims[0].y, hitCond);
+        }
+        else {
+            sprintf(tmp, "if (currM + %lu < M)", subdims[0].y);
+        }
+        kgenBeginBranch(ctx, tmp);
+
+        if (kflags & KEXTRA_TAILS_K_LOWER) {
+            setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_K_CYCLICAL);
+            setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
+        }
+        // several tail chunks are processed in a loop incrementing K
+        if (tailChunks > 1) {
+            mulOpts.flags &= ~TILEMUL_NOT_INC_K;
+            sprintf(tmp, "for (uint k1 = 0; k1 < %u; k1++)", tailChunks);
+            kgenBeginBranch(ctx, tmp);
+        }
+
+		addrMode |= FETCH_ADDR_B_CYCLICAL;
+        setFetchAddrMode(mulOpts.fctx, addrMode);
+        tileMulGen(ctx, &gset, &mulOpts);
+        if (tailChunks > 1) {
+            kgenEndBranch(ctx, NULL);
+            mulOpts.flags |= TILEMUL_NOT_INC_K;
+        }
+
+        kgenEndBranch(ctx, NULL);
+    }
+
+    // at the second stage the tile of A is square
+    gset.tileA = tileSet.squareA;
+
+    kgenAddStmt(ctx, "\n/*\n"
+                     " * Stage 2. A part of work items multiply got result on "
+                     "a respective\n"
+                     " * inverted diagonal block, and the remaining ones wait. "
+                     "Then they perform\n"
+                     " * one step of further intermediate result evaluation as "
+                     "multiplying tile by tile.\n"
+                     " * It continues until the whole panel of the "
+                     "matrix A is processed\n"
+                     " */\n");
+
+    // one must deal further with square blocks strictly
+    gset.subdims[0].bwidth = gset.subdims[1].bwidth = gset.subdims[1].y;
+
+    sprintf(tmp, "for (m1 = 0; m1 < %lu; m1++)", subdims[0].y / subdims[1].y);
+    kgenBeginBranch(ctx, tmp);
+
+    if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
+        sprintf(tmp, "const int bid = lid %% %u;\n\n",
+                l1Pans);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    /*
+     * Update the intermediate result multiply on the inverted diagonal tile,
+     * and write back
+     */
+    genSetupCoords(ctx, &gset, TILE_UPDATE);
+
+    sprintfStage2Condition(tmp, &gset, 0);
+    // NOTE(review): the value stored to 'ret' here is never examined, it is
+    //               overwritten with the result of kgenEndFuncBody()
+    ret = kgenBeginBranch(ctx, tmp);
+
+    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
+    upFlags |= tailStatusToUpresFlags(tailStatus);
+    upFlags |= UPRES_PRIV_DEST | UPRES_WITH_BETA;
+    genUpdateIntermResult(ctx, &gset, false, upFlags);
+
+    kgenAddBlankLine(ctx);
+
+    // conditions under which the result is addressed via 'tmpB' below
+    lds_diagonal = ((extraParams->ldsUse & LDS_USE_DIAGONAL) &&
+                    (kflags & (KEXTRA_COLUMN_MAJOR)) == 0 &&
+                    !(tailM || tailN) &&
+                    !(upFlags & UPRES_NO_VECTORIZATION) &&
+                    !isComplexType(kextra->dtype));
+
+    /*
+     * it's needed now to adjust addressing mode of A so as to don't
+     * exceed the bound of A
+     */
+    if (tailM) {
+        setFetchAddrMode(mulOpts.fctx,
+                         addrMode | FETCH_ADDR_A_CYCLICAL |
+                         FETCH_ADDR_K_CYCLICAL);
+        extraNew.flags |= KEXTRA_TAILS_K_LOWER;
+    }
+
+    genMulOnDiagonalTile(ctx, &gset, &tileSet, &mulOpts);
+    gset.tileBX = tileSet.bStage2;
+    if (tailM) {
+        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
+    }
+
+    kgenAddStmt(ctx, "// Write back the given result\n");
+
+    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
+    upFlags |= tailStatusToUpresFlags(tailStatus);
+
+    // "%%u" leaves a "%u" placeholder in the produced string
+    if (lds_diagonal) {
+       sprintf(tmp, "tmpB[%%u * %u + bid]", l1Pans);
+    }
+
+    genResultUpdateWithFlags(ctx, CLBLAS_TRSM, &gset, upFlags,
+                                 NULL, NULL, lds_diagonal ? tmp : NULL);
+
+    kgenEndBranch(ctx, NULL);   // multiply on the inverted tile path
+    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
+
+    // continue the tile update
+    kgenAddBlankLine(ctx);
+    sprintfStage2Condition(tmp, &gset, 1);
+    kgenBeginBranch(ctx, tmp);
+    genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
+    if (lds_diagonal) {
+        // TODO: add here storing to LDS as well
+    }
+    else {
+		addrMode |= FETCH_ADDR_B_CYCLICAL;
+        setFetchAddrMode(mulOpts.fctx, addrMode);
+        tileMulGen(ctx, &gset, &mulOpts);
+    }
+    kgenEndBranch(ctx, NULL);           // tile update path
+    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
+
+    kgenEndBranch(ctx, NULL);           // second stage loop
+
+    if (isMatrixUpper(kflags)) {
+        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
+        kgenAddStmt(ctx, tmp);
+    }
+
+    kgenEndBranch(ctx, NULL);           // loop over M
+
+    ret = kgenEndFuncBody(ctx);
+
+    // on success report the source size plus one
+    if (!ret) {
+        ret = (ssize_t)kgenSourceSize(ctx) + 1;
+    }
+
+    destroyFetchContext(mulOpts.fctx);
+    destroyKgenContext(ctx);
+
+    return (ret < 0) ? -EOVERFLOW : ret;
+}
+
+/*
+ * Check whether the decomposition fits to the local memory.
+ * This pattern puts no restriction here: all the arguments are ignored and
+ * the answer is always positive.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    // none of the arguments is needed
+    (void)kernelArgs;
+    (void)ldsSize;
+    (void)dtype;
+    (void)dim;
+
+    return true;
+}
+
+/*
+ * Return the solver flags of this pattern:
+ * SF_WSPACE_1D combined with SF_TOP_INPUT_SQUARE_BLOCKS.
+ */
+static SolverFlags
+solverFlags(void)
+{
+    return (SF_TOP_INPUT_SQUARE_BLOCKS | SF_WSPACE_1D);
+}
+
+/*
+ * Fill the kernel argument array.
+ *
+ * The first seven arguments are fixed: M, N, alpha, A, lda, B, ldb.
+ * The offsets of A and B follow them, each one only if the respective
+ * flag (KEXTRA_A_OFF_NOT_ZERO, KEXTRA_BX_OFF_NOT_ZERO) is set in the
+ * kernel extra information, so the offset of B takes the index 7 when the
+ * offset of A is absent.
+ *
+ * args   - array of the kernel arguments to be filled
+ * params - pointer to CLBlasKargs with the problem arguments
+ * extra  - pointer to CLBLASKernExtra
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void *extra)
+{
+    const CLBlasKargs *kargs = (const CLBlasKargs*)params;
+    const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)extra;
+    KernelExtraFlags kflags = kextra->flags;
+    // index of the first optional argument
+    int next = 7;
+
+    // sizes and the scalar
+    initSizeKarg(&args[0], kargs->M);
+    initSizeKarg(&args[1], kargs->N);
+    assignScalarKarg(&args[2], &(kargs->alpha), kargs->dtype);
+
+    // matrices with their leading dimensions
+    initMemobjKarg(&args[3], kargs->A, NULL, 0, 0);
+    initSizeKarg(&args[4], kargs->lda.matrix);
+    initMemobjKarg(&args[5], kargs->B, NULL, 0, 0);
+    initSizeKarg(&args[6], kargs->ldb.matrix);
+
+    // optional offsets
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        initSizeKarg(&args[next++], kargs->offA);
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        initSizeKarg(&args[next], kargs->offBX);
+    }
+}
+
+/*
+ * Evaluate the solver private parameters (TrsmExtraParams) stored in
+ * 'kextra->solverPriv' and fix the kernel arguments up with
+ * fixupTrxmKargs().
+ *
+ * The evaluated parameters are:
+ *   ldsUse          - LDS_USE_LARGE or LDS_NO_USE depending on the vendor
+ *                     flag and the tail flags
+ *   unrollingFactor - the number of tile steps along K loaded per
+ *                     iteration of the stage 1
+ *   unrolledTail    - the number of tile steps in the remainder of K left
+ *                     after the unrolled iterations (rounded up)
+ */
+static void
+fixupArgs(void *args, SubproblemDim *subdims, void *extra)
+{
+    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
+    CLBlasKargs *kargs = (CLBlasKargs*)args;
+    TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
+    KernelExtraFlags kflags = kextra->flags;
+    unsigned int ldsUse = LDS_NO_USE;
+    unsigned int workRatio;
+    unsigned int wgSize;
+    size_t loadBatch;
+    size_t tailK;
+    SubproblemDim globDim;
+    bool isAmdGPU;
+
+    /*
+     * Calculate size of the batch loaded from global to local memory
+     * at each iteration of the stage 1. Choose such unrolling factor
+     * that allow each work item to load at least 16 bytes that provides
+     * efficient global memory access
+     */
+    loadBatch = subdims[0].x * subdims[1].bwidth * dtypeSize(kargs->dtype);
+    wgSize = (unsigned int)((subdims[0].x / subdims[1].itemX) *
+                            (subdims[0].y / subdims[1].itemY));
+    workRatio = 1;
+    if (!(loadBatch < wgSize)) {
+        workRatio = 16 / ((unsigned int)loadBatch / wgSize);
+        if (workRatio == 0) {
+            workRatio = 1;
+        }
+    }
+
+#ifndef NDEBUG
+    {
+        /*
+         * Debug override: the last decimal digit of the variable is the
+         * LDS usage, the remaining leading part, if not zero, is the
+         * unrolling factor.
+         *
+         * NOTE(review): the LDS usage taken from here is overwritten just
+         *               below, so only the unrolling factor really takes
+         *               effect - confirm this is intended.
+         */
+        const char *envImpl = getenv("AMD_CLBLAS_TRSM_LDSUSE");
+
+        if (envImpl != NULL) {
+            unsigned int code = atoi(envImpl);
+            unsigned int ratio = code / 10;
+
+            ldsUse = code % 10;
+            if (ratio > 0) {
+                workRatio = ratio;
+            }
+        }
+    }
+#endif
+
+    // choose the LDS usage with respect to the vendor and the tails
+    ldsUse = LDS_NO_USE;
+    isAmdGPU = ((kflags & KEXTRA_VENDOR_AMD) != 0);
+    if (isAmdGPU) {
+        if (!(kflags & (KEXTRA_TAILS_K_LOWER | KEXTRA_TAILS_M_LOWER))) {
+            ldsUse = LDS_USE_LARGE;
+        }
+    }
+    else if (!(kflags & KEXTRA_TAILS_M)) {
+        ldsUse = LDS_USE_LARGE;
+    }
+
+    kargsToProbDims(&globDim, CLBLAS_TRSM, args, false);
+
+    // remainder of K not covered with the unrolled iterations
+    tailK = globDim.bwidth % (subdims[1].bwidth * workRatio);
+
+    extraParams->ldsUse = ldsUse;
+    extraParams->unrollingFactor = workRatio;
+    extraParams->unrolledTail =
+        (unsigned int)((tailK + subdims[1].bwidth - 1) / subdims[1].bwidth);
+
+    fixupTrxmKargs(kargs);
+}
+
+/*
+ * Either check the decomposition or calculate the parallelism granularity
+ * for it.
+ *
+ * If 'check' is PGRAN_CHECK, the function returns true only when
+ * decompSanityCheck() passes, the block widths of the both levels are
+ * equal and the first work group size is 64. For complex double the
+ * size limits passed to the sanity check are 1 and 4, for the other
+ * types they are 2 and 8.
+ *
+ * For any other value of 'check' the granularity is calculated with
+ * calcPgranDedicated() and true is returned. 'subdimsNum' is not used.
+ */
+static bool
+checkCalcDecompDedicated(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    DataType dtype,
+    int check)
+{
+    unsigned int minSize, maxSize;
+
+    DUMMY_ARG_USAGE(subdimsNum);
+
+    if (check != PGRAN_CHECK) {
+        calcPgranDedicated(pgran, subdims, -1, 3);
+        return true;
+    }
+
+    if (dtype == TYPE_COMPLEX_DOUBLE) {
+        minSize = 1;
+        maxSize = 4;
+    }
+    else {
+        minSize = 2;
+        maxSize = 8;
+    }
+
+    if (!decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true)) {
+        return false;
+    }
+    if (subdims[0].bwidth != subdims[1].bwidth) {
+        return false;
+    }
+
+    return (pgran->wgSize[0] == 64);
+}
+
+/*
+ * Initialize the memory pattern descriptor of the 2-staged cached global
+ * memory based block TRSM.
+ *
+ * The descriptor gets the pattern name, two levels, zero 'cuLevel' and
+ * 'thLevel', the solver operations 'trsmSops' and the file scope extra
+ * information 'mpatExtra', which is filled here as well: the both matrices
+ * are buffers attributed to CLMEM_LEVEL_L1.
+ */
+void
+initTrsmLdsLessCachedPattern(MemoryPattern *mempat)
+{
+    // pattern specific extra information shared via the file scope object
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L1;
+    mpatExtra.bMset = CLMEM_LEVEL_L1;
+
+    mempat->name = "2-staged cached global memory based block trsm";
+    mempat->sops = &trsmSops;
+    mempat->extra = &mpatExtra;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 0;
+}
+
+#if 0
+
+/*
+ * Disabled code: set the default decomposition, i. e. a one-dimensional
+ * work group of 64 items and fixed sizes of the both subproblem levels.
+ * 'subdimsNum' and 'pArgs' are not used.
+ *
+ * NOTE(review): the function is declared as returning int but has no
+ *               return statement; it must be fixed before enabling.
+ */
+static int
+getDefaultDecomp(
+    PGranularity *pgran,
+    SubproblemDim *subdims,
+    unsigned int subdimsNum,
+    void * pArgs)
+{
+    pgran->wgDim = 1;
+    pgran->wgSize[0] = 64;
+    pgran->wgSize[1] = 1;
+
+    subdims[0].x = subdims[0].itemX = 32;
+    subdims[0].y = 64;
+    subdims[0].itemY = SUBDIM_UNUSED;
+    subdims[0].bwidth = subdims[1].bwidth = 4;
+    subdims[1].x = subdims[1].itemX = 8;
+    subdims[1].y = subdims[1].itemY = 4;
+}
+
+#endif
diff --git a/src/library/blas/gens/trsm_kgen.c b/src/library/blas/gens/trsm_kgen.c
new file mode 100644
index 0000000..b056b90
--- /dev/null
+++ b/src/library/blas/gens/trsm_kgen.c
@@ -0,0 +1,50 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include "trsm_kgen.h"
+
+/*
+ * Add the two helper functions "div" and "mul" to the kernel source held
+ * by ctx. Both take and return values of the builtin type that
+ * dtypeBuiltinType() reports for dtype and use the .x / .y components as
+ * real / imaginary part of a complex number.
+ */
+void
+genComplexMathOperators(
+    struct KgenContext *ctx,
+    DataType dtype)
+{
+    const char *typeName = dtypeBuiltinType(dtype);
+    char text[1024];
+
+    /* complex quotient u / v */
+    sprintf(text, "%s\ndiv(%s u, %s v)\n", typeName, typeName, typeName);
+    kgenDeclareFunction(ctx, text);
+    kgenBeginFuncBody(ctx);
+    sprintf(text,
+            "return (%s)("
+            "(u.x * v.x + u.y * v.y) / (v.x * v.x + v.y * v.y),"
+            "(u.y * v.x - u.x * v.y) / (v.x * v.x + v.y * v.y));\n",
+            typeName);
+    kgenAddStmt(ctx, text);
+    kgenEndFuncBody(ctx);
+    kgenAddBlankLine(ctx);
+
+    /* complex product u * v */
+    sprintf(text, "%s\nmul(%s u, %s v)\n", typeName, typeName, typeName);
+    kgenDeclareFunction(ctx, text);
+    kgenBeginFuncBody(ctx);
+    sprintf(text,
+            "return (%s)(u.x * v.x - u.y * v.y, u.x * v.y + u.y * v.x);\n",
+            typeName);
+    kgenAddStmt(ctx, text);
+    kgenEndFuncBody(ctx);
+    kgenAddBlankLine(ctx);
+}
+
diff --git a/src/library/blas/gens/trsm_kgen.h b/src/library/blas/gens/trsm_kgen.h
new file mode 100644
index 0000000..3aa43ef
--- /dev/null
+++ b/src/library/blas/gens/trsm_kgen.h
@@ -0,0 +1,28 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TRSM_KGEN_H_
+#define TRSM_KGEN_H_
+
+#include "blas_kgen.h"
+
+/*
+ * Generate the complex math helper functions for the given data type and
+ * add them to the kernel source held by ctx.
+ */
+void
+genComplexMathOperators(
+    struct KgenContext *ctx,
+    DataType dtype);
+
+#endif /* TRSM_KGEN_H_ */
diff --git a/src/library/blas/gens/trsv_gemv.cpp b/src/library/blas/gens/trsv_gemv.cpp
new file mode 100644
index 0000000..49d5371
--- /dev/null
+++ b/src/library/blas/gens/trsv_gemv.cpp
@@ -0,0 +1,553 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * trsv gemv generator -
+ *
+ * This generator generates code for the GEMV portion of TRSV.
+ * The idea is to call this routine after solving a subset of coefficients.
+ * This generator will help to update the RHS of remaining equations using the
+ * currently solved variables.
+ * The current clBLAS implementation of GEMV does not support complex types.
+ * Hence, we need to write this kludge.
+ * One day, this should go away and be completely replaced by existing GEMV
+ *
+ * NOTE:
+ * This generator is highly tied to TRSV and is not a replacement for GEMV.
+ * In some cases, this generator generates code not only for updating the RHS
+ * but also for solving the next triangle (trtri based solve) as well.
+ * We have seen marginal performance increases (1GB/s) by doing so.
+ * If this is not important, one can replace this with GEMV when GEMV becomes
+ * feature-complete.
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <trsv_gemv.clT>
+#include <kprintf.hpp>
+#include <solution_seq.h>
+
+//#define DEBUG_TRSV_GEMV
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+static char Prefix[4]; // PENDING: Magic "4" == Number of data types supported (float, double, cl_float2, cl_double2)
+
+/*
+ * Report the solver flags of the TRSV GEMV pattern; only SF_WSPACE_1D is
+ * returned.
+ */
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_TRSV_GEMV
+    printf("TRSV GEMV solverFlags(): solverFlags called......\n");
+#endif
+
+    const SolverFlags patternFlags = SF_WSPACE_1D;
+
+    return patternFlags;
+}
+
+static bool isTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, size_t &TARGETHEIGHT);
+
+static bool isNoTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen,
+									size_t & TARGETROWS, size_t & TARGETWIDTH, size_t &NLOOPS);
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void*);
+
+extern "C"
+void initTrsvGemvDefaultPattern(MemoryPattern *mempat);
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+/*
+ * Solver operations table of the TRSV GEMV pattern. It is handed out
+ * through mempat->sops in initTrsvGemvDefaultPattern(). Entries set to
+ * NULL are operations this generator does not provide.
+ */
+static SolverOps trsvGemvOps = {
+    generator,
+    assignKargs,
+    isFitToLDS,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,
+    NULL,
+    solverFlags,
+	NULL,
+	NULL,
+	NULL,
+	setBuildOpts,
+	NULL
+};
+
+/*
+ * Append the build options required by the TRSV GEMV kernels to
+ * buildOptStr.
+ *
+ * args is interpreted as a SolutionStep; its kernel arguments decide
+ * which options are added:
+ *   " -DDOUBLE_PRECISION "  for TYPE_DOUBLE and TYPE_COMPLEX_DOUBLE
+ *   " -DPACKED "            when pigFuncID is CLBLAS_TPSV
+ * The options are appended with strcat(), so the caller has to provide a
+ * buffer with enough room.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+    const bool wantsDouble = (blasArgs->dtype == TYPE_DOUBLE) ||
+                             (blasArgs->dtype == TYPE_COMPLEX_DOUBLE);
+    const bool isPacked = (blasArgs->pigFuncID == CLBLAS_TPSV);
+
+    if (wantsDouble)
+    {
+        strcat(buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_TRSV_GEMV
+        printf("TRSV GEMV: Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if (isPacked)
+    {
+        strcat(buildOptStr, " -DPACKED ");
+#ifdef DEBUG_TRSV_GEMV
+        printf("TPSV GEMV: Setting build options ... PACKED\n");
+#endif
+    }
+}
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Initialise the memory pattern descriptor of the "TRSV - GEMV Update
+ * Kernel" pattern and fill the Prefix[] table that maps each data type to
+ * the type letter handed to kprintf by the generator.
+ */
+extern "C"
+void initTrsvGemvDefaultPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_TRSV_GEMV
+    printf("TRSV GEMV: initTrsvGemvDefaultPattern called with mempat = 0x%p\n", (void*)mempat);
+#endif
+
+    // data type -> prefix letter
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    // extra pattern information; A and B are plain buffers (no images)
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
+
+    mempat->name = "TRSV - GEMV Update Kernel";
+    mempat->sops = &trsvGemvOps;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->extra = &mpatExtra;
+}
+
+/*
+ * Helper function that helps in calculating the "TARGET WIDTH" of
+ * a block with Block Size needed for the case where
+ * "theight" number of variables have been solved.
+ * This is applicable only to NON-TRANSPOSE cases.
+ *
+ * theight  - number of solved variables (triangle height)
+ * blk_size - work-group (block) size
+ * vwidth   - vector width
+ *
+ * Returns theight / nLoops, where
+ * nLoops = ((theight * theight) / blk_size) / vwidth.
+ * Returns 0 when nLoops is 0 or when blk_size or vwidth is 0.
+ */
+static cl_ulong getTargetWidth(size_t theight, size_t blk_size, size_t vwidth)
+{
+	cl_ulong nLoops_v, nLoops;
+
+	// A zero divisor would be undefined behaviour; report "no width" instead.
+	if ((blk_size == 0) || (vwidth == 0))
+	{
+		return 0;
+	}
+
+	//
+	// NOTE: This function should be called only for Non-Transpose cases
+	// NOTE: Does not check if the block size is suitable for our purposes
+	//
+	nLoops_v = (theight * theight) / blk_size;
+	nLoops = nLoops_v / vwidth;
+	if (nLoops == 0)
+	{
+		return 0;
+	}
+	return theight/nLoops;
+}
+
+/*
+ * Compute the global work size of the TRSV GEMV update kernel.
+ *
+ * threads[0] receives the number of work-groups multiplied by the
+ * work-group size (wgSize[0] * wgSize[1]) and threads[1] is set to 1.
+ * Both entries are set to 0 if isTransposeFeasible() rejects the
+ * decomposition on the transposed path.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    const size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const CLBLASKernExtra *extra = (const CLBLASKernExtra *)_extra;
+    const size_t vecLenA = extra->vecLenA;
+    size_t nrBlocks;
+
+#ifdef DEBUG_TRSV_GEMV
+    printf("TRSV GEMV: calcNrThreads() called \n");
+#endif
+
+    const bool noTransLayout =
+        ((kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans)) ||
+        ((kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans));
+
+    if (noTransLayout)
+    {
+        //CL, CU
+        const size_t targetRows = subdims->y;
+        const size_t rowsLeft = kargs->endRow;
+
+        nrBlocks = ((rowsLeft - 1) / targetRows) + 1;
+    }
+    else
+    {
+        size_t targetHeight;
+
+        if (!isTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, targetHeight))
+        {
+            threads[0] = 0;
+            threads[1] = 0;
+#ifdef DEBUG_TRSV_GEMV
+            printf("TRSV GEMV: calcNrThreads() WARNING: Returning 0\n");
+#endif
+            return;
+        }
+
+        const size_t unitsPerBlock = BLOCKSIZE / targetHeight;
+        const bool countFromEnd =
+            ((kargs->uplo == clblasUpper) && (kargs->order == clblasColumnMajor)) ||
+            ((kargs->uplo == clblasLower) && (kargs->order == clblasRowMajor));
+
+        if (countFromEnd)
+        {
+            nrBlocks = ((kargs->N - kargs->endRow - 1) / unitsPerBlock) + 1;
+        }
+        else
+        {
+            nrBlocks = (kargs->startRow) / unitsPerBlock + 1;
+        }
+    }
+
+#ifdef DEBUG_TRSV_GEMV
+    printf("blocks : %lu\n", nrBlocks);
+#endif
+    threads[0] = nrBlocks * BLOCKSIZE;
+    threads[1] = 1;
+#ifdef DEBUG_TRSV_GEMV
+    printf("pgran-wgSize[0] : %d, globalthreads[0]  : %lu\n", pgran->wgSize[0], threads[0]);
+#endif
+}
+
+/*
+ * Decide whether the transposed kernels can handle the given triangle
+ * and pick the target height to use.
+ *
+ * triangle     - triangle height
+ * blockSize    - work-group size
+ * vecLen       - vector length
+ * TARGETHEIGHT - on success receives the largest value not exceeding
+ *                triangle / vecLen that divides blockSize; untouched on
+ *                failure
+ *
+ * Returns false if triangle or vecLen is 0 (these would lead to a
+ * division by zero) or if triangle is not a multiple of vecLen,
+ * true otherwise.
+ */
+static bool isTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, size_t &TARGETHEIGHT)
+{
+	size_t maxHeight;
+
+	if ((vecLen == 0) || (triangle == 0))
+	{
+		#ifdef DEBUG_TRSV_GEMV
+		printf("TRSV GEMV: isTransposeFeasible(): triangle or vectorLength is zero\n");
+		#endif
+		return false;
+	}
+
+	if (triangle % vecLen)
+	{
+		#ifdef DEBUG_TRSV_GEMV
+		printf("TRSV GEMV: isTransposeFeasible(): triangle not multiple of vectorLength\n");
+		#endif
+		return false;
+	}
+
+	// triangle is a non-zero multiple of vecLen here, so maxHeight >= 1
+	maxHeight = triangle/vecLen;
+	while (blockSize % maxHeight)
+	{
+		maxHeight--;
+	}
+	// maxHeight at minimum will be 1 (1 divides every blockSize)
+	#ifdef DEBUG_TRSV_GEMV
+	printf("TRSV GEMV: isTransposeFeasible(): Target Height  chosen = %lu\n", (unsigned long)maxHeight);
+	#endif
+	TARGETHEIGHT = maxHeight;
+	return true;
+}
+
+/*
+ * NOTE:
+ * No-Transpose case - The code iterates along the X direction. Vectoring is along Y Direction.
+ * Since we dont iterate on Y direction (triangle height), this fixes the "blocky" component of the blocksize.
+ * The blockSize then determines how much width the block has on X direction and thus the number of loops
+ * can be calculated from that information.
+ *
+ * On success the outputs are set to
+ *   TARGETROWS  = triangle
+ *   TARGETWIDTH = blockSize / (triangle / vecLen)
+ *   NLOOPS      = triangle / TARGETWIDTH
+ * and true is returned. false is returned (outputs untouched) if any of
+ * triangle, blockSize or vecLen is 0 or if one of the divisibility
+ * requirements checked below does not hold.
+ */
+static bool isNoTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen,
+									size_t & TARGETROWS, size_t & TARGETWIDTH, size_t &NLOOPS)
+{
+	size_t blockx, blocky, nLoops;
+
+	// Zero values would end up as divisors further down.
+	if ((triangle == 0) || (blockSize == 0) || (vecLen == 0))
+	{
+		#ifdef DEBUG_TRSV_GEMV
+		printf("TRSV GEMV: isNoTransposeFeasible(): triangle, blockSize or vectorLength is zero\n");
+		#endif
+		return false;
+	}
+
+	if ( ((triangle*triangle) % blockSize) != 0)
+	{
+		#ifdef DEBUG_TRSV_GEMV
+		printf("TRSV GEMV: isNoTransposeFeasible(): triangle*triangle not multiple of blockSize\n");
+		#endif
+		return false;
+	}
+
+	if (triangle % vecLen)
+	{
+		#ifdef DEBUG_TRSV_GEMV
+		printf("TRSV GEMV: isNoTransposeFeasible(): triangle not multiple of vectorLength\n");
+		#endif
+		return false;
+	}
+
+	blocky = triangle/vecLen;
+	if (blockSize % blocky)
+	{
+		#ifdef DEBUG_TRSV_GEMV
+		printf("TRSV GEMV: isNoTransposeFeasible(): blockSize not multiple of blocky\n");
+		#endif
+		return false;
+	}
+
+	// blocky divides a non-zero blockSize, hence blockx >= 1
+	blockx = blockSize / blocky;
+	if (triangle % blockx)
+	{
+		#ifdef DEBUG_TRSV_GEMV
+		printf("TRSV GEMV: isNoTransposeFeasible(): triangle not multiple of blockx\n");
+		#endif
+		return false;
+	}
+	nLoops = triangle/blockx;
+
+	TARGETROWS = triangle;
+	TARGETWIDTH = blockx;
+	NLOOPS = nLoops;
+	return true;
+}
+
+//
+// Generate the OpenCL source of the TRSV GEMV update kernel into "buf".
+//
+// The kernel template is selected from the flags in "extra" (uplo, order,
+// transposition, unit diagonal); row major requests are mapped onto the
+// equivalent column major case first. The placeholders of the template are
+// then filled in through a kprintf object.
+//
+// Returns 32K when "buf" is NULL and after a successful generation.
+// Returns 0 when "buflen" is larger than 32K or when the feasibility
+// check for the chosen decomposition fails.
+//
+// FIXME: Report correct return value when "buf" is NULL - Needs change in KPRINTF
+// FIXME: Return correct return value when "buf" is NON NULL - Needs change in KPRINTF
+// FIXME: "buflen" check needs to be more accurate. Relies on above changes to KPRINTF
+// NOTE(review): rejecting buflen > 32K (instead of buflen < 32K) looks inverted - confirm.
+//
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+	CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+    unsigned int vecLenA = extraFlags->vecLenA;
+	char tempTemplate[32*1024];
+	// 24 characters are enough for any 64-bit decimal value plus the terminator
+	char TARGETROWS_S[24], NLOOPS_S[24], TARGETWIDTH_S[24];
+	size_t TARGETROWS, NLOOPS, TARGETWIDTH;
+	char TARGETHEIGHT_S[24], BLOCKSIZE_S[24], TRIANGLE_HEIGHT_S[24];
+	size_t TARGETHEIGHT;
+	const char *kernelTemplate;
+	bool doVLOAD = false;
+	int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1];  // [1] will always be 1 since we are a 1D implementation
+
+	if (buf == NULL) // PENDING: Return correct buffer size
+	{
+		return (32 * 1024 * sizeof(char));
+	}
+	if (buflen > 32*1024)
+	{
+		#ifdef DEBUG_TRSV_GEMV
+		printf("TRSV GEMV: generator(): WARNING: Returning 0 as buflen is > 32K\n");
+		#endif
+		return 0;
+	}
+
+	if( extraFlags->flags &  KEXTRA_NO_COPY_VEC_A )
+	{
+		doVLOAD = true;
+		#ifdef DEBUG_TRSV_GEMV
+		printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+		#endif
+	}
+	else
+	{
+		#ifdef DEBUG_TRSV_GEMV
+			printf("Using Aligned Data Pointer .........................\n");
+		#endif
+	}
+	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
+
+	#ifdef DEBUG_TRSV_GEMV
+ 	printf("TRSV GEMV GENERATOR called....\n");
+	#endif
+
+	clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
+	clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
+	clblasTranspose trans =
+	(extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
+	bool unit = (((extraFlags->flags) & KEXTRA_UNIT_DIAGONAL) != 0);
+
+	// unity and doConj handled in setKernelArgs
+    if ( order == clblasRowMajor )
+    {
+        // Row major is treated as the column major case with the
+        // transposition and the triangle flipped.
+        order = clblasColumnMajor;
+        if ( trans == clblasNoTrans)
+        {
+            trans = clblasTrans;
+        }
+        else if ( trans == clblasTrans )
+        {
+            trans = clblasNoTrans;
+        }
+        else // clblasConjTrans
+        {
+            trans = clblasNoTrans;
+        }
+		uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper;
+    }
+
+	//
+	// Check Feasibility and then generate the code.
+	//
+	if ( trans != clblasNoTrans)
+	{
+		if (isTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, TARGETHEIGHT) == false)
+		{
+			return 0;
+		}
+		// size_t values are printed through an explicit cast so that the
+		// conversion specifier always matches the argument type.
+        sprintf( TARGETHEIGHT_S, "%lu", (unsigned long)TARGETHEIGHT );
+	    sprintf( BLOCKSIZE_S, "%d", BLOCKSIZE );
+        sprintf( TRIANGLE_HEIGHT_S, "%lu", (unsigned long)subdims->y );
+
+		kobj.put("%TARGET_HEIGHT", TARGETHEIGHT_S);
+		kobj.put("%BLOCKSIZE", BLOCKSIZE_S);
+		kobj.put("%TRIANGLE_HEIGHT", TRIANGLE_HEIGHT_S);
+
+		if ( uplo == clblasLower )
+		{
+			kernelTemplate = (char*)trsv_CLT_ComputeRectangle_kernel;
+		}
+		else
+		{
+			kernelTemplate = (char*)trsv_CUT_ComputeRectangle_kernel;
+		}
+	}
+	else // No-Transpose cases...
+	{
+		if (isNoTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, TARGETROWS, TARGETWIDTH, NLOOPS) == false)
+		{
+			return 0;
+		}
+        sprintf( TARGETROWS_S, "%lu", (unsigned long)TARGETROWS );
+	    sprintf( TARGETWIDTH_S, "%lu", (unsigned long)TARGETWIDTH );
+        sprintf( NLOOPS_S, "%lu", (unsigned long)NLOOPS );
+		kobj.put("%TARGET_ROWS", TARGETROWS_S);
+		kobj.put("%TARGET_WIDTH", TARGETWIDTH_S);
+		kobj.put("%NLOOPS", NLOOPS_S);
+
+		if (unit)
+		{
+			kernelTemplate = ( uplo == clblasLower ) ?
+				(char*)trsv_CL_ComputeRectangle_kernel :
+				(char*)trsv_CU_ComputeRectangle_kernel;
+		}
+		else
+		{
+			kernelTemplate = ( uplo == clblasLower ) ?
+				(char*)trsv_CL_ComputeRectangle_NonUnity_kernel :
+				(char*)trsv_CU_ComputeRectangle_NonUnity_kernel;
+		}
+	}
+	strcpy(tempTemplate, kernelTemplate);
+
+	#ifdef DEBUG_TRSV_GEMV
+	printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+	#endif
+
+	// FIXME: VECTORSIZE HARD CODED
+	// FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
+
+	#ifdef DEBUG_TRSV_GEMV
+	printf("Vector length used : %d\n\n", vecLenA);
+	#endif
+
+    kobj.spit((char*)buf, tempTemplate);
+	return (32 * 1024 * sizeof(char));
+}
+
+/*
+ * Set up the kernel arguments of the TRSV GEMV kernel from the
+ * CLBlasKargs passed in params.
+ *
+ * Argument layout:
+ *    0  A (input matrix)         1  B (x / result buffer)
+ *    2  N                        3  vector increment (ldb.vector)
+ *    4  unit diagonal flag       5  lda
+ *    6  conjugate flag           7  startRow
+ *    8  endRow                   9  offa
+ *   10  offBX
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    cl_int incx = blasArgs->ldb.vector;
+    cl_int isUnitDiag = (blasArgs->diag == clblasUnit);
+    cl_int conjFlag = (blasArgs->transA == clblasConjTrans);
+
+#ifdef DEBUG_TRSV_GEMV
+    printf("TRMV GEMV: assignKargs: doConj is : %d, unity is : %d, incx is : %d\n", conjFlag, isUnitDiag, incx);
+    printf("TRMV GEMV: startRow, startCol set to %d, %d\n", blasArgs->startRow, blasArgs->endRow);
+#endif
+
+    INIT_KARG(&args[0], blasArgs->A);   // A - input matrix
+    INIT_KARG(&args[1], blasArgs->B);   // x - result buffer = _xnew argument
+    initSizeKarg(&args[2], blasArgs->N);
+    INIT_KARG(&args[3], incx);
+    INIT_KARG(&args[4], isUnitDiag);
+    initSizeKarg(&args[5], blasArgs->lda.matrix);
+    INIT_KARG(&args[6], conjFlag);
+    INIT_KARG(&args[7], blasArgs->startRow);
+    INIT_KARG(&args[8], blasArgs->endRow);
+    initSizeKarg(&args[9], blasArgs->offa);
+    initSizeKarg(&args[10], blasArgs->offBX);
+}
+
+/*
+ * isFitToLDS()
+ *
+ * Estimate the local memory needed for the decomposition in "dim" and
+ * report whether it is smaller than "ldsSize".
+ *
+ * 1. "dim[0].y" is taken as the TRIANGLE_HEIGHT, in other words the number
+ *    of variables solved by the corresponding TRTRI kernel.
+ *
+ * NOTE:
+ * 1. It is possible that this function causes "dim[0].y" to change from
+ *    what was used in the "trtri" counterpart.
+ *    In such a case, this is detected in "xtrsv.c" and the TRSV call is
+ *    aborted.
+ * 2. The numbers returned here are generous and may need to be lowered.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)kernelArgs;
+    const bool noTransLayout =
+        ((blasArgs->transA == clblasNoTrans) && (blasArgs->order == clblasColumnMajor)) ||
+        ((blasArgs->transA != clblasNoTrans) && (blasArgs->order == clblasRowMajor));
+    cl_ulong maxSize;
+
+    if (!noTransLayout)
+    {
+        //
+        // These kernels use "TriangleWidth" amount of local memory for
+        // storing the RHS; "dim[0].y" is assumed to be the "TriangleWidth".
+        //
+        size_t cappedBlock = (dim[0].y)*(dim[0].y) > 256 ? 256 : dim[0].y*dim[0].y;
+
+        maxSize = (dim[0].y + cappedBlock)*dtypeSize(dtype);
+        return (maxSize < ldsSize);
+    }
+
+    //
+    // Estimate worst case Local Memory needed - Vector Width of 4 irrespective of data-type?
+    // The block size is halved until a non-zero target width is found or
+    // the block size has dropped to 1.
+    //
+    size_t blockSize = 256;
+    cl_ulong tw = getTargetWidth(dim[0].y, blockSize, 4);
+
+    while (tw == 0)
+    {
+        blockSize /= 2;
+        tw = getTargetWidth(dim[0].y, blockSize, 4);
+        if (blockSize <= 1)
+        {
+            break;
+        }
+    }
+#ifdef DEBUG_TRSV_GEMV
+    printf("TRSV GEMV: isFitLDS() tw = %lu\n", tw);
+#endif
+    maxSize = (1+4+tw)*dtypeSize(dtype) + blockSize*dtypeSize(dtype)*4;
+#ifdef DEBUG_TRSV_GEMV
+    printf("TRSV GEMV: isFitLDS() maxSize = %lu, ldsSize = %lu, Y = %lu\n", maxSize, ldsSize, dim[0].y);
+#endif
+    return (maxSize < ldsSize);
+}
diff --git a/src/library/blas/gens/trsv_trtri.cpp b/src/library/blas/gens/trsv_trtri.cpp
new file mode 100644
index 0000000..071565f
--- /dev/null
+++ b/src/library/blas/gens/trsv_trtri.cpp
@@ -0,0 +1,548 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * trsv trtri generator -
+ *
+ * This kernel solves the triangular system of equations with only 1 work-group.
+ * This is terribly slow and forms the weakest link in the chain.
+ * It solves 1 variable per work-item. So, the size of the triangle that can be solved
+ * is limited by the hardware's MAX_WORKGROUP_SIZE.
+ * The "chain" for solving larger systems of equations involve a "gemv" operation
+ * which can be exploited by "xtrsv.c". However, the current "gemv" implementation
+ * does NOT support "single complex" and "double complex" data types.
+ * So, to give complete support, another "trsv_gemv" generator will be used.
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <clblas_stddef.h>
+#include <clBLAS.h>
+#include <blas_mempat.h>
+#include <clkern.h>
+#include <clblas-internal.h>
+#include <trsv.clT>
+#include <solution_seq.h>
+//#include "blas_kgen.h"
+
+#include <kprintf.hpp>
+
+//#define DEBUG_TRSV_TRTRI
+
+extern "C"
+unsigned int dtypeSize(DataType type);
+
+
+static char Prefix[4]; // PENDING: Magic "4" == Number of data types supported (float, double, cl_float2, cl_double2)
+
+
+/*
+ * Report the solver flags of the TRSV TRTRI pattern; only SF_WSPACE_1D is
+ * returned.
+ */
+static SolverFlags
+solverFlags(void)
+{
+#ifdef DEBUG_TRSV_TRTRI
+    printf("TRSV TRTRI solverFlags(): solverFlags callen......\n");
+#endif
+
+    const SolverFlags patternFlags = SF_WSPACE_1D;
+
+    return patternFlags;
+}
+
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *extra);
+
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+
+static void
+assignKargs(KernelArg *args, const void *params, const void*);
+
+extern "C"
+void initTrsvDefaultPattern(MemoryPattern *mempat);
+
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *kArgs);
+
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs);
+
+static ssize_t
+generator_tbsv(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra);
+
+/*
+ * Solver operations table of the TRSV TRTRI pattern. It is handed out
+ * through mempat->sops in initTrsvDefaultPattern(). Entries set to NULL
+ * are operations this generator does not provide.
+ */
+static SolverOps trsvOps = {
+    generator,
+    assignKargs,
+    isFitToLDS,
+    NULL, // Prepare Translate Dims
+    NULL, // Inner Decomposition Axis
+    calcNrThreads,
+    NULL, // Image related
+    solverFlags,
+    NULL,
+    NULL,
+    NULL,
+    setBuildOpts,
+    NULL
+};
+
+/*
+ * Append the build options required by the TRSV TRTRI kernels to
+ * buildOptStr.
+ *
+ * args is interpreted as a SolutionStep; its kernel arguments decide
+ * which options are added:
+ *   " -DDOUBLE_PRECISION "  for TYPE_DOUBLE and TYPE_COMPLEX_DOUBLE
+ *   " -DPACKED "            when pigFuncID is CLBLAS_TPSV
+ *   " -DBANDED "            when pigFuncID is CLBLAS_TBSV
+ * The options are appended with strcat(), so the caller has to provide a
+ * buffer with enough room.
+ */
+static void
+setBuildOpts(
+    char * buildOptStr,
+    const void *args)
+{
+    const SolutionStep *solStep = (const SolutionStep *)args;
+    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
+    const bool wantsDouble = (blasArgs->dtype == TYPE_DOUBLE) ||
+                             (blasArgs->dtype == TYPE_COMPLEX_DOUBLE);
+    const bool isPacked = (blasArgs->pigFuncID == CLBLAS_TPSV);
+    const bool isBanded = (blasArgs->pigFuncID == CLBLAS_TBSV);
+
+    if (wantsDouble)
+    {
+        strcat(buildOptStr, " -DDOUBLE_PRECISION ");
+#ifdef DEBUG_TRSV_TRTRI
+        printf("TRSV TRTRI: Setting build options ... Double... for DOUBLE PRECISION support\n");
+#endif
+    }
+
+    if (isPacked)
+    {
+        strcat(buildOptStr, " -DPACKED ");
+#ifdef DEBUG_TRSV_TRTRI
+        printf("TPSV TRTRI: Setting build options ... PACKED\n");
+#endif
+    }
+
+    if (isBanded)
+    {
+        strcat(buildOptStr, " -DBANDED ");
+#ifdef DEBUG_TRSV_TRTRI
+        printf("TBSV TRTRI: Setting build options .. BANDED\n");
+#endif
+    }
+}
+
+static CLBLASMpatExtra mpatExtra;
+
+/*
+ * Initialise the memory pattern descriptor of the
+ * "Triangular matrix solver - Only 1 workgroup" pattern and fill the
+ * Prefix[] table that maps each data type to the type letter handed to
+ * kprintf by the generator.
+ */
+extern "C"
+void initTrsvDefaultPattern(MemoryPattern *mempat)
+{
+#ifdef DEBUG_TRSV_TRTRI
+    printf("TRSV TRTRI: initTRSVDefaultPattern called with mempat = 0x%p\n", (void*)mempat);
+#endif
+
+    // data type -> prefix letter
+    Prefix[TYPE_FLOAT] = 'S';
+    Prefix[TYPE_DOUBLE] = 'D';
+    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
+    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
+
+    // extra pattern information; A and B are plain buffers (no images)
+    mpatExtra.mobjA = CLMEM_BUFFER;
+    mpatExtra.mobjB = CLMEM_BUFFER;
+    mpatExtra.aMset = CLMEM_LEVEL_L2;
+    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
+
+    mempat->name = "Triangular matrix solver - Only 1 workgroup";
+    mempat->sops = &trsvOps;
+    mempat->nrLevels = 2;
+    mempat->cuLevel = 0;
+    mempat->thLevel = 1;
+    mempat->extra = &mpatExtra;
+}
+
+//
+// Read comments atop "isFitToLDS()"
+// This function is required by "isFitLDS()"
+//
+// Returns theight / nLoops, where
+// nLoops = ((theight * theight) / blk_size) / vwidth.
+// Returns 0 when nLoops is 0 or when blk_size or vwidth is 0.
+//
+static cl_ulong getTargetWidth(size_t theight, size_t blk_size, size_t vwidth)
+{
+    cl_ulong nLoops_v, nLoops;
+
+    // A zero divisor would be undefined behaviour; report "no width" instead.
+    if ((blk_size == 0) || (vwidth == 0))
+    {
+        return 0;
+    }
+
+    //
+    // NOTE: This function should be called only for Non-Transpose cases
+    // NOTE: Does not check if the block size is suitable for our purposes
+    //
+    nLoops_v = (theight * theight) / blk_size;
+    nLoops = nLoops_v / vwidth;
+    if (nLoops == 0)
+    {
+        return 0;
+    }
+    return theight/nLoops;
+}
+
+/*
+ * Compute the global work size of the TRSV TRTRI kernel, which always runs
+ * as a single work-group.
+ *
+ * threads[0] is set to the work-group size (wgSize[0] * wgSize[1]) and
+ * threads[1] to 1. If subdims->y exceeds the supported triangle size
+ * (the work-group size for the column-major/no-transpose and
+ * row-major/transpose cases, 1024 otherwise) a warning is printed and both
+ * entries are set to 0.
+ */
+static void
+calcNrThreads(
+    size_t threads[2],
+    const SubproblemDim *subdims,
+    const PGranularity *pgran,
+    const void *args,
+    const void *_extra)
+{
+    const size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
+    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
+    const int blocks = 1;
+
+    (void)_extra; // unused
+
+#ifdef DEBUG_TRSV_TRTRI
+    printf("TRSV TRTRI: calcNrThreads() called \n");
+    printf("blocks : %d\n", blocks);
+#endif
+
+    const bool noTransLayout =
+        ((kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans)) ||
+        ((kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans));
+    const size_t maxTriangle = noTransLayout ? BLOCKSIZE : (size_t)1024;
+
+    if (subdims->y > maxTriangle)
+    {
+        // These little kernels cannot handle arbitrary numbers
+        printf("TRSV calcNrThreads(): Warning. TRTRI Cannot handle subproblemdim of size %lu\n", subdims->y);
+        threads[0] = 0;
+        threads[1] = 0;
+        return;
+    }
+
+    threads[0] = blocks * BLOCKSIZE;
+    threads[1] = 1;
+#ifdef DEBUG_TRSV_TRTRI
+    printf("pgran-wgSize[0] : %d, globalthreads[0]  : %lu\n", pgran->wgSize[0], threads[0]);
+#endif
+}
+
+//
+// Generate the OpenCL source of the TRSV "solve triangle" kernel into "buf".
+//
+// Requests whose pigFuncID is CLBLAS_TBSV are forwarded to generator_tbsv().
+// Otherwise the kernel template is selected from the flags in "extra"
+// (uplo, order, transposition); row major requests are mapped onto the
+// equivalent column major case first. The placeholders of the template are
+// filled in through a kprintf object.
+//
+// Returns 32K when "buf" is NULL and after the generation.
+//
+// NOTE: "pgran" must point into a SolutionStep; container_of() is used on it
+//       to reach the kernel arguments.
+//
+// FIXME: Report correct return value when "buf" is NULL - Needs change in KPRINTF
+// FIXME: Return correct return value - Needs change in KPRINTF
+//
+static ssize_t
+generator(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    char tempTemplate[32*1024];
+    // 24 characters are enough for any 64-bit decimal value plus the terminator
+    char vector_size_trans[24], triangle_height[24];
+    const char *kernelTemplate;
+
+    if (buf == NULL) // PENDING: Return correct buffer size
+    {
+        buflen = (32 * 1024 * sizeof(char));
+        return (ssize_t)buflen;
+    }
+
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+    SolutionStep *step = container_of( pgran , pgran, SolutionStep);    // NOTE: using container_of() to get pigFuncID
+    CLBlasKargs* kargs = (CLBlasKargs*) &(step->args);
+
+    if(kargs->pigFuncID == CLBLAS_TBSV)
+    {
+        return generator_tbsv(buf, buflen, subdims, pgran, extra);
+    }
+
+    #ifdef DEBUG_TRSV_TRTRI
+     printf("TRSV GENERATOR called....\n");
+
+    if((( extraFlags->flags &  KEXTRA_TRANS_A) || ( extraFlags ->flags & KEXTRA_CONJUGATE_A )))
+    {
+        printf("A is trans or CONJ-TRANS\n");
+    }
+    else
+    {
+        printf("A is noTrans...\n");
+    }
+    #endif
+
+    clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
+    clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
+    clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
+    //bool unit = (((extraFlags->flags) & KEXTRA_UNIT_DIAGONAL) != 0);
+
+    // unity and doConj handled in setKernelArgs
+    if ( order == clblasRowMajor )
+    {
+        // Row major is treated as the column major case with the
+        // transposition and the triangle flipped.
+        order = clblasColumnMajor;
+        if ( trans == clblasNoTrans)
+        {
+            trans = clblasTrans;
+        }
+        else if ( trans == clblasTrans )
+        {
+            trans = clblasNoTrans;
+        }
+        else // clblasConjTrans
+        {
+            trans = clblasNoTrans;
+        }
+        uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper;
+    }
+
+    if ( trans == clblasNoTrans)
+    {
+        kernelTemplate = ( uplo == clblasLower ) ?
+                    (char*)trsv_CL_SolveTriangle_kernel :
+                    (char*)trsv_CU_SolveTriangle_kernel;
+    }
+    else // Transpose cases...
+    {
+        kernelTemplate = ( uplo == clblasLower ) ?
+                    (char*)trsv_CLT_SolveTriangle_kernel :
+                    (char*)trsv_CUT_SolveTriangle_kernel;
+    }
+    strcpy(tempTemplate, kernelTemplate);
+
+    #ifdef DEBUG_TRSV_TRTRI
+    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
+    #endif
+
+    // FIXME: VECTORSIZE HARD CODED
+    // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint
+    unsigned int vecLenA = extraFlags->vecLenA;
+
+    #ifdef DEBUG_TRSV_TRTRI
+    printf("Vector length used : %d\n\n", vecLenA);
+    #endif
+
+    bool doVLOAD = false;
+    if( extraFlags->flags &  KEXTRA_NO_COPY_VEC_A )
+    {
+        doVLOAD = true;
+        #ifdef DEBUG_TRSV_TRTRI
+            printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+        #endif
+    }
+    else
+    {
+        #ifdef DEBUG_TRSV_TRTRI
+            printf("Using Aligned Data Pointer .........................\n");
+        #endif
+    }
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
+
+    if (trans != clblasNoTrans)
+    {
+        sprintf( vector_size_trans, "%u", vecLenA );
+        // size_t is printed through an explicit cast so that the conversion
+        // specifier always matches the argument type.
+        sprintf( triangle_height, "%lu", (unsigned long)subdims[0].y );
+        #ifdef DEBUG_TRSV_TRTRI
+        printf("vector size trans = %s\n", vector_size_trans);
+        #endif
+        kobj.put("%PREFIXVECTOR_SIZE_TRANS", (const char *)vector_size_trans);
+        kobj.put("%TRIANGLE_HEIGHT", triangle_height);
+    }
+    kobj.spit((char*)buf, tempTemplate);
+    return (32 * 1024 * sizeof(char));
+}
+
+/*
+ * Set up the kernel arguments of the TRSV TRTRI kernel from the
+ * CLBlasKargs passed in params.
+ *
+ * Argument layout:
+ *    0  A (input matrix)         1  B (x / result buffer)
+ *    2  N                        3  vector increment (ldb.vector)
+ *    4  unit diagonal flag       5  lda
+ *    6  conjugate flag           7  startRow
+ *    8  endRow                   9  offa
+ *   10  offBX                   11  K (only when pigFuncID is CLBLAS_TBSV)
+ */
+static void
+assignKargs(KernelArg *args, const void *params, const void*)
+{
+    CLBlasKargs *blasArgs = (CLBlasKargs*)params;
+    cl_int incx = blasArgs->ldb.vector;
+    cl_int isUnitDiag = (blasArgs->diag == clblasUnit);
+    cl_int conjFlag = (blasArgs->transA == clblasConjTrans);
+
+#ifdef DEBUG_TRSV_TRTRI
+    printf("TRMV TRTRI: assignKargs: doConj is : %d, unity is : %d, incx is : %d\n", conjFlag, isUnitDiag, incx);
+    printf("TRMV TRTRI: startRow, startCol set to %d, %d\n", blasArgs->startRow, blasArgs->endRow);
+#endif
+
+    INIT_KARG(&args[0], blasArgs->A);     // A - input matrix
+    INIT_KARG(&args[1], blasArgs->B);     // x - result buffer = _xnew argument
+    initSizeKarg(&args[2], blasArgs->N);
+    INIT_KARG(&args[3], incx);
+    INIT_KARG(&args[4], isUnitDiag);
+    initSizeKarg(&args[5], blasArgs->lda.matrix);
+    INIT_KARG(&args[6], conjFlag);
+    INIT_KARG(&args[7], blasArgs->startRow);
+    INIT_KARG(&args[8], blasArgs->endRow);
+    initSizeKarg(&args[9], blasArgs->offa);
+    initSizeKarg(&args[10], blasArgs->offBX);
+
+    // the banded solver takes one more argument
+    if (blasArgs->pigFuncID == CLBLAS_TBSV)
+    {
+        initSizeKarg(&args[11], blasArgs->K);
+    }
+}
+
+/*
+ * isFitToLDS() is modelled on its "trsv_gemv" counterpart rather than on the
+ * kernels corresponding to TRTRI.
+ * The kernels corresponding to TRTRI are run with only 1 workgroup, so for
+ * them the outcome of this check does not really matter.
+ * But if dim[0].y selected by the library changes between TRTRI and
+ * TRSV_GEMV, results will go wrong. So, by using the same "isFitToLDS" logic,
+ * we indirectly force the library to choose the same "SubproblemDim" for
+ * both cases.
+ *
+ * Returns true when the estimated local memory requirement is strictly
+ * smaller than ldsSize.
+ */
+static bool
+isFitToLDS(
+    SubproblemDim *dim,
+    DataType dtype,
+    cl_ulong ldsSize,
+    const void *kernelArgs)
+{
+    CLBlasKargs *kargs = (CLBlasKargs *)kernelArgs;
+    const bool noTrans = (kargs->transA == clblasNoTrans);
+    const bool worstCaseEstimate =
+            (noTrans && (kargs->order == clblasColumnMajor)) ||
+            (!noTrans && (kargs->order == clblasRowMajor));
+    size_t blockSize;
+    cl_ulong required;
+    cl_ulong tw;
+
+    if (!worstCaseEstimate)
+    {
+        //
+        // These kernels use "TriangleWidth" amount of local memory for storing
+        // the RHS. "dim[0].y" is assumed to be the "TriangleWidth".
+        //
+        const size_t triArea = dim[0].y * dim[0].y;
+
+        blockSize = (triArea > 256) ? 256 : triArea;
+        required = (dim[0].y + blockSize) * dtypeSize(dtype);
+        return (required < ldsSize);
+    }
+
+    //
+    // Estimate worst case local memory needed - vector width of 4
+    // irrespective of data-type. The block size is halved, starting from 256,
+    // until getTargetWidth() returns a non-zero width or it drops to 1.
+    //
+    blockSize = 256;
+    tw = getTargetWidth(dim[0].y, blockSize, 4);
+    while ((tw == 0) && (blockSize > 1))
+    {
+        blockSize /= 2;
+        tw = getTargetWidth(dim[0].y, blockSize, 4);
+    }
+    #ifdef DEBUG_TRSV_TRTRI
+    printf("TRSV TRTRI: isFitLDS() tw = %lu\n", tw);
+    #endif
+
+    required = (1+4+tw)*dtypeSize(dtype) + blockSize*dtypeSize(dtype)*4;
+    #ifdef DEBUG_TRSV_TRTRI
+    printf("TRSV TRTRI: isFitLDS() maxSize = %lu, ldsSize = %lu, Y=%lu\n", required, ldsSize, dim[0].y);
+    #endif
+    return (required < ldsSize);
+}
+
+/*
+ * Kernel source generator for TBSV.
+ *
+ * When 'buf' is NULL only the size reserved for the kernel source is
+ * reported. Otherwise one of the four trsv_*_SolveTriangle kernel templates
+ * is selected from the uplo / order / transpose flags found in 'extra'
+ * (a CLBLASKernExtra), handed to a kprintf object together with the
+ * transpose-only replacements, and the result is written to 'buf'.
+ *
+ * 'pgran' is unused. Returns the fixed size reserved for the source (32 KiB).
+ *
+ * NOTE(review): 'buflen' is not checked when 'buf' is non-NULL; the caller is
+ * presumed to pass a buffer of at least the size reported by the buf == NULL
+ * query - confirm against the callers.
+ */
+static ssize_t
+generator_tbsv(
+   char *buf,
+   size_t buflen,
+   const struct SubproblemDim *subdims,
+   const struct PGranularity *pgran,
+   void *extra)
+{
+    char tempTemplate[32*1024];
+    // Large enough for any unsigned int / unsigned long value and terminator
+    char vector_size_trans[32], triangle_height[32];
+    const char *kernelSrc;
+
+    (void)pgran;    // not used by this generator
+    (void)buflen;   // see NOTE(review) above
+
+    if (buf == NULL) // PENDING: Return correct buffer size
+    {
+        return (ssize_t)sizeof(tempTemplate);
+    }
+
+    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
+
+    clblasUplo uplo   = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower;
+    clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
+    clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans);
+
+    // unity and doConj handled in setKernelArgs
+    if ( order == clblasColumnMajor )
+    {
+        // Column-major input is processed as the transposed row-major problem:
+        // NoTrans becomes Trans, Trans and ConjTrans both become NoTrans, and
+        // the triangle flips.
+        trans = ( trans == clblasNoTrans ) ? clblasTrans : clblasNoTrans;
+        uplo  = ( uplo == clblasUpper ) ? clblasLower : clblasUpper;
+    }
+
+    if ( trans == clblasNoTrans )
+    {
+        kernelSrc = ( uplo == clblasLower ) ?
+                    (const char*)trsv_CL_SolveTriangle_kernel :
+                    (const char*)trsv_CU_SolveTriangle_kernel;
+    }
+    else // Transpose cases...
+    {
+        kernelSrc = ( uplo == clblasLower ) ?
+                    (const char*)trsv_CLT_SolveTriangle_kernel :
+                    (const char*)trsv_CUT_SolveTriangle_kernel;
+    }
+    strcpy(tempTemplate, kernelSrc);
+
+    unsigned int vecLenA = extraFlags->vecLenA;
+
+    bool doVLOAD = false;
+    if( extraFlags->flags &  KEXTRA_NO_COPY_VEC_A )
+    {
+        doVLOAD = true;
+        #ifdef DEBUG_TRSV_TRTRI
+            printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
+        #endif
+    }
+    else
+    {
+        #ifdef DEBUG_TRSV_TRTRI
+            printf("Using Aligned Data Pointer .........................\n");
+        #endif
+    }
+    kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
+
+    if (trans != clblasNoTrans)
+    {
+        sprintf( vector_size_trans, "%u", vecLenA );
+        // Explicit cast so that the argument type always matches the
+        // conversion specifier, whatever the width of the 'y' field.
+        sprintf( triangle_height, "%lu", (unsigned long)subdims[0].y );
+        kobj.put("%PREFIXVECTOR_SIZE_TRANS", (const char *)vector_size_trans);
+        kobj.put("%TRIANGLE_HEIGHT", triangle_height);
+    }
+    kobj.spit((char*)buf, tempTemplate);
+    return (ssize_t)sizeof(tempTemplate);
+}
+
+
diff --git a/src/library/blas/gens/trxm_common.c b/src/library/blas/gens/trxm_common.c
new file mode 100644
index 0000000..0e4c68d
--- /dev/null
+++ b/src/library/blas/gens/trxm_common.c
@@ -0,0 +1,289 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+
+#include <matrix_props.h>
+#include <matrix_dims.h>
+
+#include "trxm_common.h"
+
+/*
+ * Emit the declaration of a TRMM/TRSM kernel through kgenDeclareFunction().
+ *
+ * The kernel is named "<prefix>tr<m|s>m<nameSuffix>": the prefix comes from
+ * dtypeToBlasPrefix(dtype), 'm' is used for CLBLAS_TRMM and 's' for any other
+ * funcID. A NULL nameSuffix is treated as an empty string. The work group
+ * size attribute is taken from pgran->wgSize[0].
+ *
+ * kflags selects the optional trailing "uint" arguments (offset<dim> for
+ * KEXTRA_STARTM_NOT_ZERO and KEXTRA_STARTN_NOT_ZERO, offA, offB) and, through
+ * KEXTRA_SIDE_RIGHT, swaps the 'M'/'N' names.
+ * declareC adds a "__global <type> *C" argument after B.
+ * restrictPointers qualifies A with "const"/"restrict"; B gets the same
+ * qualifiers only when declareC is set as well.
+ */
+void
+declareTrxmKernel(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const PGranularity *pgran,
+    KernelExtraFlags kflags,
+    BlasFunctionID funcID,
+    const char *nameSuffix,
+    bool declareC,
+    bool restrictPointers)
+{
+    // swap coordinate names for the right side
+    const char dimNames[2] = {'M', 'N'};
+    const int rightSide = ((kflags & KEXTRA_SIDE_RIGHT) != 0);
+    const char *typeName = dtypeBuiltinType(dtype);
+    const char blasPrefix = dtypeToBlasPrefix(dtype);
+    const char funcLetter = (funcID == CLBLAS_TRMM) ? 'm' : 's';
+    // type qualifiers: one goes before "__global", one before the name
+    const char *constA = restrictPointers ? "const " : "";
+    const char *restrictA = restrictPointers ? "restrict " : "";
+    const char *constB = (restrictPointers && declareC) ? "const " : "";
+    const char *restrictB = (restrictPointers && declareC) ? "restrict " : "";
+    char decl[1024];
+    char argC[1024];
+    char offsetArgs[1024];
+    int used = 0;
+
+    if (nameSuffix == NULL) {
+        nameSuffix = "";
+    }
+
+    // optional output matrix argument
+    argC[0] = '\0';
+    if (declareC) {
+        sprintf(argC, "    __global %s *C,\n", typeName);
+    }
+
+    // optional offset arguments, appended one after another
+    offsetArgs[0] = '\0';
+    if (kflags & KEXTRA_STARTM_NOT_ZERO) {
+        used += sprintf(offsetArgs + used, ",\n    uint offset%c",
+                        dimNames[rightSide]);
+    }
+    if (kflags & KEXTRA_STARTN_NOT_ZERO) {
+        used += sprintf(offsetArgs + used, ",\n    uint offset%c",
+                        dimNames[1 - rightSide]);
+    }
+    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
+        used += sprintf(offsetArgs + used, ",\n    uint offA");
+    }
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        sprintf(offsetArgs + used, ",\n    uint offB");
+    }
+
+    sprintf(decl, "__attribute__((reqd_work_group_size(%u, 1, 1)))\n"
+                  "void __kernel\n"
+                  "%ctr%cm%s(\n"
+                  "    uint %c,\n"
+                  "    uint %c,\n"
+                  "    %s alpha,\n"
+                  "    %s__global %s *%sA,\n"
+                  "    uint lda,\n"
+                  "    %s__global %s *%sB,\n"
+                  "%s"
+                  "    uint ldb%s)\n",
+            pgran->wgSize[0], blasPrefix, funcLetter, nameSuffix,
+            dimNames[rightSide], dimNames[1 - rightSide], typeName,
+            constA, typeName, restrictA,
+            constB, typeName, restrictB,
+            argC, offsetArgs);
+
+    kgenDeclareFunction(ctx, decl);
+}
+
+
+/*
+ * Generate "B += <shift>;" (and "C += <shift>;" when useC is set), where
+ * <shift> is the sum of the terms enabled in kflags:
+ *   - "offB" for KEXTRA_BX_OFF_NOT_ZERO,
+ *   - the offset named after the first coordinate for KEXTRA_STARTM_NOT_ZERO,
+ *   - the offset named after the second one for KEXTRA_STARTN_NOT_ZERO.
+ * Depending on KEXTRA_COLUMN_MAJOR one of the two coordinate offsets is
+ * multiplied by ldb. Coordinate names are swapped for KEXTRA_SIDE_RIGHT.
+ * Nothing is generated when none of the flags is set.
+ */
+void
+genTrxmBMatrShift(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags,
+    bool useC)
+{
+    const char dimNames[2] = {'M', 'N'};
+    const int rightSide = (int)((kflags & KEXTRA_SIDE_RIGHT) != 0);
+    const bool colMajor = ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
+    char stmt[1024], shift[1024];
+    const char *sep;
+    int used = 0;
+
+    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
+        used = sprintf(shift, "offB");
+    }
+    if (kflags & KEXTRA_STARTM_NOT_ZERO) {
+        sep = (used) ? " + " : "";
+        // scaled by ldb in the row-major case only
+        used += sprintf(shift + used, "%soffset%c%s",
+                        sep, dimNames[rightSide],
+                        colMajor ? "" : " * ldb");
+    }
+    if (kflags & KEXTRA_STARTN_NOT_ZERO) {
+        sep = (used) ? " + " : "";
+        // scaled by ldb in the column-major case only
+        used += sprintf(shift + used, "%soffset%c%s",
+                        sep, dimNames[1 - rightSide],
+                        colMajor ? " * ldb" : "");
+    }
+
+    if (!used) {
+        // no shift requested
+        return;
+    }
+
+    sprintf(stmt, "B += %s;\n", shift);
+    kgenAddStmt(ctx, stmt);
+    if (useC) {
+        sprintf(stmt, "C += %s;\n", shift);
+        kgenAddStmt(ctx, stmt);
+    }
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Fold the offsetM/offsetN start offsets into the buffer offsets offA and
+ * offBX, then reset offsetM and offsetN to zero.
+ *
+ * Matrix A is advanced by the same amount along both of its dimensions
+ * (offsetN for the right side, offsetM otherwise). For matrix B, which of
+ * the two offsets is scaled by ldb depends on the storage order.
+ */
+void
+fixupTrxmKargs(CLBlasKargs *kargs)
+{
+    size_t diag;
+    size_t strided, unit;
+
+    if (kargs->side == clblasRight) {
+        diag = kargs->offsetN;
+    }
+    else {
+        diag = kargs->offsetM;
+    }
+    kargs->offA += diag * kargs->lda.matrix + diag;
+
+    if (kargs->order == clblasColumnMajor) {
+        strided = kargs->offsetN;
+        unit = kargs->offsetM;
+    }
+    else {
+        strided = kargs->offsetM;
+        unit = kargs->offsetN;
+    }
+    kargs->offBX += strided * kargs->ldb.matrix + unit;
+
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+}
+
+/*
+ * Format an additive term for generated code: writes " + <val>" to 'buf',
+ * or an empty string when 'val' is zero, to avoid " + 0" statements.
+ * 'buf' must have room for the text and its terminator.
+ */
+static void
+genAdd(char *buf, size_t val)
+{
+    if (val == 0) {
+        buf[0] = '\0'; // zero length string
+        return;
+    }
+    /*
+     * "%lu" expects an unsigned long; size_t is a different type on some
+     * platforms, so convert explicitly instead of passing it as is.
+     */
+    sprintf(buf, " + %lu", (unsigned long)val);
+}
+
+/*
+ * Post-fetch callback for TRXM tiles.
+ *
+ * For either matrix role, defaultTilePostFetch() is invoked first when
+ * checkForTailFetches() reports anything other than FETCH_NO_TAILS. For
+ * MATRIX_B nothing else is done and 0 is returned.
+ *
+ * Otherwise (the triangular matrix A) one statement per fetched tile element
+ * is generated that replaces the element with 0 when "zy" compares '<' (or
+ * '>', depending on the upper-triangle and column-major flags) against the
+ * element's other coordinate. With KEXTRA_UNIT_DIAGONAL a second statement
+ * replaces the element with one where both coordinates are equal.
+ * pfPriv->fetchNumA is advanced once per processed block row.
+ *
+ * 'priv' must point to a TilePostFetchPrivate. Returns the result of
+ * kgenEndBranch() in the matrix A case.
+ */
+int
+genTrxmPostFetchZero(
+    struct KgenContext *ctx,
+    MatrixRole mrole,
+    void *priv)
+{
+    TilePostFetchPrivate *pfPriv = (TilePostFetchPrivate*)priv;
+    const CLBLASKernExtra *kextra = pfPriv->gset->kextra;
+    const KernelVarNames *vnames = &pfPriv->gset->varNames;
+    const struct SubproblemDim *dims = &pfPriv->gset->subdims[1];
+    const Tile *tileA = &pfPriv->gset->tileA;
+    KernelExtraFlags kflags = kextra->flags;
+    DataType dtype = pfPriv->gset->kextra->dtype;
+    char stmt[1024];
+    char addend[512];
+    char outerVar[64], innerVar[64];
+    size_t nrOuter, nrInner;
+    unsigned int outer, inner;
+    char cmp;
+    bool distinctVecLen;
+    bool colMajAccess;
+    Kstring elem;
+
+    // For both A and B tiles, zero tail along K
+    distinctVecLen = ((pfPriv->gset->flags & BGF_DISTINCT_VECLEN));
+    if (checkForTailFetches(pfPriv->funcID, dims, kextra,
+                            mrole, distinctVecLen, true) != FETCH_NO_TAILS) {
+        defaultTilePostFetch(ctx, mrole, &pfPriv[1]);
+    }
+
+    if (mrole == MATRIX_B) {
+        /* Not the triangular matrix, nothing more to do */
+        return 0;
+    }
+
+    // Zero the triangular part of tile A; either a single row of the tile
+    // or the whole tile has been fetched.
+    colMajAccess = isMatrixAccessColMaj(pfPriv->funcID, kflags, mrole);
+    if (colMajAccess) {
+        nrOuter = pfPriv->wholeA ? dims->bwidth : 1;
+        nrInner = dims->y;
+        sprintf(innerVar, "%s", vnames->coordA);
+        sprintf(outerVar, "%s", vnames->k);
+    }
+    else {
+        nrOuter = pfPriv->wholeA ? dims->y : 1;
+        nrInner = dims->bwidth;
+        sprintf(innerVar, "%s", vnames->k);
+        sprintf(outerVar, "%s", vnames->coordA);
+    }
+
+    // The comparison direction is the same for every element of the tile
+    cmp = (((kflags & KEXTRA_UPPER_TRIANG) != 0) ^
+           ((kflags & KEXTRA_COLUMN_MAJOR) != 0)) ? '>' : '<';
+
+    kgenAddStmt(ctx, "// post fetch A\n");
+    kgenBeginBranch(ctx, NULL);
+
+    genAdd(addend, (size_t)pfPriv->fetchNumA);
+    sprintf(stmt, "uint zy = %s%s;\n", outerVar, addend);
+    kgenAddStmt(ctx, stmt);
+
+    // walk the block rows (a single one unless the whole tile was fetched)
+    for (outer = 0; outer < nrOuter; outer++) {
+        // walk all elements of the block row
+        for (inner = 0; inner < nrInner; inner++) {
+            const unsigned int row = colMajAccess ? inner : outer;
+            const unsigned int col = colMajAccess ? outer : inner;
+
+            genAdd(addend, inner);
+            sprintfTileElement(&elem, tileA, row, col, 1);
+
+            sprintf(stmt, "%s = zy %c %s%s ? 0 : %s;\n",
+                    elem.buf,
+                    cmp, innerVar, addend,
+                    elem.buf);
+            kgenAddStmt(ctx, stmt);
+
+            if (kflags & KEXTRA_UNIT_DIAGONAL) {
+                const char *one = strOne(dtype);
+
+                sprintf(stmt, "%s = zy == %s%s ? "
+                        "%s : %s;\n",
+                        elem.buf, innerVar, addend,
+                        one, elem.buf);
+                kgenAddStmt(ctx, stmt);
+            }
+        }
+        // no increment is needed after the last block row
+        if (outer != nrOuter - 1) {
+            kgenAddStmt(ctx, "zy++;\n");
+        }
+        pfPriv->fetchNumA++;
+    }
+
+    return kgenEndBranch(ctx, NULL);
+}
diff --git a/src/library/blas/gens/trxm_common.h b/src/library/blas/gens/trxm_common.h
new file mode 100644
index 0000000..0914712
--- /dev/null
+++ b/src/library/blas/gens/trxm_common.h
@@ -0,0 +1,139 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TRXM_BUFS_COMMON_H_
+#define TRXM_BUFS_COMMON_H_
+
+#include "blas_kgen.h"
+#include "gen_helper.h"
+#include "blas_funcs.h"
+
+/*
+ * COMMON NOTES:
+ * To use the functions the caller must guarantee kernel argument
+ * naming and subproblem dimensions independent of the side.
+ * That means size of A must be named as 'M'. The 'y' field of dimensions
+ * must be a step over rows of the matrix A in case of the left side, and over
+ * columns of the matrix otherwise. Similarly the 'x' field must be a step
+ * over columns of the matrix B in case of the left side, and over rows of
+ * the matrix otherwise. Both 'A' and 'B' are passed in global buffers.
+ */
+
+void
+declareTrxmKernel(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const PGranularity *pgran,
+    KernelExtraFlags kflags,
+    BlasFunctionID funcID,
+    const char *nameSuffix,
+    bool declareC,
+    bool restrictPointers);
+
+/*
+ * Declare local variables for LDS based version
+ * of TRXM kernels.
+ *
+ * It provides the names typical for other generators as well:
+ *
+ * lid, gid - local and global ID.
+ * m0, k0 - top level counters over M and N
+ * currM, currN - current block coordinates over M and N at the top level
+ * tempA, tempB - blocks of matrix A and B located in the local memory
+ * tempC - block of matrix C located in the local memory; declared if
+ *      the 'useLocalC' argument is set
+ * c - matrix C tile located in registers; declared if the 'useLocalC'
+ *      argument is not set
+ * x, y - auxiliary variables to evaluate size of read/write blocks
+ *
+ * TRXM specific variables:
+ *
+ * startM, endM - starting and end coordinate over rows a kernel can access
+ */
+void
+declareLdsBasedTrxmVariables(
+    struct KgenContext *ctx,
+    DataType dtype,
+    const SubproblemDim *dims,
+    const PGranularity *pgran,
+    bool useLocalC);
+
+/*
+ * NOTE: all the following functions generate code
+ *       using the local variables declared with the
+ *       'declareLdsBasedTrxmVariables' function
+ */
+
+void
+genPrepareTrxmBlockA(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags flags,
+    const char *nameM);
+
+void
+genPrepareTrxmBlockB(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    const CopyBufFuncs *copyFuncs,
+    const ZeroFuncs *zeroFuncs,
+    KernelExtraFlags flags);
+
+void
+genUpdateTrxmResult(
+    struct KgenContext *ctx,
+    const SubproblemDim *dims,
+    char *fnName,
+    char *genericFnName,
+    KernelExtraFlags kflags);
+
+/*
+ * Triangulate matrix block. The decision to triangulate is
+ * made based on the current coordinates.
+ */
+void
+genTriangMatrBlock(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    DataType dtype,
+    KernelExtraFlags kflags);
+
+/*
+ * Move matrix B start pointer according to offsetM, offsetN.
+ */
+void
+genTrxmBMatrShift(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags,
+    bool useC);
+
+void
+fixupTrxmKargs(CLBlasKargs *kargs);
+
+/* Set upper/lower triangle elements to zero and optionally set diagonal
+ * elements to one after fetching */
+int
+genTrxmPostFetchZero(
+    struct KgenContext *ctx,
+    MatrixRole mrole,
+    void *priv);
+
+#endif /* TRXM_BUFS_COMMON_H_ */
diff --git a/src/library/blas/gens/tuned_numbers.c b/src/library/blas/gens/tuned_numbers.c
new file mode 100644
index 0000000..18603ae
--- /dev/null
+++ b/src/library/blas/gens/tuned_numbers.c
@@ -0,0 +1,418 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include "tuned_numbers.h"
+
+#define USE_TUNED_NUMBERS   /* when defined, bestBlockSizeForDevice() also fills the per-chip rows; otherwise only the CHIP_UNKNOWN defaults are set */
+
+typedef enum callType   // Routine variants with their own block sizes; used as the last index of bestBlockSizes
+{
+    GEMM_NN_CALL,    // GEMM: transA == NoTrans, transB == NoTrans
+    GEMM_NT_CALL,    // GEMM: transA == NoTrans, transB != NoTrans
+    GEMM_TN_CALL,    // GEMM: transA != NoTrans, transB == NoTrans
+    GEMM_TT_CALL,    // GEMM: transA != NoTrans, transB != NoTrans
+
+    HERK_UN_CALL,    // HERK: uplo == Upper, transA == NoTrans
+    HERK_UC_CALL,    // HERK: uplo == Upper, transA != NoTrans (Conj-trans)
+    HERK_LN_CALL,    // HERK: uplo != Upper (Lower), transA == NoTrans
+    HERK_LC_CALL,    // HERK: uplo != Upper (Lower), transA != NoTrans (Conj-trans)
+
+    SYMM_LU_CALL,   // SYMM: side == Left, uplo == Upper
+    SYMM_RU_CALL,   // SYMM: side != Left (Right), uplo == Upper
+    SYMM_LL_CALL,   // SYMM: side == Left, uplo != Upper (Lower)
+    SYMM_RL_CALL,   // SYMM: side != Left (Right), uplo != Upper (Lower)
+
+    HEMM_LU_CALL,   // HEMM: side == Left, uplo == Upper
+    HEMM_RU_CALL,   // HEMM: side != Left (Right), uplo == Upper
+    HEMM_LL_CALL,   // HEMM: side == Left, uplo != Upper (Lower)
+    HEMM_RL_CALL,   // HEMM: side != Left (Right), uplo != Upper (Lower)
+
+    NUM_CALL_TYPES  // Number of call types above; sizes the last dimension of bestBlockSizes
+
+} callType;
+
+
+
+
+/*
+ * Map the kernel arguments to the call variant used to index the
+ * block-size table.
+ *
+ * The variant is selected from pigFuncID plus transA/transB (GEMM),
+ * uplo/transA (HERK) or side/uplo (SYMM, HEMM and their *_DIAGONAL forms).
+ *
+ * Any other function ID falls back to GEMM_NN_CALL so that the caller never
+ * indexes the table with an indeterminate value.
+ */
+static callType
+tunedCallTypeForKargs( const CLBlasKargs *kargs )
+{
+    if( kargs->pigFuncID == CLBLAS_GEMM2 )
+    {
+        if( kargs->transA == clblasNoTrans )
+            return ( kargs->transB == clblasNoTrans ) ? GEMM_NN_CALL : GEMM_NT_CALL;
+
+        return ( kargs->transB == clblasNoTrans ) ? GEMM_TN_CALL : GEMM_TT_CALL;
+    }
+
+    if( kargs->pigFuncID == CLBLAS_HERK )
+    {
+        if( kargs->uplo == clblasUpper )
+            return ( kargs->transA == clblasNoTrans ) ? HERK_UN_CALL : HERK_UC_CALL;
+
+        return ( kargs->transA == clblasNoTrans ) ? HERK_LN_CALL : HERK_LC_CALL;
+    }
+
+    if( (kargs->pigFuncID == CLBLAS_SYMM) || (kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) )
+    {
+        if( kargs->side == clblasLeft )
+            return ( kargs->uplo == clblasUpper ) ? SYMM_LU_CALL : SYMM_LL_CALL;
+
+        return ( kargs->uplo == clblasUpper ) ? SYMM_RU_CALL : SYMM_RL_CALL;
+    }
+
+    if( (kargs->pigFuncID == CLBLAS_HEMM) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL) )
+    {
+        if( kargs->side == clblasLeft )
+            return ( kargs->uplo == clblasUpper ) ? HEMM_LU_CALL : HEMM_LL_CALL;
+
+        return ( kargs->uplo == clblasUpper ) ? HEMM_RU_CALL : HEMM_RL_CALL;
+    }
+
+    // Function without tuned numbers: use a defined default instead of
+    // leaving the table index uninitialized.
+    return GEMM_NN_CALL;
+}
+
+/*
+ * Halve the dimensions of 'bs' until TY*TX fits into maxWGSize.
+ *
+ * The smaller dimension is halved first, as before, but a dimension that is
+ * already 1 is never halved; the other one is reduced instead. Previously a
+ * small maxWGSize (e.g. TY = 32, TX = 8, maxWGSize = 16) drove TX down to 0,
+ * which ended the loop with a zero-sized work-group dimension.
+ */
+static void
+shrinkToMaxWorkgroupSize( blockSizes *bs, size_t maxWGSize )
+{
+    while( ( ((size_t)bs->TY)*((size_t)bs->TX) ) > maxWGSize )
+    {
+        if( (bs->TX < bs->TY) && (bs->TX > 1) )
+            bs->TX /= 2;
+        else if( bs->TY > 1 )
+            bs->TY /= 2;
+        else if( bs->TX > 1 )
+            bs->TX /= 2;
+        else
+            break;      // 1 x 1 already, cannot shrink any further
+    }
+}
+
+/*
+ * Return the block sizes to use for the solution step 'step'.
+ *
+ * The entry is looked up by device chip (filled in by identifyDevice()),
+ * data type and call variant. If the chip has no entry for that combination
+ * (any of TY, TX, ITEMY, ITEMX is zero), the CHIP_UNKNOWN defaults are used.
+ * TY*TX is then reduced to fit the device's maximum work-group size.
+ *
+ * step - solution step; step->args and step->device are read, and
+ *        step->device is passed to identifyDevice()
+ */
+blockSizes bestBlockSizeForDevice( SolutionStep *step )
+{
+    blockSizes temp;
+    callType currCall;
+    CLBlasKargs *kargs = &(step->args);
+    TargetDevice *kDevice = &(step->device);
+    size_t maxWGSize;
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+	// QUICK FIX: code changed using a fast regex search-replace:
+	// The tagged array-of-structs initialization - which works only with gcc - was removed;
+	// the global static variable was moved here and the values are assigned as individual statements.
+	// The table is rewritten on every call, so this is not thread-safe; fix this if thread safety is needed
+
+
+	static blockSizes bestBlockSizes [NUM_DEVICE_CHIPS][4][NUM_CALL_TYPES];         // [NUM_DEVICE_CHIPS][NUM_DATATYPES][NUM_CALL_TYPES]
+
+	// Block sizes for unknown devices -- using default numbers
+
+	{ blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][GEMM_TN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][GEMM_TT_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][GEMM_TN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 2, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][GEMM_TT_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 2, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][GEMM_TT_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][GEMM_TN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 1, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][GEMM_TT_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HERK_UC_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HERK_LN_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HERK_LC_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HERK_UN_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HERK_UC_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HERK_LN_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HERK_LC_CALL] = t; }
+
+	{ blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HEMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HEMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HEMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HEMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HEMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HEMM_RL_CALL] = t; }
+
+	#ifdef USE_TUNED_NUMBERS
+
+	// Block sizes for Cayman
+	{ blockSizes t = { 32, 4, 4, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 8, 1 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 8, 8, 8, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 8, 8, 8, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 8, 8, 8, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 2, 1 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 4, 16, 8, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; }
+
+	{ blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 32, 4, 4, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 8, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 8, 8, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 8, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; }
+
+	// Block sizes for Tahiti
+	{ blockSizes t = { 32, 8, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 32, 8, 8, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 32, 8, 4, 4, 1 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 4, 1 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 4, 1 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 32, 8, 4, 4, 1 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 2, 1 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 2, 1 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; }
+	{ blockSizes t = { 4, 16, 8, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HERK_UC_CALL] = t; }
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HERK_LN_CALL] = t; }
+	{ blockSizes t = { 8, 32, 8, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HERK_LC_CALL] = t; }
+
+	{ blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HERK_UN_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HERK_UC_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HERK_LN_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HERK_LC_CALL] = t; }
+
+	{ blockSizes t = { 32, 8, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 32, 8, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 32, 8, 8, 4, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; }
+	{ blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HEMM_LL_CALL] = t; }
+	{ blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HEMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HEMM_LU_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HEMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HEMM_LL_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HEMM_RL_CALL] = t; }
+
+	// Block-sizes for Cypress
+	{ blockSizes t = { 32, 8, 4, 8, 1 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 8, 8, 8, 1 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 8, 1 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 8, 4, 1 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 4, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 8, 32, 4, 4, 1 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 8, 32, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HERK_UC_CALL] = t; }
+	{ blockSizes t = { 8, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HERK_LN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HERK_LC_CALL] = t; }
+
+	{ blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 8, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 8, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 8, 8, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 8, 8, 4, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; }
+	{ blockSizes t = { 32, 4, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; }
+	{ blockSizes t = { 4, 32, 4, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HEMM_LL_CALL] = t; }
+	{ blockSizes t = { 32, 4, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HEMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][HEMM_LU_CALL] = t; }
+	{ blockSizes t = { 32, 4, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][HEMM_RU_CALL] = t; }
+	{ blockSizes t = { 4, 16, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][HEMM_LL_CALL] = t; }
+	{ blockSizes t = { 32, 4, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][HEMM_RL_CALL] = t; }
+
+	// Block-sizes for GeForce GTX 580
+	{ blockSizes t = { 16, 32, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 16, 32, 4, 8, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 32, 16, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 16, 32, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 32, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; }
+	{ blockSizes t = { 32, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; }
+
+	{ blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][GEMM_NN_CALL] = t; }
+	{ blockSizes t = { 32, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][GEMM_NT_CALL] = t; }
+
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; }
+	{ blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HERK_UC_CALL] = t; }
+	{ blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HERK_LN_CALL] = t; }
+	{ blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HERK_LC_CALL] = t; }
+
+	{ blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][HERK_UN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][HERK_UC_CALL] = t; }
+	{ blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][HERK_LN_CALL] = t; }
+	{ blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][HERK_LC_CALL] = t; }
+
+	{ blockSizes t = { 32, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 8, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 4, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; }
+
+	{ blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; }
+	{ blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HEMM_LL_CALL] = t; }
+	{ blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HEMM_RL_CALL] = t; }
+
+	#endif      // USE_TUNED_NUMBERS
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    identifyDevice( kDevice );          // Queries the device name and stores it in the structure
+
+    currCall = tunedCallTypeForKargs( kargs );
+
+    temp = bestBlockSizes [ (kDevice->ident).chip ] [kargs->dtype] [currCall];
+
+    if( (temp.TY == 0) || (temp.TX == 0) || (temp.ITEMY == 0) || (temp.ITEMX == 0) )
+    {
+        // If optimal block-sizes for the device are not available,
+        // we take the default block-sizes
+        temp = bestBlockSizes [CHIP_UNKNOWN] [kargs->dtype] [currCall];
+    }
+
+    maxWGSize = deviceMaxWorkgroupSize( (kDevice->id), NULL );
+
+    // Make TY*TX fit the device limit without ever producing a zero dimension
+    shrinkToMaxWorkgroupSize( &temp, maxWGSize );
+
+    return temp;
+}
+
diff --git a/src/library/blas/gens/tuned_numbers.h b/src/library/blas/gens/tuned_numbers.h
new file mode 100644
index 0000000..01b6538
--- /dev/null
+++ b/src/library/blas/gens/tuned_numbers.h
@@ -0,0 +1,45 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef _TUNED_NUMBERS_
+#define _TUNED_NUMBERS_
+
+#include <clBLAS.h>
+#include <cltypes.h>
+#include <devinfo.h>
+#include <solution_seq.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct blockSizes           // One block-size table entry, as returned by bestBlockSizeForDevice()
+{
+    unsigned char TY;               // Not more than 32. TY*TX is reduced to fit the device's maximum work-group size
+    unsigned char TX;               // Second work-group dimension; presumably also limited to 32 - confirm
+    unsigned char ITEMY:7;          // Not more than 8. NOTE(review): presumably the per-work-item extent along Y - confirm
+    unsigned char ITEMX:7;          // Not more than 8. NOTE(review): presumably the per-work-item extent along X - confirm
+    unsigned char useBarrier:1;     // NOTE(review): presumably tells the kernel generator to emit a barrier - confirm
+} blockSizes;                       // A zero TY, TX, ITEMY or ITEMX marks a missing entry. Bit-fields of type unsigned char are implementation-defined in ISO C
+
+blockSizes bestBlockSizeForDevice( SolutionStep *step );   /* Block sizes for step->device and step->args (function, data type, trans/side/uplo variant);
+                                                            * defaults are used when the chip has no entry, and TY*TX is reduced to fit the
+                                                            * device's maximum work-group size. */
+
+#ifdef __cplusplus
+}       /* extern "C" { */
+#endif
+
+#endif // _TUNED_NUMBERS_
diff --git a/src/library/blas/gens/xxmv_common.c b/src/library/blas/gens/xxmv_common.c
new file mode 100644
index 0000000..3f27512
--- /dev/null
+++ b/src/library/blas/gens/xxmv_common.c
@@ -0,0 +1,346 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <clblas_stddef.h>
+#include "xxmv_common.h"
+
+/*
+ * Write the OpenCL index expression "(sum + val) * mul" to buf.
+ * sum and mul may each be NULL, in which case that term is left out.
+ * Products are emitted as mad24() with the operands cast to 'type'.
+ * buf is not bounds-checked: the caller must supply a large enough buffer.
+ */
+static void
+genMul(char *buf, size_t val, const char* type, const char* sum, const char* mul)
+{
+    /* "%lu" needs unsigned long; size_t differs on LLP64 (64-bit Windows), and
+     * "%zu" is avoided because older MSVC runtimes lack it.  Callers in this
+     * file pass unsigned int counters, so the conversion loses nothing. */
+    const unsigned long uval = (unsigned long)val;
+
+    if (mul == NULL) {
+        if (sum == NULL) {
+            sprintf(buf, "%lu", uval);
+        }
+        else if (val == 0) {
+            sprintf(buf, "%s", sum);                /* sum + 0 */
+        }
+        else {
+            sprintf(buf, "%s + %lu", sum, uval);
+        }
+    }
+    else if (sum == NULL) {
+        if (val == 0) {
+            sprintf(buf, "0");                      /* 0 * mul */
+        }
+        else if (val == 1) {
+            sprintf(buf, "%s", mul);                /* 1 * mul */
+        }
+        else {
+            sprintf(buf, "mad24((%s)%lu, (%s)%s, (%s)0)",
+                    type, uval, type, mul, type);
+        }
+    }
+    else if (val == 0) {
+        sprintf(buf, "mad24((%s)%s, (%s)%s, (%s)0)",
+                type, sum, type, mul, type);        /* sum * mul */
+    }
+    else {
+        sprintf(buf, "mad24((%s)%s + %lu, (%s)%s, (%s)0)",
+                type, sum, uval, type, mul, type);
+    }
+}
+
+/* Emit statements that fetch elements of vector x into the items of 'tile'. */
+void
+genFetchX(
+    struct KgenContext *ctx,
+    Tile *tile,
+    unsigned int vecLen,
+    DataType dtype,
+    const KernelVarNames *varNames,
+    TileMulFlags tflags,
+    KernelExtraFlags kflags)
+{
+    Kstring kstr[1];    /* NOTE(review): two tiles are passed to forEachTile(); presumably the NULL-named memtile gets no string -- confirm */
+    Tile memtile;
+    char tmp[1024], strMul[128];
+    unsigned int n;
+    const char *ptrName;
+    bool tailN = (tflags & TILEMUL_SKEW_B) != 0;    /* emit "k + n < sizeK" guards */
+    bool incxOne = ((kflags & KEXTRA_INCX_ONE) != 0);
+    bool elemFetch = ((kflags & KEXTRA_NO_COPY_VEC_B) != 0);
+    unsigned int nfetch = !tailN && incxOne && !elemFetch ? vecLen : 1; /* 5th initTile() argument below */
+
+    (void)dtype;    /* unused; the code below consults tile->dtype instead */
+    initTile(&memtile, NULL, tile->nrRows, tile->nrCols, nfetch,
+             tile->dtype, tile->storType,  tile->trans, tile->packed);
+    getVectorTypeName(tile->dtype, vecLen, NULL, &ptrName);
+
+    if (!tailN && incxOne && !elemFetch) {  /* vector fetch through varNames->B */
+        sprintf(tmp, "const uint xk = %s / %u;\n", varNames->k, vecLen);
+        kgenAddStmt(ctx, tmp);
+        for (n = 0; forEachTile(kstr, n, 0, 2, tile, &memtile); n++) {
+            sprintf(tmp,"%s = %s.%s[xk + %u];\n",
+                        kstr[0].buf, varNames->B, ptrName, n);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+    else {  /* element fetch from X; the index string comes from genMul() */
+        for (n = 0; forEachTile(kstr, n, 0, 2, tile, &memtile); n++) {
+            genMul(strMul, n, "int", "k", incxOne ? NULL : "incx");
+            if (tailN) {    /* indices with k + n >= sizeK read X[0] instead */
+                sprintf(tmp,"%s = X[k + %u < %s ? %s : 0];\n",
+                 kstr[0].buf, n, varNames->sizeK, strMul);
+            }
+            else {
+                sprintf(tmp,"%s = X[%s];\n",kstr[0].buf, strMul);
+            }
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+
+    if (tailN) {    /* zero every item whose k + n is not below sizeK */
+        for (n = 0; forEachTile(kstr, n, 0, 2, tile, &memtile); n++) {
+            sprintf(tmp,"%s = k + %u < %s ? %s : 0;\n",
+                        kstr[0].buf, n, varNames->sizeK, kstr[0].buf);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+}
+
+/*
+ * Emit the statement that advances Y by axVar; the step is multiplied by
+ * incy unless KEXTRA_INCY_ONE is set in kflags.
+ */
+void
+setResultPos(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags,
+    const char *axVar)
+{
+    char stmt[2048];
+    const char *fmt = (kflags & KEXTRA_INCY_ONE) ? "Y += %s;\n"
+                                                  : "Y += incy * (int)%s;\n";
+
+    /* fmt is always one of the two literals above, never caller text */
+    sprintf(stmt, fmt, axVar);
+    kgenAddStmt(ctx, stmt);
+}
+
+void
+updateResultVectorTiled(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags,
+    unsigned int vecLen,
+    Tile *tile)
+{   /* Emit code that reads Y into a temporary tile, combines it with 'tile' using alpha/beta, and writes Y back. */
+    bool beta0 = ((kflags & KEXTRA_BETA_ZERO) != 0);    /* the beta term is left out of the emitted update */
+    bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0);
+    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);      /* emit "coordA + n >= M" guards */
+    bool isComplex = isComplexType(tile->dtype);
+    unsigned int n, i;
+    const char *outTypeName, *outPtrName;
+    Tile result, memtile;
+
+    char tmp[2048],strMul[256];
+    Kstring kstr[2];
+
+    if (isComplex) {
+        vecLen = 1;     /* complex data is accessed one element at a time */
+    }
+    initTile(&result, "r", tile->nrRows, tile->nrCols, tile->nrRows,
+                    tile->dtype, tile->storType, true, tile->packed);
+    declareOneTileStorage(ctx, &result);    /* tile "r" receives the Y values, then the final result */
+
+    memtile = result;
+    memtile.baseName = NULL;
+    memtile.vecLen = !tailM && incyOne ? vecLen : 1;
+    getVectorTypeName(memtile.dtype, memtile.vecLen, &outTypeName, &outPtrName);
+
+    sprintf(tmp,"GPtr uC;\n"
+                "uC.f = Y;\n");
+    kgenAddStmt(ctx, tmp);
+
+    if (!tailM && incyOne) {    /* load Y: vector reads through uC */
+        for (n = 0; forEachTile(kstr, n, 0, 2, &result, &memtile); n++) {
+            sprintf(tmp,"%s = uC.%s[%u];\n",
+                        kstr[0].buf, outPtrName, n);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+    else {                      /* load Y: element reads, index string from genMul() */
+        for (n = 0; forEachTile(kstr, n, 0, 2, &result, &memtile); n++) {
+            genMul(strMul, n, "int", NULL, incyOne ? NULL : "incy");
+            if (tailM) {        /* indices with coordA + n >= M read Y[0] instead */
+                sprintf(tmp,"%s = Y[coordA + %u >= M ? 0 : %s];\n",
+                        kstr[0].buf, n, strMul);
+            }
+            else {
+                sprintf(tmp,"%s = Y[%s];\n",
+                        kstr[0].buf, strMul);
+            }
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+
+    if (isComplex) {            /* complex update: r = r*beta + tile*alpha, or r = tile*alpha when beta0 */
+        const char *complVec =
+                    isDoubleBasedType(tile->dtype) ? "double2" : "float2";
+        Tile onetile = result;
+        onetile.baseName = NULL;
+        onetile.vecLen = 1;
+        for (n = 0; forEachTile(kstr, n, 0, 3, &result, tile, &onetile); n++) {
+            if (beta0) {
+                sprintf(tmp,
+                       "%s = %s * alpha.x + %s.yx * (%s)(-alpha.y, alpha.y);\n",
+                       kstr[0].buf, kstr[1].buf, kstr[1].buf, complVec);
+            }
+            else {
+                sprintf(tmp,
+                        "%s = %s * beta.x + %s.yx * (%s)(-beta.y, beta.y) + "
+                        "%s * alpha.x + %s.yx * (%s)(-alpha.y, alpha.y);\n",
+                        kstr[0].buf, kstr[0].buf, kstr[0].buf, complVec,
+                        kstr[1].buf, kstr[1].buf, complVec);
+            }
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+    else {                      /* real update: r = beta * r + alpha * tile, or r = alpha * tile when beta0 */
+        for (n = 0; forEachTile(kstr, n, 0, 2, &result, tile); n++) {
+            if (beta0) {
+                sprintf(tmp, "%s = alpha * %s;\n", kstr[0].buf, kstr[1].buf);
+            }
+            else {
+                sprintf(tmp, "%s = beta * %s + alpha * %s;\n",
+                             kstr[0].buf, kstr[0].buf, kstr[1].buf);
+            }
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+
+    if (!tailM && incyOne) {    /* store r: vector writes through uC */
+        for (i = 0; forEachTile(kstr, i, 0, 2, &result, &memtile); i++) {
+            sprintf(tmp,"uC.%s[%u] = %s;\n",
+                        outPtrName, i, kstr[0].buf);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+    else {
+        if (!tailM) {           /* strided store without guards */
+            for (i = 0; forEachTile(kstr, i, 0, 2, &result, &memtile); i++) {
+                sprintf(tmp,"*Y = %s;\n", kstr[0].buf);
+                // each item is written through Y itself; Y is then stepped by incy below
+                kgenAddStmt(ctx, tmp);
+                kgenAddStmt(ctx, "Y += incy;\n");
+            }
+        }
+        else {                  /* guarded stores, emitted from the last item down to item 0 */
+            for (n = forEachTile(NULL, 0, 0, 2, &result, &memtile);    /* NOTE(review): presumably returns the item count -- confirm */
+                     n != 0; n--) {
+                i = n - 1;
+                forEachTile(kstr, i, 0, 2, &result, &memtile);
+                genMul(strMul, i, "int", NULL, incyOne ? NULL : "incy");
+                sprintf(tmp,"Y[coordA + %u >= M ? 0 : %s] = %s;\n",    /* NOTE(review): presumably reversed so the item-0 store to Y[0] comes last -- confirm */
+                        i, strMul, kstr[0].buf);
+                kgenAddStmt(ctx, tmp);
+            }
+        }
+    }
+}
+
+/* Emit the pointer fix-up statements: add offA/offX/offY where kflags marks
+ * them as non-zero, then, when the stride is not flagged as one, shift X and
+ * Y for a non-positive incx/incy. */
+void
+genIncPointers(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags)
+{
+    if ((kflags & KEXTRA_A_OFF_NOT_ZERO) != 0) {
+        kgenAddStmt(ctx, "A += offA;\n");
+    }
+    if ((kflags & KEXTRA_BX_OFF_NOT_ZERO) != 0) {
+        kgenAddStmt(ctx, "X += offX;\n");
+    }
+    if ((kflags & KEXTRA_CY_OFF_NOT_ZERO) != 0) {
+        kgenAddStmt(ctx, "Y += offY;\n");
+    }
+
+    if ((kflags & KEXTRA_INCX_ONE) == 0) {
+        kgenAddStmt(ctx, "X += incx > 0 ? 0 : (N - 1) * abs(incx);\n");
+    }
+    if ((kflags & KEXTRA_INCY_ONE) == 0) {
+        kgenAddStmt(ctx, "Y += incy > 0 ? 0 : (M - 1) * abs(incy);\n");
+    }
+}
+
+/* Emit one 'localRes[lid][i] = <item>;' store for every item that
+ * forEachTile() yields for tile. */
+void
+genStoreLocalResult(struct KgenContext *ctx, Tile *tile, const char *lid)
+{
+    Kstring elem;
+    char stmt[1024];
+    unsigned int idx = 0;
+
+    while (forEachTile(&elem, idx, 0, 1, tile)) {
+        sprintf(stmt, "localRes[%s][%u] = %s;\n", lid, idx, elem.buf);
+        kgenAddStmt(ctx, stmt);
+        idx++;
+    }
+}
+
+/* Emit 'for (uint i = 1; i < cLocal; i++)' whose body adds
+ * localRes[lid + i*bStep][n] to the n-th item of tile. */
+void
+genAddLocalResult(struct KgenContext *ctx, Tile *tile, const char *lid,
+                  unsigned int cLocal, unsigned int bStep)
+{
+    Kstring elem;
+    char line[1024];
+    unsigned int idx = 0;
+
+    sprintf(line, "for (uint i = 1; i < %u; i++)", cLocal);
+    kgenBeginBranch(ctx, line);
+    /* one accumulation statement per tile item */
+    while (forEachTile(&elem, idx, 0, 1, tile)) {
+        sprintf(line, "%s += localRes[%s + i*%u][%u];\n",
+                      elem.buf, lid, bStep, idx);
+        kgenAddStmt(ctx, line);
+        idx++;
+    }
+    kgenEndBranch(ctx, NULL);
+}
+
+/* Emit '<result item> += <source item>;' for every pair of items that
+ * forEachTile() yields for result and source. */
+void
+genMergeResults(struct KgenContext *ctx, Tile *result, Tile *source)
+{
+    Kstring pair[2];    /* [0]: result item, [1]: source item */
+    char stmt[2048];
+    unsigned int idx = 0;
+
+    while (forEachTile(pair, idx, 0, 2, result, source)) {
+        sprintf(stmt, "%s += %s;\n", pair[0].buf, pair[1].buf);
+        kgenAddStmt(ctx, stmt);
+        idx++;
+    }
+}
+
diff --git a/src/library/blas/gens/xxmv_common.h b/src/library/blas/gens/xxmv_common.h
new file mode 100644
index 0000000..13fcec5
--- /dev/null
+++ b/src/library/blas/gens/xxmv_common.h
@@ -0,0 +1,74 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef XXMV_COMMON_H_
+#define XXMV_COMMON_H_
+
+#include "blas_kgen.h"
+#include "gen_helper.h"
+
+/* Fetch part of vector x into tile b */
+void
+genFetchX(
+    struct KgenContext *ctx,
+    Tile *tile,
+    unsigned int vecLen,
+    DataType dtype,
+    const KernelVarNames *varNames,
+    TileMulFlags tflags,
+    KernelExtraFlags kflags);
+
+void
+setResultPos(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags,
+    const char *axVar);
+
+void
+updateResultVectorTiled(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags,
+    unsigned int vecLen,
+    Tile *tile);
+
+void
+genIncPointers(
+    struct KgenContext *ctx,
+    KernelExtraFlags kflags);
+
+void
+genStoreLocalResult(
+    struct KgenContext *ctx,
+    Tile *tile,
+    const char *lid);
+
+void
+genAddLocalResult(
+    struct KgenContext *ctx,
+    Tile *tile,
+    const char *lid,
+    unsigned int cLocal,
+    unsigned int bStep);
+
+/* Accumulate the source tile into the result tile (emits "result += source") */
+void
+genMergeResults(
+    struct KgenContext *ctx,
+    Tile *result,
+    Tile *source);
+
+#endif /* XXMV_COMMON_H_ */
diff --git a/src/library/blas/impl.c b/src/library/blas/impl.c
new file mode 100644
index 0000000..98bbb2a
--- /dev/null
+++ b/src/library/blas/impl.c
@@ -0,0 +1,129 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <defbool.h>
+#include <clBLAS.h>
+#include <clblas-internal.h>
+
+clblasStatus
+clblasSelectImplementation(
+    clblasImplementation impl)
+{   /* Make 'impl' the default memory pattern of the GEMM, TRMM or TRSM solver it belongs to. */
+    switch (impl) {
+    case clblasDefaultGemm:
+    case clblasLdsBlockGemm:
+    case clblasImageBlockGemm:
+    case clblasBlockGemmWithCaching:
+        clblasSolvers[CLBLAS_GEMM].defaultPattern =
+            getGemmMemPatternIndex(impl);   /* result is stored without validation */
+        break;
+    case clblasDefaultTrmm:
+    case clblasLdsBlockTrmm:
+    case clblasImageBlockTrmm:
+    case clblasBlockTrmmWithCaching:
+        clblasSolvers[CLBLAS_TRMM].defaultPattern =
+            getTrmmMemPatternIndex(impl);
+        break;
+    case clblasDefaultTrsm:
+    case clblasLdsBlockTrsm:
+    case clblasImageBlockTrsm:
+    case clblasBlockTrsmWithCaching:
+    case clblasBlockTrsmWithoutLds:
+        clblasSolvers[CLBLAS_TRSM].defaultPattern =
+            getTrsmMemPatternIndex(impl);
+        break;
+    default:    /* every other value, including the Subgroup*, Syrk and Syr2k enumerators */
+        return clblasInvalidValue;
+    }
+
+    return clblasSuccess;
+}
+
+/*
+ * Return 1 when at least one of the AMD_CLBLAS_{GEMM,TRMM,TRSM}_IMPLEMENTATION
+ * environment variables is set to exactly "1", otherwise 0.
+ */
+int
+scratchImagesEnabled(void)
+{
+    static const char *const envNames[] = {
+        "AMD_CLBLAS_GEMM_IMPLEMENTATION",
+        "AMD_CLBLAS_TRMM_IMPLEMENTATION",
+        "AMD_CLBLAS_TRSM_IMPLEMENTATION"
+    };
+    int found = 0;
+    size_t k;
+
+    for (k = 0; k < sizeof(envNames) / sizeof(envNames[0]); k++) {
+        const char *value = getenv(envNames[k]);
+        found |= (value != NULL) && (strcmp(value, "1") == 0);
+    }
+    return found;
+}
+
+/*
+ * Pick the GEMM, TRMM and TRSM implementations according to the environment.
+ *
+ * For each function the default implementation is selected first.  If the
+ * matching AMD_CLBLAS_<FUNC>_IMPLEMENTATION variable is set to "0", "1", "2"
+ * (or "3", TRSM only) the corresponding implementation is selected instead;
+ * any other value keeps the default.
+ */
+void
+parseEnvImplementation(void)
+{
+    enum { MAX_CHOICES = 4 };
+    static const char *const digits[MAX_CHOICES] = { "0", "1", "2", "3" };
+    /* One row per function, processed in the order GEMM, TRMM, TRSM. */
+    static const struct {
+        const char *envName;
+        clblasImplementation fallback;
+        unsigned int nrChoices;
+        clblasImplementation choices[MAX_CHOICES];  /* parallel to digits[] */
+    } table[] = {
+        { "AMD_CLBLAS_GEMM_IMPLEMENTATION", clblasDefaultGemm, 3,
+          { clblasLdsBlockGemm, clblasImageBlockGemm,
+            clblasBlockGemmWithCaching } },
+        { "AMD_CLBLAS_TRMM_IMPLEMENTATION", clblasDefaultTrmm, 3,
+          { clblasLdsBlockTrmm, clblasImageBlockTrmm,
+            clblasBlockTrmmWithCaching } },
+        { "AMD_CLBLAS_TRSM_IMPLEMENTATION", clblasDefaultTrsm, 4,
+          { clblasLdsBlockTrsm, clblasImageBlockTrsm,
+            clblasBlockTrsmWithoutLds, clblasBlockTrsmWithCaching } }
+    };
+    size_t f;
+
+    for (f = 0; f < sizeof(table) / sizeof(table[0]); f++) {
+        const char *value = getenv(table[f].envName);
+        unsigned int choice;
+
+        /* The default is always applied, even when an override follows. */
+        clblasSelectImplementation(table[f].fallback);
+        if (value == NULL) {
+            continue;
+        }
+
+        for (choice = 0; choice < table[f].nrChoices; choice++) {
+            if (strcmp(value, digits[choice]) == 0) {
+                clblasSelectImplementation(table[f].choices[choice]);
+                break;
+            }
+        }
+    }
+}
diff --git a/src/library/blas/include/blas_funcs.h b/src/library/blas/include/blas_funcs.h
new file mode 100644
index 0000000..5536f9c
--- /dev/null
+++ b/src/library/blas/include/blas_funcs.h
@@ -0,0 +1,94 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Blas function identifiers and properties
+ */
+
+#ifndef BLASFUNCS_H_
+#define BLASFUNCS_H_
+
+#include <defbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum BlasFunctionID {
+    CLBLAS_GEMV,
+    CLBLAS_SYMV,
+    CLBLAS_GEMM,
+    CLBLAS_TRMM,
+    CLBLAS_TRSM,
+    CLBLAS_SYRK,
+    CLBLAS_SYR2K,
+	CLBLAS_TRMV,
+	CLBLAS_HEMV,
+	CLBLAS_TRSV,
+	CLBLAS_TRSV_GEMV,	// Kludge: needed because the current "gemv" doesn't support complex types
+	CLBLAS_SYMM,
+    CLBLAS_SYMM_DIAGONAL,
+    CLBLAS_HEMM_DIAGONAL,
+	CLBLAS_GEMM2,
+	CLBLAS_GEMM_TAIL,
+	CLBLAS_SYR,
+	CLBLAS_SYR2,
+	CLBLAS_GER,
+	CLBLAS_HER,
+	CLBLAS_HER2,
+    CLBLAS_HEMM,
+    CLBLAS_HERK,
+    CLBLAS_TPMV,
+    CLBLAS_SPMV,
+    CLBLAS_HPMV,
+    CLBLAS_TPSV,
+    CLBLAS_SPR,
+    CLBLAS_SPR2,
+    CLBLAS_HPR,
+    CLBLAS_HPR2,
+    CLBLAS_GBMV,
+    CLBLAS_TBMV,
+    CLBLAS_SBMV,
+    CLBLAS_HBMV,
+    CLBLAS_TBSV,
+    CLBLAS_SWAP,
+    CLBLAS_SCAL,
+    CLBLAS_COPY,
+    CLBLAS_AXPY,
+    CLBLAS_DOT,
+    CLBLAS_REDUCTION_EPILOGUE,
+    CLBLAS_ROTG,
+    CLBLAS_ROTMG,
+    CLBLAS_ROT,
+    CLBLAS_ROTM,
+    CLBLAS_iAMAX,
+    CLBLAS_NRM2,
+    CLBLAS_ASUM,
+
+    /* Must stay last: its value is the number of function IDs above and sizes per-function arrays */
+    BLAS_FUNCTIONS_NUMBER
+} BlasFunctionID;
+
+int funcBlasLevel(BlasFunctionID funcID);
+bool funcHasBeta(BlasFunctionID funcID);
+bool funcHasTriangMatrix(BlasFunctionID funcID);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BLASFUNCS_H_ */
diff --git a/src/library/blas/include/blas_mempat.h b/src/library/blas/include/blas_mempat.h
new file mode 100644
index 0000000..dfca4a6
--- /dev/null
+++ b/src/library/blas/include/blas_mempat.h
@@ -0,0 +1,378 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Related to BLAS memory patterns
+ */
+
+#ifndef BLAS_MEMPAT_H_
+#define BLAS_MEMPAT_H_
+
+#include <clBLAS.h>
+#include <mempat.h>
+#include <clkern.h>
+#include <kern_cache.h>
+
+/**
+ * @brief Type of internal function implementation
+ */
+typedef enum clblasImplementation {
+
+    clblasDefaultGemm,           /**< Default: let the library decide what to use. */
+    clblasLdsBlockGemm,          /**< Use blocked GEMM with LDS optimization. */
+    clblasImageBlockGemm,        /**< Use blocked GEMM with image-based... */
+    clblasBlockGemmWithCaching,  /**< Use blocked GEMM with cache-usage optimization. */
+    clblasSubgroupGemmWithCaching,/**< Use subgroup GEMM with cache-usage optimization. */
+
+    clblasDefaultTrmm,           /**< Default: let the library decide what to use. */
+    clblasLdsBlockTrmm,          /**< Use blocked TRMM with LDS optimization. */
+    clblasImageBlockTrmm,        /**< Use blocked TRMM with image-based... */
+    clblasBlockTrmmWithCaching,  /**< Use blocked TRMM with cache-usage optimization. */
+    clblasSubgroupTrmmWithCaching,/**< Use subgroup TRMM with cache-usage optimization. */
+
+    clblasDefaultTrsm,           /**< Default: let the library decide what to use. */
+    clblasLdsBlockTrsm,          /**< Use blocked TRSM with LDS optimization. */
+    clblasImageBlockTrsm,        /**< Use blocked TRSM with image-based... */
+    clblasBlockTrsmWithCaching,  /**< Use blocked TRSM with cache-usage optimization. */
+    clblasBlockTrsmWithoutLds,   /**< NOTE(review): presumably blocked TRSM that does not use LDS -- confirm. */
+
+    clblasDefaultSyrk,           /**< NOTE(review): presumably "let the library decide" for SYRK -- confirm. */
+    clblasBlockSyrk,
+    clblasSubgSyrk,
+
+    clblasDefaultSyr2k,          /**< NOTE(review): presumably "let the library decide" for SYR2K -- confirm. */
+    clblasBlockSyr2k,
+    clblasSubgSyr2k
+
+} clblasImplementation;
+
+/**
+ * @internal
+ * @brief extra information for a memory pattern
+ *        used for BLAS problem solving
+ * @ingroup BLAS_SOLVERIF_SPEC
+ */
+typedef struct CLBLASMpatExtra {
+    /** memory levels used to store blocks of matrix A */
+    meml_set_t aMset;
+    /** memory levels used to store blocks of matrix B */
+    meml_set_t bMset;
+    CLMemType mobjA;    /* NOTE(review): presumably the memory object type used for matrix A -- confirm */
+    CLMemType mobjB;    /* NOTE(review): presumably the memory object type used for matrix B -- confirm */
+} CLBLASMpatExtra;
+
+/*
+ * init memory patterns for the xGEMM functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initGemmMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xGEMM pattern
+ */
+int
+getGemmMemPatternIndex(clblasImplementation impl);
+
+/*
+ * Get preferred xGEMM pattern
+ */
+clblasImplementation
+getGemmPreferredPattern(void);
+
+/*
+ * init memory patterns for the xGEMV functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initGemvMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xGEMV pattern
+ */
+int
+getGemvMemPatternIndex(clblasImplementation impl);
+
+/*
+ * init memory patterns for the xSYMV functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initSymvMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xSYMV pattern
+ */
+int
+getSymvMemPatternIndex(clblasImplementation impl);
+
+/*
+ * init memory patterns for the xTRMM functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initTrmmMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xTRMM pattern
+ */
+int
+getTrmmMemPatternIndex(clblasImplementation impl);
+
+/*
+ * Get preferred xTRMM pattern
+ */
+clblasImplementation
+getTrmmPreferredPattern(void);
+
+/*
+ * init memory patterns for the xTRSM functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initTrsmMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xTRSM pattern
+ */
+int
+getTrsmMemPatternIndex(clblasImplementation impl);
+
+/*
+ * Get preferred xTRSM pattern
+ */
+clblasImplementation
+getTrsmPreferredPattern(void);
+
+/*
+ * init memory patterns for the xSYR2K functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initSyr2kMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xSYR2K pattern
+ */
+int
+getSyr2kMemPatternIndex(clblasImplementation impl);
+
+/*
+ * init memory patterns for the xSYRK functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initSyrkMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xSYRK pattern
+ */
+int
+getSyrkMemPatternIndex(clblasImplementation impl);
+
+/*
+ * init memory patterns for the TRMV routine
+ * Returns the number of initialized patterns
+ */
+unsigned int
+initTrmvMemPatterns(MemoryPattern *mempats);
+
+int
+getTrmvMemPatternIndex(clblasImplementation impl);
+
+/*
+ * init memory patterns for TRSV TRTRI routine
+ * Returns the number of initialized patterns
+ */
+unsigned int
+initTrsvMemPatterns(MemoryPattern *mempats);
+
+int
+getTrsvMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initTrsvGemvMemPatterns(MemoryPattern *mempats);
+
+int
+getTrsvGemvMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initSymmMemPatterns(MemoryPattern *mempats);
+
+int
+getSymmMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initGemmV2MemPatterns(MemoryPattern *mempats);
+
+int
+getGemmV2MemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initGemmV2TailMemPatterns(MemoryPattern *mempats);
+
+int
+getGemmV2TailMemPatternIndex(clblasImplementation impl);
+
+/*
+ * init memory patterns for the xSYR functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initSyrMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xSYR pattern
+ */
+int
+getSyrMemPatternIndex(clblasImplementation impl);
+
+/*
+ * init memory patterns for the xSYR2 functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initSyr2MemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xSYR2 pattern
+ */
+int
+getSyr2MemPatternIndex(clblasImplementation impl);
+
+
+/*
+ * init memory patterns for the GER routine
+ * Returns the number of initialized patterns
+ */
+unsigned int
+initGerMemPatterns(MemoryPattern *mempats);
+
+int
+getGerMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initHerMemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xHER pattern
+ */
+int
+getHerMemPatternIndex(clblasImplementation impl);
+
+/*
+ * init memory patterns for the xHER2 functions
+ *
+ * Returns number of the initialized patterns
+ */
+unsigned int
+initHer2MemPatterns(MemoryPattern *mempats);
+
+/*
+ * Get index of the specific xHER2 pattern
+ */
+int
+getHer2MemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initGbmvMemPatterns(MemoryPattern *mempats);
+
+int
+getGbmvMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initSwapMemPatterns(MemoryPattern *mempats);
+
+int
+getSwapMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initScalMemPatterns(MemoryPattern *mempats);
+
+int
+getScalMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initCopyMemPatterns(MemoryPattern *mempats);
+
+int
+getCopyMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initDotMemPatterns(MemoryPattern *mempats);
+
+int
+getDotMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initAxpyMemPatterns(MemoryPattern *mempats);
+
+int
+getAxpyMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initReductionMemPatterns(MemoryPattern *mempats);
+
+int
+getReductionMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initRotgMemPatterns(MemoryPattern *mempats);
+
+int
+getRotgMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initRotmgMemPatterns(MemoryPattern *mempats);
+
+int
+getRotmgMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initRotmMemPatterns(MemoryPattern *mempats);
+
+int
+getRotmMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initiAmaxMemPatterns(MemoryPattern *mempats);
+
+int
+getiAmaxMemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initNrm2MemPatterns(MemoryPattern *mempats);
+
+int
+getNrm2MemPatternIndex(clblasImplementation impl);
+
+unsigned int
+initAsumMemPatterns(MemoryPattern *mempats);
+
+int
+getAsumMemPatternIndex(clblasImplementation impl);
+
+#endif /* BLAS_MEMPAT_H_ */
diff --git a/src/library/blas/include/clblas-internal.h b/src/library/blas/include/clblas-internal.h
new file mode 100644
index 0000000..81ab512
--- /dev/null
+++ b/src/library/blas/include/clblas-internal.h
@@ -0,0 +1,399 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef CLBLAS_INTERNAL_H_
+#define CLBLAS_INTERNAL_H_
+
+#include <defbool.h>
+#include <blas_mempat.h>
+#include <devinfo.h>
+#include <trace_malloc.h>
+
+#include "blas_funcs.h"
+#include "kernel_extra.h"
+
+#if defined(_MSC_VER)
+#define VISIBILITY_HIDDEN
+#else
+#define VISIBILITY_HIDDEN __attribute__((visibility("hidden")))
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct SolutionStep;
+
+typedef struct CLBlasSolvers {
+    MemoryPattern memPatterns[MEMPAT_PER_BLASFN];
+    unsigned int nrPatterns;
+    int defaultPattern;         /*   -1 -- select among all available patterns
+                                 * >= 0 -- index for memPatterns[]
+                                 */
+} CLBlasSolvers;
+
+extern int clblasInitialized;
+
+extern CLBlasSolvers clblasSolvers[BLAS_FUNCTIONS_NUMBER];
+
+extern struct KernelCache *clblasKernelCache;
+
+typedef union ArgMultiplier {
+    cl_float argFloat;
+    cl_double argDouble;
+    FloatComplex argFloatComplex;
+    DoubleComplex argDoubleComplex;
+} ArgMultiplier;
+
+typedef union LeadingDimention {
+    size_t matrix;  /**< Positive ld value for matrices */
+    int vector;     /**< Integer offset value for vectors */
+} LeadingDimention;
+
+typedef enum reductionType {
+    REDUCE_BY_SUM,
+    REDUCE_BY_MAX,
+    REDUCE_BY_MIN,
+    REDUCE_MAX_WITH_INDEX,
+    REDUCE_BY_HYPOT,
+    REDUCE_BY_SSQ,
+    REDUCE_MAX_WITH_INDEX_ATOMICS
+} reductionType;
+
+/**
+ * @internal
+ * @brief Kernel arguments for solver methods
+ * @ingroup SUBMIT_PROBLEM
+ */
+typedef struct CLBlasKargs {
+    BlasFunctionID pigFuncID; // FuncID piggy backing on this call. Used by Blas-3 routines to take advantage of GEMM code
+    /** Kernel type to pass the arguments for */
+    CLBlasKernelType kernType;
+    DataType dtype;             /**< Data type */
+    clblasOrder order;          /**< Row/column order */
+    clblasSide side;            /**< Matrix A side */
+    clblasUplo uplo;            /**< Matrix A is upper/lower */
+    clblasTranspose transA;     /**< Operation to be applied to matrix A */
+    clblasTranspose transB;     /**< Operation to be applied to matrix B */
+    clblasDiag diag;            /**< Matrix A diagonality */
+    size_t M;                   /**< Problem size in M dimension */
+    size_t N;                   /**< Problem size in N dimension */
+    size_t K;                   /**< Problem size in K dimension, or number of diagonals in a banded-matrix */
+    ArgMultiplier alpha;        /**< Alpha multiplier */
+    cl_mem A;                   /**< Matrix A data */
+    LeadingDimention lda;       /**< Matrix A leading dimension */
+    cl_mem B;                   /**< Matrix B data */
+    LeadingDimention ldb;       /**< Matrix B or vector X leading dimension */
+    ArgMultiplier beta;         /**< Beta multiplier */
+    cl_mem C;                   /**< Matrix C data */
+    LeadingDimention ldc;       /**< Matrix C or vector Y leading dimension */
+    cl_mem D;                   /**< Extra cl_mem buffer. For scratch usage or other purpose */
+    cl_mem E;                   /**< Extra buffer. Needed for BLAS-1 functions */
+    int addrBits;               /**< Number of device address bits */
+    /** Problem start offset in M dimension to process from */
+    size_t offsetM;
+    /** Problem start offset in N dimension to process from */
+    size_t offsetN;
+    /** Problem start offset in K dimension to process from */
+    size_t offsetK;
+    cl_mem scimage[2];          /**< Scratch images */
+    size_t offA;                /**< Offset of first element of matrix A */
+    /** Offset of first element of matrix B or vector X */
+    size_t offBX;
+    /** Offset of first element of matrix C or vector Y */
+    size_t offCY;
+	size_t offa;				/**< Offset of first element of Matrix A */
+	size_t offb;				/**< Offset of first element of Matrix B */
+	size_t offc;				/**< Offset of first element of Matrix C */
+    size_t offd;                /**< Offset of first element of buffer D */
+    size_t offe;                /**< Offset of first element of buffer E */
+	cl_int startRow;				/**< Triangular Solver - Identify where the triangle starts */
+	cl_int endRow;					/**< Triangular Solver - Identify where the triangle ends */
+	size_t tailStartM;			// Tail Kernel for GEMM2
+	size_t tailStartN;			// Tail Kernel for GEMM2
+    size_t KL;                  // Number of sub-diagonals in a banded-matrix
+    size_t KU;                  // Number of super-diagonals in a banded-matrix
+    reductionType redctnType;   // Kind of reduction (enum) for the reduction framework to handle
+} CLBlasKargs;
+
+/* Report whether the global clblasKernelCache pointer is set (non-NULL). */
+static __inline bool areKernelsCacheable(void)
+{
+    return (NULL != clblasKernelCache);
+}
+
+/*
+ * Assign a scalar that a matrix is multiplied by as a kernel argument
+ */
+void
+assignScalarKarg(KernelArg *arg, const void *value, DataType dtype);
+
+/**
+ * Calculate the number of global threads needed to compute the whole problem
+ *
+ * @wgDim: Subproblem dimension at the level where the previous level subproblem
+ *        is distributed among different work groups
+ * @M: problem size in dimension M before distribution
+ * @N: problem size in dimension N before distribution
+ */
+void
+calcGlobalThreads(
+    size_t globalThreads[2],
+    const SubproblemDim *wgDim,
+    const PGranularity *pgran,
+    size_t M,
+    size_t N);
+
+/**
+ * @internal
+ * @brief Get the context associated with kernel.
+ *
+ * @param[in] kernel Kernel object being queried.
+ * @param[out] context The context.
+ *
+ * @return clGetKernelInfo() return code.
+ */
+cl_int
+getKernelContext(
+    cl_kernel kernel,
+    cl_context *context);
+
+/**
+ * @brief Get the context associated with queue.
+ *
+ * @param[in] queue Queue being queried.
+ * @param[out] context The context.
+ *
+ * @return clGetCommandQueueInfo() return code.
+ */
+cl_int
+getQueueContext(
+    cl_command_queue queue,
+    cl_context *context);
+
+/**
+ * @internal
+ * @brief Get the device specified when the command-queue is created.
+ *
+ * @param[in] queue Queue being queried.
+ * @param[out] device The device.
+ *
+ * @return clGetCommandQueueInfo() return code.
+ */
+cl_int
+getQueueDevice(
+    cl_command_queue queue,
+    cl_device_id *device);
+
+/**
+ * @internal
+ * @brief Get the currently specified properties for the command-queue.
+ *
+ * @param[in] queue Queue being queried.
+ * @param[out] props Properties.
+ *
+ * @return clGetCommandQueueInfo() return code.
+ */
+cl_int
+getQueueProperties(
+    cl_command_queue queue,
+    cl_command_queue_properties *props);
+
+Kernel
+*makeKernel(
+    cl_device_id device,
+    cl_context context,
+    SolverKgen kernelGenerator,
+    const SubproblemDim *dims,
+    const PGranularity *pgran,
+    const CLBLASKernExtra *extra,
+    const char *buildOpts,
+    cl_int *error);
+
+Kernel
+*loadKernel( const unsigned char** buffer,
+             size_t sizeBuffer,
+             KernelKey *key,
+             const CLBLASKernExtra *extra,
+             cl_int *error);
+
+/*
+ * TODO: doxygen style comments
+ */
+void
+setupBuildOpts(
+    char opts[BUILD_OPTS_MAXLEN],
+    cl_device_id devID,
+    MemoryPattern *mempat);
+
+// Internal scatter image API
+
+int
+initSCImages(void);
+
+void
+releaseSCImages(void);
+
+/**
+ * Request the image best suited to performing a user API request
+ *
+ * @ctx: context containing images
+ * @devID: id of the device the image will be used for
+ * @bestSize: size of image, i.e. minWidth*bestHeight of the image that should
+ *            be enough to solve a problem in a single step
+ * @minSize: minimal size of the image, i.e. minWidth*minHeight
+ * @minWidth: minimal image width
+ *
+ * Returns the memory object of the most appropriate image. If there are
+ * no images available for the device, or not enough memory to allocate
+ * some internal structures to save usage info, the function returns NULL.
+ */
+cl_mem
+getSCImage(
+    cl_context ctx,
+    cl_device_id devID,
+    cl_ulong bestSize,
+    cl_ulong minSize,
+    size_t minWidth);
+
+void
+putSCImage(cl_device_id devID, cl_mem image);
+
+char
+*sprintfGranulation(char *buf, const SubproblemDim *dim, int level);
+
+const char
+*kernelTypeString(CLBlasKernelType ktype);
+
+#ifdef DUMP_CLBLAS_KERNELS
+
+void
+dumpKernel(
+    const struct SolutionStep *step,
+    CLBlasKernelType ktype);
+
+#else       /* !DUMP_CLBLAS_KERNELS */
+
+// stub: expands to nothing when kernel dumping is disabled
+#define dumpKernel(step, ktype)
+
+#endif      /* DUMP_CLBLAS_KERNELS */
+
+/* Pack a function index and a pattern index into a single solver id. */
+static __inline solver_id_t makeSolverID(int fid, int mpat)
+{
+    return (solver_id_t)(mpat + fid * MEMPAT_PER_BLASFN);
+}
+
+/* Function index of a solver id: the quotient of sid by MEMPAT_PER_BLASFN. */
+static __inline int solverFunctionID(solver_id_t sid)
+{
+    return (int)(sid / MEMPAT_PER_BLASFN);
+}
+
+/* Pattern index of a solver id: the remainder of sid by MEMPAT_PER_BLASFN. */
+static __inline int solverPattern(solver_id_t sid)
+{
+    return (int)(sid % MEMPAT_PER_BLASFN);
+}
+
+typedef enum ErrorCodeSet {
+     A_MAT_ERRSET,
+     B_MAT_ERRSET,
+     C_MAT_ERRSET,
+     X_VEC_ERRSET,
+     Y_VEC_ERRSET,
+     END_ERRSET
+} ErrorCodeSet;
+
+clblasStatus
+checkMatrixSizes(
+    DataType dtype,
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_mem A,
+    size_t offA,
+    size_t lda,
+    ErrorCodeSet err );
+
+clblasStatus
+checkBandedMatrixSizes(
+    DataType dtype,
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_mem A,
+    size_t offA,
+    size_t lda,
+    ErrorCodeSet err );
+
+clblasStatus
+checkVectorSizes(
+    DataType dtype,
+    size_t N,
+    cl_mem x,
+    size_t offx,
+    int incx,
+    ErrorCodeSet err );
+
+clblasStatus
+checkMemObjects(
+    cl_mem A,
+    cl_mem B,
+    cl_mem C,
+    bool checkC,
+    ErrorCodeSet errA,
+    ErrorCodeSet errB,
+    ErrorCodeSet errC );
+
+/**
+ * @brief Set preferred function internal implementation.
+ *
+ * Some BLAS functions are implemented in several different ways internally.
+ * By default the library tries to select the most suitable implementation for
+ * given problem. Using this function user can force library to use specific one.
+ *
+ * @return \b clblasSuccess on success, \b clblasInvalidValue if an
+ * unknown implementation id was passed.
+ */
+clblasStatus
+clblasSelectImplementation(
+    clblasImplementation impl);
+
+/**
+ * @brief Set preferred implementation according to environment variable.
+ */
+void
+parseEnvImplementation(void);
+
+/**
+ * @brief Check whether it is allowed to use scratch images
+ */
+int
+scratchImagesEnabled(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CLBLAS_INTERNAL_H_ */
diff --git a/src/library/blas/include/events.h b/src/library/blas/include/events.h
new file mode 100644
index 0000000..d097b20
--- /dev/null
+++ b/src/library/blas/include/events.h
@@ -0,0 +1,29 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Events used during SolutionStep decomposition internally.
+ */
+
+#ifndef EVENTS_H_
+#define EVENTS_H_
+
+void decomposeEventsSetup(void);
+void decomposeEventsTeardown(void);
+cl_event* decomposeEventsAlloc(void);
+
+#endif  /* EVENTS_H_ */
diff --git a/src/library/blas/include/kprintf.hpp b/src/library/blas/include/kprintf.hpp
new file mode 100644
index 0000000..e2eb366
--- /dev/null
+++ b/src/library/blas/include/kprintf.hpp
@@ -0,0 +1,131 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef __KPRINTF_HPP__
+#define __KPRINTF_HPP__
+
+#include <stdio.h>
+#include <iostream>
+#include <vector>
+#include <time.h>
+#include <string.h>
+#include <stdlib.h>
+
+typedef enum REDUCTION_TYPE {
+    REDUCTION_BY_SUM,
+    REDUCTION_BY_MAX,
+    REDUCTION_BY_MIN,
+    REDUCTION_BY_HYPOT,
+    REDUCTION_BY_SSQ
+} REDUCTION_TYPE;
+
+
+typedef enum RedWithIndexImpl {
+    ATOMIC_FLI,
+    REG_FLI,
+    ATOMIC_FHI,
+    REG_FHI
+} RedWithIndexImpl;
+
+class kprintf {
+public:
+    typedef struct fmt {
+        const char *key;
+        const char *value;
+    }fmt_t;
+private:
+    enum SRV { SCALAR, VECTOR };
+    const char *HALFWORD; // 1/2 of DERIVED
+    const char *QUARTERWORD; // 1/4 of DERIVED
+    const char *HALFQUARTERWORD;  // 1/8 of DERIVED
+    const char *VLOADWORD;
+    const char *DERIVED;
+    const char *BASE;
+    bool doVLOAD;
+    bool doVSTORE;
+    char dataType;
+
+    // For mystrtok()
+    char* strtokPtr;
+    int strtokCount;
+
+    enum SRV s_or_v;
+    int vectorWidth, effectiveVectorWidthOnBaseType;
+    size_t maxKeySize;
+    int wgSize;
+
+    std::vector<struct fmt> v;
+    struct fmt get(const char *key);
+    const char *findType(char *type);
+    const char *findVectorWidthType(char *type);
+    const char *findTypeVLOAD(char *type);
+    const char *findTypeVSTORE(char *type);
+    void generateVecSuffix(char *p, int n);
+    void registerType(const char *baseType, int vecWidth, int internalVecWidth=1);
+    void registerReducedTypes( const char* in, int div);
+    void registerSuperTypes( const char* in, int mul);
+    char* mystrtok( char* in, const char* tok); // NOTE: strtok overwrites the string; we don't want that.
+    //
+    // VLOAD %TYPE%V from (%PTYPE*) kind of memory locations
+    // The kernel writers should use "%TYPE" and "%TYPE%V" for kernel arguments, local variables etc.
+    // However, when loading using %VLOAD, they should cast the pointers to "%PTYPE *" because
+    // VLOADn imposes certain restrictions.
+    // Having the pointers as %TYPE and %TYPE%V relieves us from address calculations for primitives
+    // which are vectors (like float2, double2 etc.)
+    //
+    void registerVLOAD();
+    void registerVSTORE();
+    void registerVectorWidth();
+    void handleMakeVector(char **_src, char **_dst, int div = 1);
+    void handleMUL(char **_src, char **_dst, bool vmul=false);
+    void handleMAD(char **_src, char **_dst, bool vmul=false);
+    void handleDIV(char **_src, char **_dst, bool vdiv=false);
+    void handleADD_SUB(char **_src, char **_dst, const char op);
+    void handleVLoadWithIncx(char **_src, char **_dst, bool ignoreFirst = false);
+    void handleVStoreWithIncx(char **_src, char **_dst);
+    void handleReduceSum(char **_src, char **_dst);
+    void handleReduceSumReal(char **_src, char **_dst, int vlength);
+    void handleReduceMax(char **_src, char **_dst);
+    void handleReduceMin(char **_src, char **_dst);
+    void handleReduceHypot(char **_src, char **_dst);
+    void handleCONJUGATE(char **_src, char **_dst);
+    void handleClearImaginary(char **_src, char **_dst);
+    void handleAlignedDataAccess(char **_src, char **_dst);
+    void handleAlignedVSTORE(char **_src, char **_dst);
+    void handlePredicate(char **_src, char **_dst);
+    void handleComplexJoin(char **_src, char **_dst);
+    void doConstruct(const char *type, int vecWidth, bool doVLOAD, bool doVSTORE, int wgSize);
+    void handleVMAD_AND_REDUCE(char **_src, char **_dst);
+    void handleMAD_AND_REDUCE(char **_src, char **_dst);
+    void handleVFOR(char **_src, char **_dst, bool isReal);
+    void handleReductionFramework(char **_src, char **_dst, REDUCTION_TYPE reductionType= REDUCTION_BY_SUM);
+    void handleVABS(char **_src, char **_dst);
+
+    void getRandomString(char *str, int length);
+
+public:
+    kprintf(char _type, int vecWidth=1, bool doVLOAD=false, bool doVSTORE = false, int wgSize=64);
+    kprintf(const char *type, int vecWidth=1, bool doVLOAD=false, bool doVSTORE=false, int wgSize=64);
+    void put(const char *key, const char *value);
+    //
+    // PENDING:
+    // Needs amendment at a later point in time, when we support MACROS
+    //
+    int real_strlen(const char *src);
+    void spit(char *dst, char *src);
+};
+
+#endif
diff --git a/src/library/blas/include/matrix_dims.h b/src/library/blas/include/matrix_dims.h
new file mode 100644
index 0000000..c962420
--- /dev/null
+++ b/src/library/blas/include/matrix_dims.h
@@ -0,0 +1,81 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef MATRIX_DIMS_H_
+#define MATRIX_DIMS_H_
+
+#include <defbool.h>
+#include <clblas-internal.h>
+#include <matrix_props.h>
+#include <kerngen.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void
+swapDimXY(SubproblemDim *dim);
+
+size_t
+matrBlockPitch(
+    const SubproblemDim *dim,
+    MatrixRole mrole,
+    DataType dtype,
+    clblasSide side);
+
+cl_ulong
+matrBlockSize(
+    SubproblemDim *dim,
+    MatrixRole mrole,
+    DataType dtype,
+    clblasSide side);
+
+size_t
+matrBlockHeight(
+    SubproblemDim *dim,
+    MatrixRole mrole,
+    clblasSide side);
+
+/*
+ * Transform the respective kernel arguments to problem dimensions.
+ * If 'offset' is set to true, it transforms the starting offsets
+ * to process matrices from; otherwise it transforms the matrix sizes.
+ * It ignores the 'bwidth' field in offset mode.
+ */
+void
+kargsToProbDims(
+    SubproblemDim *probDim,
+    BlasFunctionID funcID,
+    const CLBlasKargs *kargs,
+    bool offset);
+
+/*
+ * Transform problem dimensions to the respective kernel arguments.
+ * In offset mode it ignores 'offsetK' and always sets it to 0.
+ */
+void
+probDimsToKargs(
+    CLBlasKargs *kargs,
+    BlasFunctionID funcID,
+    SubproblemDim *blasDim,
+    bool offset);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* MATRIX_DIMS_H_ */
diff --git a/src/library/blas/include/matrix_props.h b/src/library/blas/include/matrix_props.h
new file mode 100644
index 0000000..fee4611
--- /dev/null
+++ b/src/library/blas/include/matrix_props.h
@@ -0,0 +1,70 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef MATRIX_PROPS_H_
+#define MATRIX_PROPS_H_
+
+#include <defbool.h>
+
+#include "clblas-internal.h"
+#include "blas_funcs.h"
+#include "matrix_props.h"
+
+typedef enum MatrixRole {
+    MATRIX_A,
+    MATRIX_B,
+    MATRIX_C,
+    MATRIX_ROLES_NUMBER
+} MatrixRole;
+
+/*
+ * Functions to deal with kernel extra flags
+ */
+
+// Should a matrix be conjugated
+bool
+isMatrixConj(KernelExtraFlags flags, MatrixRole mrole);
+
+/*
+ * Is a matrix accessed in the column-major order
+ */
+bool
+isMatrixAccessColMaj(
+    BlasFunctionID funcID,
+    KernelExtraFlags flags,
+    MatrixRole mrole);
+
+/*
+ * Triangularity type at the physical layout, taking into account
+ * the solution element indices that the largest part contributes
+ * to. That means a right-side, non-transposed,
+ * upper triangular matrix is considered lower triangular,
+ * since the largest part makes a contribution to the solution elements
+ * with the highest index.
+ */
+static __inline bool
+isMatrixUpper(KernelExtraFlags kflags);
+
+static __inline bool isMatrixUpper(KernelExtraFlags kflags)
+{
+    /* Each set flag flips the result, so the answer is the parity of the three. */
+    const bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0);
+    const bool trans = ((kflags & KEXTRA_TRANS_A) != 0);
+    return ((upper != trans) != ((kflags & KEXTRA_SIDE_RIGHT) != 0));
+}
+
+#endif /* MATRIX_PROPS_H_ */
diff --git a/src/library/blas/include/solution_seq.h b/src/library/blas/include/solution_seq.h
new file mode 100644
index 0000000..3ddee22
--- /dev/null
+++ b/src/library/blas/include/solution_seq.h
@@ -0,0 +1,178 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SOLUTION_SEQ_H_
+#define SOLUTION_SEQ_H_
+
+#include <list.h>
+#include <granulation.h>
+#include <kern_cache.h>
+#include <kernel_extra.h>
+
+#include "blas_funcs.h"
+#include "clblas-internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// subproblem dimension components
+typedef enum SDimConponent {
+    SDIM_X,
+    SDIM_Y,
+    SDIM_BWIDTH
+} SDimComponent;
+
+typedef struct SolutionStep {
+    BlasFunctionID funcID;
+    Kernel *kernels[MAX_CLBLAS_KERNELS_PER_STEP];
+    CLBlasKargs args;
+    cl_command_queue cmdQueue;
+    TargetDevice device;
+    cl_uint numEventsInWaitList;
+    const cl_event *eventWaitList;
+    cl_event *event;
+    unsigned int patternID;
+    SubproblemDim subdims[MAX_SUBDIMS];
+    PGranularity pgran;
+    KernelExtraFlags extraFlags;
+    ListNode node;
+} SolutionStep;
+
+/**
+ * @internal
+ * @brief Make solution sequence
+ *
+ * @param[in] funcID                BLAS function ID
+ * @param[in] args                  BLAS parameters
+ * @param[in] numCommandQueues      Number of the command queues
+ * @param[in] commandQueues         Command queues to distribute the problem
+ *                                  among
+ * @param[in] numEventsInWaitList   Number of events in the wait list
+ * @param[in] eventWaitList         List of events which must fire before any
+ *                                  of the problem's kernels can be executed
+ * @param[out] events               List of output events signaling on
+ *                                  completion of evaluating the problem for
+ *                                  the command queues.
+ * @param[out] seq                  Solution sequence head which will be
+ *                                  followed by all needed solution steps
+ *                                  after the function returns
+ *
+ * @returns
+ *     - \b CL_SUCCESS on success;
+ *     - \b CL_INVALID_VALUE if \b numCommandQueues is zero, or
+ *       \b commandQueues is NULL;
+ *     - \b CL_INVALID_DEVICE if the function ID indicates that this is
+ *        a double precision function, but any of the command queue's devices
+ *        does not support double precision;
+ *     - \b CL_INVALID_COMMAND_QUEUE if any of the passed command queues is
+ *        invalid;
+ *     - \b CL_OUT_OF_HOST_MEMORY if there is not enough memory to allocate
+ *        internal structures;
+ *     - \b CL_OUT_OF_HOST_RESOURCES if required scratch resources are
+ *        unavailable.
+ *
+ * @ingroup SUBMIT_PROBLEM
+ */
+cl_int
+makeSolutionSeq(
+    BlasFunctionID funcID,
+    const CLBlasKargs *args,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events,
+    ListHead *seq);
+
+/**
+ * @internal
+ * @brief Free solution sequence
+ *
+ * @param[out] seq                  Solution sequence to free
+ *
+ * It initializes the list after freeing.
+ *
+ * @ingroup SUBMIT_PROBLEM
+ */
+void
+freeSolutionSeq(ListHead *seq);
+
+void
+freeSolutionStep(ListNode *node);
+
+/**
+ * @internal
+ * @brief Execute solution sequence
+ *
+ * @param[in] seq                   Sequence to execute
+ *
+ * @returns CL_SUCCESS on success, errors from a clEnqueueNDRangeKernel() call
+ *          otherwise.
+ *
+ * @ingroup SUBMIT_PROBLEM
+ */
+cl_int
+executeSolutionSeq(const ListHead *seq);
+
+/*
+ * Get the math decomposition of a solution step in order
+ * to accelerate its evaluation using faster kernels for
+ * other functions. The step must be inserted into a
+ * solution sequence.
+ */
+ListNode
+*decomposeProblemStep(SolutionStep *step);
+
+cl_int
+selectVectorization(const SolutionStep *step, CLBLASKernExtra *kextra);
+
+// Find a vector length that both ld and the tile width are divisible by
+unsigned int appropriateVecLen(size_t ld, unsigned int typeSize,
+                               size_t tileWidth, int funcLevel);
+
+KernelExtraFlags VISIBILITY_HIDDEN
+clblasArgsToKextraFlags(
+    const CLBlasKargs *args,
+    BlasFunctionID funcID);
+
+void VISIBILITY_HIDDEN
+getStepGranulation(SolutionStep *step);
+
+bool VISIBILITY_HIDDEN
+dimensionsExceedProblemSize(SolutionStep *step);
+
+void VISIBILITY_HIDDEN
+getMinimalStepGranulation(SolutionStep *step);
+
+void VISIBILITY_HIDDEN
+detectProblemTails(SolutionStep *step);
+
+void VISIBILITY_HIDDEN
+detectOffsets(SolutionStep *step);
+
+unsigned int VISIBILITY_HIDDEN
+selectPattern( SolutionStep* pStep, unsigned int maxImages);
+
+void VISIBILITY_HIDDEN
+fixupGemmOffsets(CLBlasKargs *kargs, KernelExtraFlags kflags, size_t offsetK);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* SOLUTION_SEQ_H_ */
diff --git a/src/library/blas/init.c b/src/library/blas/init.c
new file mode 100644
index 0000000..5095cb0
--- /dev/null
+++ b/src/library/blas/init.c
@@ -0,0 +1,242 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <clBLAS.h>
+#include <toolslib.h>
+#include <kern_cache.h>
+#include <version.h>
+#include <trace_malloc.h>
+
+#include "clblas-internal.h"
+#include <events.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * Report the library version.
+ *
+ * major, minor and patch each receive the matching clblasVersion* constant.
+ * A NULL pointer is skipped instead of being dereferenced, so a caller may
+ * ask for only the components it is interested in.
+ *
+ * Always returns clblasSuccess.
+ */
+clblasStatus
+clblasGetVersion(cl_uint* major, cl_uint* minor, cl_uint* patch)
+{
+    if (major != NULL) {
+        *major = clblasVersionMajor;
+    }
+    if (minor != NULL) {
+        *minor = clblasVersionMinor;
+    }
+    if (patch != NULL) {
+        *patch = clblasVersionPatch;
+    }
+
+    return clblasSuccess;
+}
+
+clblasStatus
+clblasSetup(void)
+{
+    solver_id_t sidsNum;
+	char* tmp			= NULL;
+
+	//	Made the cache unlimited by default
+	size_t kCacheLimit = 0;
+
+    if (clblasInitialized) {
+        return clblasSuccess;
+    }
+
+    // printf("\n%s, line %d\n", __func__, __LINE__);
+    initMallocTrace();
+
+    clblasSolvers[CLBLAS_GEMM].nrPatterns =
+        initGemmMemPatterns(clblasSolvers[CLBLAS_GEMM].memPatterns);
+    clblasSolvers[CLBLAS_GEMM].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_TRMM].nrPatterns =
+        initTrmmMemPatterns(clblasSolvers[CLBLAS_TRMM].memPatterns);
+    clblasSolvers[CLBLAS_TRMM].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_TRSM].nrPatterns =
+        initTrsmMemPatterns(clblasSolvers[CLBLAS_TRSM].memPatterns);
+    clblasSolvers[CLBLAS_TRSM].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_GEMV].nrPatterns =
+        initGemvMemPatterns(clblasSolvers[CLBLAS_GEMV].memPatterns);
+    clblasSolvers[CLBLAS_GEMV].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_SYMV].nrPatterns =
+        initSymvMemPatterns(clblasSolvers[CLBLAS_SYMV].memPatterns);
+    clblasSolvers[CLBLAS_SYMV].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_SYR2K].nrPatterns =
+        initSyr2kMemPatterns(clblasSolvers[CLBLAS_SYR2K].memPatterns);
+    clblasSolvers[CLBLAS_SYR2K].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_SYRK].nrPatterns =
+        initSyrkMemPatterns(clblasSolvers[CLBLAS_SYRK].memPatterns);
+    clblasSolvers[CLBLAS_SYRK].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_TRMV].nrPatterns =
+		initTrmvMemPatterns(clblasSolvers[CLBLAS_TRMV].memPatterns);
+	clblasSolvers[CLBLAS_TRMV].defaultPattern = -1;
+
+	// HEMV uses the same memory pattern as TRMV.
+	clblasSolvers[CLBLAS_HEMV].nrPatterns =
+		initTrmvMemPatterns(clblasSolvers[CLBLAS_HEMV].memPatterns);
+	clblasSolvers[CLBLAS_HEMV].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_TRSV].nrPatterns =
+		initTrsvMemPatterns(clblasSolvers[CLBLAS_TRSV].memPatterns);
+	clblasSolvers[CLBLAS_TRSV].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_TRSV_GEMV].nrPatterns =
+		initTrsvGemvMemPatterns(clblasSolvers[CLBLAS_TRSV_GEMV].memPatterns);
+	clblasSolvers[CLBLAS_TRSV_GEMV].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_SYMM].nrPatterns =
+		initSymmMemPatterns(clblasSolvers[CLBLAS_SYMM].memPatterns);
+	clblasSolvers[CLBLAS_SYMM].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_GEMM2].nrPatterns =
+		initGemmV2MemPatterns(clblasSolvers[CLBLAS_GEMM2].memPatterns);
+	clblasSolvers[CLBLAS_GEMM2].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_GEMM_TAIL].nrPatterns =
+		initGemmV2TailMemPatterns(clblasSolvers[CLBLAS_GEMM_TAIL].memPatterns);
+	clblasSolvers[CLBLAS_GEMM_TAIL].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_SYR].nrPatterns =
+        initSyrMemPatterns(clblasSolvers[CLBLAS_SYR].memPatterns);
+ 	clblasSolvers[CLBLAS_SYR].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_SYR2].nrPatterns =
+        initSyr2MemPatterns(clblasSolvers[CLBLAS_SYR2].memPatterns);
+    clblasSolvers[CLBLAS_SYR2].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_GER].nrPatterns =
+		initGerMemPatterns(clblasSolvers[CLBLAS_GER].memPatterns);
+	clblasSolvers[CLBLAS_GER].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_HER].nrPatterns =
+        initHerMemPatterns(clblasSolvers[CLBLAS_HER].memPatterns);
+ 	clblasSolvers[CLBLAS_HER].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_HER2].nrPatterns =
+        initHer2MemPatterns(clblasSolvers[CLBLAS_HER2].memPatterns);
+    clblasSolvers[CLBLAS_HER2].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_GBMV].nrPatterns =
+		initGbmvMemPatterns(clblasSolvers[CLBLAS_GBMV].memPatterns);
+	clblasSolvers[CLBLAS_GBMV].defaultPattern = -1;
+
+	clblasSolvers[CLBLAS_SWAP].nrPatterns =
+        initSwapMemPatterns(clblasSolvers[CLBLAS_SWAP].memPatterns);
+    clblasSolvers[CLBLAS_SWAP].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_SCAL].nrPatterns =
+        initScalMemPatterns(clblasSolvers[CLBLAS_SCAL].memPatterns);
+    clblasSolvers[CLBLAS_SCAL].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_COPY].nrPatterns =
+        initCopyMemPatterns(clblasSolvers[CLBLAS_COPY].memPatterns);
+    clblasSolvers[CLBLAS_COPY].defaultPattern = -1;
+
+     clblasSolvers[CLBLAS_AXPY].nrPatterns =
+        initAxpyMemPatterns(clblasSolvers[CLBLAS_AXPY].memPatterns);
+    clblasSolvers[CLBLAS_AXPY].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_DOT].nrPatterns =
+       initDotMemPatterns(clblasSolvers[CLBLAS_DOT].memPatterns);
+    clblasSolvers[CLBLAS_DOT].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].nrPatterns =
+       initReductionMemPatterns(clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].memPatterns);
+    clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_ROTG].nrPatterns =
+       initRotgMemPatterns(clblasSolvers[CLBLAS_ROTG].memPatterns);
+    clblasSolvers[CLBLAS_ROTG].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_ROTMG].nrPatterns =
+       initRotmgMemPatterns(clblasSolvers[CLBLAS_ROTMG].memPatterns);
+    clblasSolvers[CLBLAS_ROTMG].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_ROTM].nrPatterns =
+       initRotmMemPatterns(clblasSolvers[CLBLAS_ROTM].memPatterns);
+    clblasSolvers[CLBLAS_ROTM].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_iAMAX].nrPatterns =
+       initiAmaxMemPatterns(clblasSolvers[CLBLAS_iAMAX].memPatterns);
+    clblasSolvers[CLBLAS_iAMAX].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_NRM2].nrPatterns =
+       initNrm2MemPatterns(clblasSolvers[CLBLAS_NRM2].memPatterns);
+    clblasSolvers[CLBLAS_NRM2].defaultPattern = -1;
+
+    clblasSolvers[CLBLAS_ASUM].nrPatterns =
+       initAsumMemPatterns(clblasSolvers[CLBLAS_ASUM].memPatterns);
+    clblasSolvers[CLBLAS_ASUM].defaultPattern = -1;
+
+    sidsNum = makeSolverID(BLAS_FUNCTIONS_NUMBER, 0);
+
+	//	Read environmental variable to limit or disable ( 0 ) the size of the kernel cache in memory
+	tmp = getenv( "AMD_CLBLAS_KCACHE_LIMIT_MB" );
+	if( tmp != NULL )
+	{
+		kCacheLimit = atol( tmp );
+#if defined( _WIN32 )
+		printf( "Kernel Cache limit: %Iu MB\n", kCacheLimit );
+#else
+		printf( "Kernel Cache limit: %zu MB\n", kCacheLimit );
+#endif
+		kCacheLimit *= (1024 * 1024);
+	}
+
+    if (kCacheLimit || (tmp == NULL)) {
+        clblasKernelCache = createKernelCache(sidsNum, kCacheLimit);
+    	if (clblasKernelCache == NULL) {
+        	return clblasOutOfHostMemory;
+        }
+    }
+    if (initSCImages()) {
+        destroyKernelCache(clblasKernelCache);
+        return clblasOutOfHostMemory;
+    }
+
+    decomposeEventsSetup();
+
+    initStorageCache();
+
+    clblasInitialized = 1;
+    return clblasSuccess;
+}
+
+/*
+ * Release the library-wide state created by clblasSetup().
+ *
+ * Does nothing when the library is not initialized. Otherwise it prints
+ * the malloc statistics, destroys the kernel cache if one exists, releases
+ * the scratch images, tears down the decompose events and the storage
+ * cache, reports memory leaks, releases the malloc trace and finally
+ * clears clblasInitialized so that clblasSetup() can run again.
+ */
+void
+clblasTeardown(void)
+{
+    if (!clblasInitialized) {
+        return;
+    }
+
+    printMallocStatistics();
+
+    // the cache is absent when it was disabled at setup time
+    if (clblasKernelCache != NULL) {
+        printKernelCacheSize(clblasKernelCache);
+        destroyKernelCache(clblasKernelCache);
+        clblasKernelCache = NULL;
+    }
+    releaseSCImages();
+    decomposeEventsTeardown();
+
+    // win32 - crashes
+    // NOTE(review): the remark above is the original author's and gives no
+    // further detail; presumably this call was seen crashing on Windows - confirm.
+    destroyStorageCache();
+
+    printMemLeaksInfo();
+    releaseMallocTrace();
+
+    clblasInitialized = 0;
+}
diff --git a/src/library/blas/ixamax.c b/src/library/blas/ixamax.c
new file mode 100644
index 0000000..8e4f44e
--- /dev/null
+++ b/src/library/blas/ixamax.c
@@ -0,0 +1,263 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+//#define IAMAX_USE_ATOMIC_MIN
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared implementation of the iAMAX entry points (clblasiSamax,
+ * clblasiDamax, clblasiCamax and clblasiZamax).
+ *
+ * After validating its arguments it runs two solution sequences on
+ * commandQueues[0]:
+ *   1. CLBLAS_iAMAX - a reduction of type REDUCE_MAX_WITH_INDEX (or
+ *      REDUCE_MAX_WITH_INDEX_ATOMICS when IAMAX_USE_ATOMIC_MIN is defined);
+ *   2. CLBLAS_REDUCTION_EPILOGUE - waits on the event of the first sequence
+ *      and receives the number of work-groups it spawned as its N.
+ *
+ * kargs                 arguments prepared by the caller (dtype, pigFuncID);
+ *                       the remaining fields used here are filled in below
+ * N                     number of elements in X
+ * iMax, offiMax         output buffer and offset (checked to hold one
+ *                       TYPE_UNSIGNED_INT element)
+ * X, offx, incx         input vector, its offset and increment
+ * scratchBuf            scratch buffer, checked to hold at least 2 * N
+ *                       elements of kargs->dtype
+ * numCommandQueues      must be non-zero; only commandQueues[0] is used
+ * commandQueues         must be non-NULL with a non-NULL first entry
+ * numEventsInWaitList,
+ * eventWaitList         events the first sequence waits on
+ * events                event storage handed to the epilogue sequence
+ *
+ * Returns clblasNotInitialized, the status of a failed argument check,
+ * clblasInvalidValue, clblasInvalidCommandQueue, clblasInvalidEventWaitList,
+ * or the result of building/executing the sequences cast to clblasStatus.
+ *
+ * NOTE(review): the intermediate event linking the two sequences is not
+ * released anywhere in this function - confirm who owns it.
+ */
+clblasStatus
+doiAmax(
+    CLBlasKargs *kargs,
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuf,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    clblasStatus status;
+    ListHead firstPass, epilogue;
+    cl_event firstPassDone;
+    CLBlasKargs epilogueArgs;
+    ListNode *firstNode;
+    SolutionStep *firstStep;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Memory objects must be valid ... */
+    status = checkMemObjects(X, scratchBuf, iMax, true,
+                             X_VEC_ERRSET, A_MAT_ERRSET, X_VEC_ERRSET);
+    if (status) {
+        printf("Invalid mem object..\n");
+        return status;
+    }
+
+    /* ... and large enough: X for N strided elements, */
+    status = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (status) {
+        printf("Invalid Size for X\n");
+        return status;
+    }
+    /* the scratch buffer for at least 2 * N elements, */
+    status = checkVectorSizes(kargs->dtype, (2 * N), scratchBuf, 0, 1, A_MAT_ERRSET);
+    if (status) {
+        printf("Insufficient ScratchBuff A\n");
+        return status;
+    }
+    /* and the result buffer for a single unsigned int. */
+    status = checkVectorSizes(TYPE_UNSIGNED_INT, 1, iMax, offiMax, 1, X_VEC_ERRSET);
+    if (status) {
+        printf("Invalid Size for iX\n");
+        return status;
+    }
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+
+    /* No multi-GPU support for now: exactly one queue is ever used */
+    numCommandQueues = 1;
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /*
+     * Argument mapping used by the kernels:
+     *   D - scratch buffer
+     *   A - output buffer (iMax), offA - its offset
+     *   B - input buffer holding the N values
+     */
+    kargs->N = N;
+    kargs->B = X;
+    kargs->offb = offx;
+    kargs->ldb.vector = incx;   /* ldb.vector doubles as incx */
+    if (incx < 1) {
+        /*
+         * Non-positive increment: shrink the problem so that only one
+         * work-group is launched.
+         */
+        kargs->N = 1;
+    }
+    kargs->D = scratchBuf;
+    kargs->A = iMax;
+    kargs->offA = offiMax;
+#ifdef IAMAX_USE_ATOMIC_MIN
+    kargs->redctnType = REDUCE_MAX_WITH_INDEX_ATOMICS;
+#else
+    kargs->redctnType = REDUCE_MAX_WITH_INDEX;
+#endif
+    /* The epilogue starts from a snapshot of the first-pass arguments */
+    memcpy(&epilogueArgs, kargs, sizeof(CLBlasKargs));
+
+    listInitHead(&firstPass);
+    err = makeSolutionSeq(CLBLAS_iAMAX, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, &firstPassDone,
+                          &firstPass);
+    if (err != CL_SUCCESS) {
+        goto release_first;
+    }
+
+    err = executeSolutionSeq(&firstPass);
+    if (err != CL_SUCCESS) {
+        goto release_first;
+    }
+
+    /*
+     * The second kernel needs the number of work-groups used by the first
+     * one; it is read from the first step and passed on as N.
+     */
+    firstNode = listNodeFirst(&firstPass);
+    firstStep = container_of(firstNode, node, SolutionStep);
+    epilogueArgs.N = firstStep->pgran.numWGSpawned[0];     /* 1D block was used */
+
+    /* Complex inputs are reduced as their real counterpart in the epilogue */
+    if (epilogueArgs.dtype == TYPE_COMPLEX_FLOAT) {
+        epilogueArgs.dtype = TYPE_FLOAT;
+    }
+    else if (epilogueArgs.dtype == TYPE_COMPLEX_DOUBLE) {
+        epilogueArgs.dtype = TYPE_DOUBLE;
+    }
+
+    listInitHead(&epilogue);
+    err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &epilogueArgs,
+                          numCommandQueues, commandQueues,
+                          1, &firstPassDone, events, &epilogue);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&epilogue);
+    }
+    freeSolutionSeq(&epilogue);
+
+release_first:
+    freeSolutionSeq(&firstPass);
+    return (clblasStatus)err;
+}
+
+/*
+ * iAMAX entry point for single-precision real vectors.
+ *
+ * Zero-fills a CLBlasKargs, tags it with TYPE_FLOAT and CLBLAS_iAMAX and
+ * forwards every argument unchanged to doiAmax(), whose status is returned.
+ */
+clblasStatus
+clblasiSamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuf,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_iAMAX
+    printf("iSAMAX Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_iAMAX;
+    args.dtype = TYPE_FLOAT;
+
+    return doiAmax(&args, N, iMax, offiMax, X, offx, incx, scratchBuf,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * iAMAX entry point for double-precision real vectors.
+ *
+ * Zero-fills a CLBlasKargs, tags it with TYPE_DOUBLE and CLBLAS_iAMAX and
+ * forwards every argument unchanged to doiAmax(), whose status is returned.
+ */
+clblasStatus
+clblasiDamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuf,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_iAMAX
+    printf("iDAMAX called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_iAMAX;
+    args.dtype = TYPE_DOUBLE;
+
+    return doiAmax(&args, N, iMax, offiMax, X, offx, incx, scratchBuf,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * iAMAX entry point for single-precision complex vectors.
+ *
+ * Zero-fills a CLBlasKargs, tags it with TYPE_COMPLEX_FLOAT and
+ * CLBLAS_iAMAX and forwards every argument unchanged to doiAmax(), whose
+ * status is returned.
+ */
+clblasStatus
+clblasiCamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuf,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_iAMAX
+    printf("iCAMAX Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_FLOAT;
+    args.pigFuncID = CLBLAS_iAMAX;
+
+    return doiAmax(&args, N, iMax, offiMax, X, offx, incx, scratchBuf,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * iAMAX entry point for double-precision complex vectors.
+ *
+ * Zero-fills a CLBlasKargs, tags it with TYPE_COMPLEX_DOUBLE and
+ * CLBLAS_iAMAX and forwards every argument unchanged to doiAmax(), whose
+ * status is returned.
+ */
+clblasStatus
+clblasiZamax(
+    size_t N,
+    cl_mem iMax,
+    size_t offiMax,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuf,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_iAMAX
+    printf("iZAMAX Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+    args.pigFuncID = CLBLAS_iAMAX;
+
+    return doiAmax(&args, N, iMax, offiMax, X, offx, incx, scratchBuf,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/scimage.c b/src/library/blas/scimage.c
new file mode 100644
index 0000000..b021055
--- /dev/null
+++ b/src/library/blas/scimage.c
@@ -0,0 +1,312 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>
+
+#include <defbool.h>
+#include <clBLAS.h>
+#include <clblas-internal.h>
+#include <list.h>
+#include <mutex.h>
+
+/* Lock / unlock the file-scope mutex (imagesLock) that guards the scratch image list */
+#define IMAGES_LOCK()    mutexLock(imagesLock)
+#define IMAGES_UNLOCK()  mutexUnlock(imagesLock)
+
+/* One device currently using a scratch image; an element of SCImageNode.usingDevices */
+typedef struct DeviceNode {
+    cl_device_id devID;     // device recorded by getSCImage(), looked up by putSCImage()
+    ListNode node;          // link in the owning image's usingDevices list
+} DeviceNode;
+
+/* One registered scratch image; an element of the file-scope images list */
+typedef struct SCImageNode {
+    cl_mem image;           // image object created in clblasAddScratchImage()
+    size_t width;           // width passed to clblasAddScratchImage()
+    size_t height;          // height passed to clblasAddScratchImage()
+    // devices using this image for computing
+    ListHead usingDevices;
+    ListNode node;          // link in the images list
+} SCImageNode;
+
+/* State shared between getSCImage() and its per-image callback checkBestImage() */
+typedef struct SearchContext {
+    cl_context ctx;             // stored by getSCImage(); not read by checkBestImage()
+    cl_device_id devID;         // device asking for an image; images it already uses are skipped
+    cl_ulong bestSize;          // preferred width * height; candidates are ranked by distance to it
+    cl_ulong minSize;           // smallest acceptable width * height
+    size_t minWidth;            // smallest acceptable width
+    cl_ulong minExtraSize;      // smallest distance to bestSize found so far
+    SCImageNode *bestImgNode;   // image with that distance, NULL while none qualifies
+} SearchContext;
+
+// format of every scratch image created by clblasAddScratchImage()
+static const cl_image_format IMAGE_FORMAT = { CL_RGBA, CL_UNSIGNED_INT32 };
+
+// list of SCImageNode entries, one per registered scratch image
+static ListHead images;
+// guards the images list; created in initSCImages(), destroyed in releaseSCImages()
+static mutex_t *imagesLock = NULL;
+
+/*
+ * List callback: unlink the DeviceNode embedding the given list node and
+ * free it.
+ */
+static void
+freeDeviceNode(ListNode *node)
+{
+    DeviceNode *user = container_of(node, node, DeviceNode);
+
+    listDel(node);
+    free(user);
+}
+
+/*
+ * Destroy the SCImageNode embedding the given list node: release its
+ * image object, free every DeviceNode on its usingDevices list and then
+ * free the node itself. The node is not unlinked from the images list
+ * here; callers take care of that.
+ */
+static void
+freeImageNode(ListNode *node)
+{
+    SCImageNode *entry = container_of(node, node, SCImageNode);
+
+    clReleaseMemObject(entry->image);
+    listDoForEachSafe(&entry->usingDevices, freeDeviceNode);
+    free(entry);
+}
+
+/*
+ * listNodeSearch() comparator for the images list.
+ * key points to a cl_mem; returns 0 when the SCImageNode embedding the
+ * given node holds that image and 1 otherwise.
+ */
+static int
+imageNodeCmp(const ListNode *node, const void *key)
+{
+    const cl_mem *wanted = (const cl_mem *)key;
+    SCImageNode *entry = container_of(node, node, SCImageNode);
+
+    if (entry->image == *wanted) {
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * listNodeSearch() comparator for a usingDevices list.
+ * key points to a cl_device_id; returns 0 when the DeviceNode embedding
+ * the given node belongs to that device and 1 otherwise.
+ */
+static int
+deviceNodeCmp(const ListNode *node, const void *key)
+{
+    const cl_device_id *wanted = (const cl_device_id *)key;
+    DeviceNode *user = container_of(node, node, DeviceNode);
+
+    return (user->devID != *wanted) ? 1 : 0;
+}
+
+/*
+ * listDoForEachPriv() callback used by getSCImage().
+ *
+ * node  list node embedded in the SCImageNode being examined
+ * priv  the SearchContext of the running search
+ *
+ * An image qualifies when the requesting device is not already on its
+ * usingDevices list, its width is at least minWidth and its width * height
+ * is at least minSize. Among qualifying images the one whose size is
+ * closest to bestSize (in either direction) is remembered in the context.
+ */
+static void
+checkBestImage(ListNode *node, void *priv)
+{
+    SCImageNode *imgNode;
+    ListNode *dnode;
+    SearchContext *sctx = (SearchContext*)priv;
+    cl_ulong es, is;   // extra and image size
+
+    imgNode = container_of(node, node, SCImageNode);
+    // widen before multiplying: a size_t product can overflow where
+    // size_t is narrower than cl_ulong
+    is = (cl_ulong)imgNode->height * (cl_ulong)imgNode->width;
+    // check if the image is not yet in use and meet the size requirements
+    dnode = listNodeSearch(&imgNode->usingDevices, (const void*)&sctx->devID,
+                           deviceNodeCmp);
+    if ((dnode == NULL) && (imgNode->width >= sctx->minWidth)
+            && (is >= sctx->minSize)) {
+        // distance to the preferred size, whichever of the two is larger
+        es = (is >= sctx->bestSize) ? (is - sctx->bestSize) :
+                                    (sctx->bestSize - is);
+        if (es < sctx->minExtraSize) {
+            sctx->minExtraSize = es;
+
+            sctx->bestImgNode = imgNode;
+        }
+    }
+}
+
+/*
+ * Prepare the scratch-image bookkeeping: reset the images list and create
+ * the mutex guarding it.
+ *
+ * Returns 0 on success and -1 when the mutex could not be created.
+ */
+int VISIBILITY_HIDDEN
+initSCImages(void)
+{
+    listInitHead(&images);
+    imagesLock = mutexInit();
+
+    return (imagesLock != NULL) ? 0 : -1;
+}
+
+/*
+ * Drop every registered scratch image and destroy the list mutex.
+ *
+ * Under the lock each SCImageNode is freed through freeImageNode() and
+ * the list head is re-initialized. The mutex is then destroyed and the
+ * file-scope pointer cleared, so no stale handle is left behind until the
+ * next initSCImages().
+ */
+void VISIBILITY_HIDDEN
+releaseSCImages(void)
+{
+    IMAGES_LOCK();
+    listDoForEachSafe(&images, freeImageNode);
+    listInitHead(&images);
+    IMAGES_UNLOCK();
+    mutexDestroy(imagesLock);
+    imagesLock = NULL;
+}
+
+/*
+ * Hand out a registered scratch image to a device.
+ *
+ * ctx       stored in the search context
+ * devID     requesting device; images it already uses are not considered
+ * bestSize  preferred width * height
+ * minSize   smallest acceptable width * height
+ * minWidth  smallest acceptable width
+ *
+ * Picks the qualifying image whose size is closest to bestSize, records
+ * devID on its usingDevices list and retains the image object.
+ *
+ * Returns the image, or NULL when no image qualifies or the bookkeeping
+ * node cannot be allocated.
+ */
+cl_mem VISIBILITY_HIDDEN
+getSCImage(
+    cl_context ctx,
+    cl_device_id devID,
+    cl_ulong bestSize,
+    cl_ulong minSize,
+    size_t minWidth)
+{
+    SearchContext search;
+    SCImageNode *best;
+    DeviceNode *user;
+    cl_mem found = NULL;
+
+    search.ctx = ctx;
+    search.devID = devID;
+    search.bestSize = bestSize;
+    search.minSize = minSize;
+    search.minWidth = minWidth;
+    search.minExtraSize = (cl_ulong)1 << 63;
+    search.bestImgNode = NULL;
+
+    /* Allocated up front so that nothing is allocated while holding the lock */
+    user = malloc(sizeof(*user));
+    if (user == NULL) {
+        return NULL;
+    }
+
+    /*
+     * Look for the image that minimizes either the unused image space
+     * or the amount of data that does not fit.
+     */
+    IMAGES_LOCK();
+    listDoForEachPriv(&images, checkBestImage, &search);
+    best = search.bestImgNode;
+    if (best != NULL) {
+        found = best->image;
+        user->devID = devID;
+        listAddToTail(&best->usingDevices, &user->node);
+        clRetainMemObject(found);
+    }
+    IMAGES_UNLOCK();
+
+    /* Nothing handed out: the bookkeeping node is not needed */
+    if (found == NULL) {
+        free(user);
+    }
+
+    return found;
+}
+
+/*
+ * Give back an image obtained from getSCImage().
+ *
+ * If the image is still registered and devID is on its usingDevices list,
+ * that entry is unlinked under the lock and freed afterwards. The image
+ * object is released in every case.
+ */
+void VISIBILITY_HIDDEN
+putSCImage(cl_device_id devID, cl_mem image)
+{
+    ListNode *hit;
+    DeviceNode *user = NULL;
+
+    IMAGES_LOCK();
+    hit = listNodeSearch(&images, (const void*)&image, imageNodeCmp);
+    if (hit != NULL) {
+        SCImageNode *entry = container_of(hit, node, SCImageNode);
+
+        hit = listNodeSearch(&entry->usingDevices, (const void*)&devID,
+                             deviceNodeCmp);
+        if (hit != NULL) {
+            user = container_of(hit, node, DeviceNode);
+            listDel(hit);
+        }
+    }
+    IMAGES_UNLOCK();
+
+    /* Freed outside the critical section */
+    if (user != NULL) {
+        free(user);
+    }
+
+    clReleaseMemObject(image);
+}
+
+/*
+ * Create a scratch image and register it with the library.
+ *
+ * context  OpenCL context the image is created in
+ * width    image width
+ * height   image height
+ * status   optional (may be NULL); receives the outcome:
+ *            clblasNotInitialized   library not set up,
+ *            clblasSuccess          image registered, or scratch images
+ *                                   are disabled (nothing is created),
+ *            clblasOutOfHostMemory  bookkeeping node not allocated,
+ *            otherwise the clCreateImage2D() error cast to clblasStatus.
+ *
+ * Returns an identifier derived from the image handle, or 0 whenever no
+ * image was registered.
+ */
+cl_ulong
+clblasAddScratchImage(
+    cl_context context,
+    size_t width,
+    size_t height,
+    clblasStatus *status)
+{
+    clblasStatus result = clblasSuccess;
+    cl_ulong imageID = 0;
+    cl_int err;
+    cl_mem image;
+    SCImageNode *entry;
+
+    if (!clblasInitialized) {
+        result = clblasNotInitialized;
+        goto report;
+    }
+
+    /* Disabled scratch images are not an error: succeed with ID 0 */
+    if (!scratchImagesEnabled()) {
+        goto report;
+    }
+
+    image = clCreateImage2D(context, CL_MEM_READ_WRITE, &IMAGE_FORMAT,
+                            width, height, 0, NULL, &err);
+    if (err != CL_SUCCESS) {
+        result = (clblasStatus)err;
+        goto report;
+    }
+
+    entry = calloc(1, sizeof(SCImageNode));
+    if (entry == NULL) {
+        clReleaseMemObject(image);
+        result = clblasOutOfHostMemory;
+        goto report;
+    }
+    entry->image = image;
+    entry->width = width;
+    entry->height = height;
+    listInitHead(&entry->usingDevices);
+
+    IMAGES_LOCK();
+    /* A head with both links NULL has never been initialized */
+    if ((images.prev == NULL) && (images.next == NULL)) {
+        listInitHead(&images);
+    }
+    listAddToHead(&images, &(entry->node));
+    IMAGES_UNLOCK();
+
+    /* The handle value itself serves as the identifier */
+    imageID = (cl_ulong)(intptr_t)image;
+
+report:
+    if (status != NULL) {
+        *status = result;
+    }
+    return imageID;
+}
+
+/*
+ * Unregister and destroy a scratch image added with clblasAddScratchImage().
+ *
+ * imageID  identifier returned by clblasAddScratchImage()
+ *
+ * Returns clblasNotInitialized when the library is not set up,
+ * clblasSuccess when scratch images are disabled or the image was removed,
+ * and clblasInvalidValue when no registered image matches imageID.
+ */
+clblasStatus
+clblasRemoveScratchImage(
+    cl_ulong imageID)
+{
+    cl_mem image = (cl_mem)(intptr_t)imageID;
+    ListNode *hit;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    if (!scratchImagesEnabled()) {
+        return clblasSuccess;
+    }
+
+    /* Only the unlinking happens under the lock */
+    IMAGES_LOCK();
+    hit = listNodeSearch(&images, &image, imageNodeCmp);
+    if (hit != NULL) {
+        listDel(hit);
+    }
+    IMAGES_UNLOCK();
+
+    if (hit == NULL) {
+        return clblasInvalidValue;
+    }
+    freeImageNode(hit);
+
+    return clblasSuccess;
+}
diff --git a/src/library/blas/xasum.c b/src/library/blas/xasum.c
new file mode 100644
index 0000000..0a27c39
--- /dev/null
+++ b/src/library/blas/xasum.c
@@ -0,0 +1,259 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared implementation of the ASUM entry points (clblasSasum,
+ * clblasDasum, clblasScasum and clblasDzasum).
+ *
+ * After validating its arguments it runs two solution sequences on
+ * commandQueues[0]:
+ *   1. CLBLAS_ASUM - a reduction of type REDUCE_BY_SUM;
+ *   2. CLBLAS_REDUCTION_EPILOGUE - waits on the event of the first sequence
+ *      and receives the number of work-groups it spawned as its N.
+ * The result type is the real counterpart of kargs->dtype (TYPE_FLOAT for
+ * TYPE_COMPLEX_FLOAT, TYPE_DOUBLE for TYPE_COMPLEX_DOUBLE); the epilogue
+ * runs with that type.
+ *
+ * kargs                 arguments prepared by the caller (dtype, pigFuncID);
+ *                       the remaining fields used here are filled in below
+ * N                     number of elements in X
+ * asum, offAsum         output buffer and offset (checked to hold one
+ *                       element of the result type)
+ * X, offx, incx         input vector, its offset and increment
+ * scratchBuff           scratch buffer, checked to hold at least N
+ *                       elements of kargs->dtype
+ * numCommandQueues      must be non-zero; only commandQueues[0] is used
+ * commandQueues         must be non-NULL with a non-NULL first entry
+ * numEventsInWaitList,
+ * eventWaitList         events the first sequence waits on
+ * events                event storage handed to the epilogue sequence
+ *
+ * Returns clblasNotInitialized, the status of a failed argument check,
+ * clblasInvalidValue, clblasInvalidCommandQueue, clblasInvalidEventWaitList,
+ * or the result of building/executing the sequences cast to clblasStatus.
+ *
+ * NOTE(review): the intermediate event linking the two sequences is not
+ * released anywhere in this function - confirm who owns it.
+ */
+clblasStatus
+doAsum(
+    CLBlasKargs *kargs,
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    clblasStatus status;
+    ListHead firstPass, epilogue;
+    cl_event firstPassDone;
+    CLBlasKargs epilogueArgs;
+    ListNode *firstNode;
+    SolutionStep *firstStep;
+
+    /* Result type: the real counterpart of a complex input type */
+    DataType resultType = (kargs->dtype == TYPE_COMPLEX_FLOAT) ? TYPE_FLOAT:
+                            ((kargs->dtype == TYPE_COMPLEX_DOUBLE) ? TYPE_DOUBLE: kargs->dtype);
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Memory objects must be valid ... */
+    status = checkMemObjects(scratchBuff, asum, X, true,
+                             X_VEC_ERRSET, X_VEC_ERRSET, X_VEC_ERRSET);
+    if (status) {
+        printf("Invalid mem object..\n");
+        return status;
+    }
+
+    /* ... and large enough: X for N strided elements, */
+    status = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (status) {
+        printf("Invalid Size for X\n");
+        return status;
+    }
+    /* the scratch buffer for at least N elements, */
+    status = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET);
+    if (status) {
+        printf("Insufficient ScratchBuff\n");
+        return status;
+    }
+    /* and the result buffer for a single element of the result type. */
+    status = checkVectorSizes(resultType, 1, asum, offAsum, 1, X_VEC_ERRSET);
+    if (status) {
+        printf("Invalid Size for asum\n");
+        return status;
+    }
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+
+    /* No multi-GPU support for now: exactly one queue is ever used */
+    numCommandQueues = 1;
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    kargs->N = N;
+    kargs->A = asum;
+    kargs->offA = offAsum;
+    kargs->B = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;   /* ldb.vector doubles as incx */
+    if (incx < 1) {
+        /* Non-positive increment: shrink the problem to a single element */
+        kargs->N = 1;
+    }
+    kargs->D = scratchBuff;
+    kargs->redctnType = REDUCE_BY_SUM;
+
+    /* The epilogue starts from a snapshot of the first-pass arguments */
+    memcpy(&epilogueArgs, kargs, sizeof(CLBlasKargs));
+    epilogueArgs.dtype = resultType;
+
+    listInitHead(&firstPass);
+    err = makeSolutionSeq(CLBLAS_ASUM, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, &firstPassDone,
+                          &firstPass);
+    if (err != CL_SUCCESS) {
+        goto release_first;
+    }
+
+    err = executeSolutionSeq(&firstPass);
+    if (err != CL_SUCCESS) {
+        goto release_first;
+    }
+
+    /*
+     * The second kernel needs the number of work-groups used by the first
+     * one; it is read from the first step and passed on as N.
+     */
+    firstNode = listNodeFirst(&firstPass);
+    firstStep = container_of(firstNode, node, SolutionStep);
+    epilogueArgs.N = firstStep->pgran.numWGSpawned[0];     /* 1D block was used */
+
+    listInitHead(&epilogue);
+    err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &epilogueArgs,
+                          numCommandQueues, commandQueues,
+                          1, &firstPassDone, events, &epilogue);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&epilogue);
+    }
+    freeSolutionSeq(&epilogue);
+
+release_first:
+    freeSolutionSeq(&firstPass);
+    return (clblasStatus)err;
+}
+
+/*
+ * ASUM entry point for single-precision real vectors.
+ *
+ * Zero-fills a CLBlasKargs, tags it with TYPE_FLOAT and CLBLAS_ASUM and
+ * forwards every argument unchanged to doAsum(), whose status is returned.
+ */
+clblasStatus
+clblasSasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_ASUM
+    printf("SASUM Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_ASUM;
+    args.dtype = TYPE_FLOAT;
+
+    return doAsum(&args, N, asum, offAsum, X, offx, incx, scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ASUM entry point for double-precision real vectors.
+ *
+ * Zero-fills a CLBlasKargs, tags it with TYPE_DOUBLE and CLBLAS_ASUM and
+ * forwards every argument unchanged to doAsum(), whose status is returned.
+ */
+clblasStatus
+clblasDasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_ASUM
+    printf("DASUM called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_ASUM;
+    args.dtype = TYPE_DOUBLE;
+
+    return doAsum(&args, N, asum, offAsum, X, offx, incx, scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ASUM entry point for single-precision complex vectors.
+ *
+ * Zero-fills a CLBlasKargs, tags it with TYPE_COMPLEX_FLOAT and
+ * CLBLAS_ASUM and forwards every argument unchanged to doAsum(), whose
+ * status is returned.
+ */
+clblasStatus
+clblasScasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_ASUM
+    printf("SCASUM Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_FLOAT;
+    args.pigFuncID = CLBLAS_ASUM;
+
+    return doAsum(&args, N, asum, offAsum, X, offx, incx, scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ASUM entry point for double-precision complex vectors.
+ *
+ * Zero-fills a CLBlasKargs, tags it with TYPE_COMPLEX_DOUBLE and
+ * CLBLAS_ASUM and forwards every argument unchanged to doAsum(), whose
+ * status is returned.
+ *
+ * The debug trace is gated by DEBUG_ASUM, the same switch the other three
+ * ASUM entry points in this file use.
+ */
+clblasStatus
+clblasDzasum(
+    size_t N,
+    cl_mem asum,
+    size_t offAsum,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    #ifdef DEBUG_ASUM
+    printf("DZASUM Called\n");
+    #endif
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_ASUM;
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doAsum(&kargs, N, asum, offAsum, X, offx, incx, scratchBuff,
+                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xaxpy.c b/src/library/blas/xaxpy.c
new file mode 100644
index 0000000..7499c41
--- /dev/null
+++ b/src/library/blas/xaxpy.c
@@ -0,0 +1,243 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#define DEBUG_AXPY
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+/* doAxpy: back end shared by clblas{S,D,C,Z}axpy. Validates the arguments, stores them in kargs
+ * and runs the CLBLAS_AXPY solution sequence; returns the first failing status, if any. */
+clblasStatus
+doAxpy(
+    CLBlasKargs *kargs,
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments; diagnostics are printed only in DEBUG_AXPY builds */
+    retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_AXPY
+        printf("Invalid mem object..\n");
+        #endif
+        return retCode;
+    }
+
+    /* Check whether enough memory was allocated */
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
+        #ifdef DEBUG_AXPY
+        printf("Invalid Size for X\n");
+        #endif
+        return retCode;
+    }
+    if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
+        #ifdef DEBUG_AXPY
+        printf("Invalid Size for Y\n");
+        #endif
+        return retCode;
+    }
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+
+    /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */
+    numCommandQueues = 1;
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /*
+     * ASSUMPTION:
+     * doAxpy assumes "commandQueue" of 0. The same is reflected in
+     * "makeSolutionSeq" as well. If either of them changes in future,
+     * this code needs to be revisited.
+     */
+
+    kargs->N = N;
+    kargs->A = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;   /* ldb.vector is used to carry incx */
+    kargs->B = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;   /* ldc.vector is used to carry incy */
+
+    #ifdef DEBUG_AXPY
+    printf("Calling makeSolutionSeq from DoAxpy: AXPY\n");
+    #endif
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_AXPY, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    /* seq is freed on every path that reaches this point, success or failure */
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+
+
+
+
+/* clblasSaxpy: AXPY entry point for single-precision real data; stores alpha and calls doAxpy. */
+clblasStatus
+clblasSaxpy(
+    size_t N,
+    float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs axpyArgs;
+
+    #ifdef DEBUG_AXPY
+    printf("\nSAXPY Called\n");
+    #endif
+    /* Only the scalar and the element type are set here, on a zeroed argument block. */
+    memset(&axpyArgs, 0, sizeof(axpyArgs));
+    axpyArgs.alpha.argFloat = alpha;
+    axpyArgs.dtype = TYPE_FLOAT;
+
+    return doAxpy(&axpyArgs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+/* clblasDaxpy: AXPY entry point for double-precision real data; stores alpha and calls doAxpy. */
+clblasStatus
+clblasDaxpy(
+    size_t N,
+    double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs axpyArgs;
+
+    #ifdef DEBUG_AXPY
+    printf("\nDAXPY Called\n");
+    #endif
+    /* Only the scalar and the element type are set here, on a zeroed argument block. */
+    memset(&axpyArgs, 0, sizeof(axpyArgs));
+    axpyArgs.alpha.argDouble = alpha;
+    axpyArgs.dtype = TYPE_DOUBLE;
+
+    return doAxpy(&axpyArgs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+/* clblasCaxpy: AXPY entry point for single-precision complex data; stores alpha, calls doAxpy. */
+clblasStatus
+clblasCaxpy(
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int  incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs axpyArgs;
+
+    #ifdef DEBUG_AXPY
+    printf("\nCAXPY Called\n");
+    #endif
+    /* Only the scalar and the element type are set here, on a zeroed argument block. */
+    memset(&axpyArgs, 0, sizeof(axpyArgs));
+    axpyArgs.alpha.argFloatComplex = alpha;
+    axpyArgs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doAxpy(&axpyArgs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+/* clblasZaxpy: AXPY entry point for double-precision complex data; stores alpha, calls doAxpy. */
+clblasStatus
+clblasZaxpy(
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs axpyArgs;
+
+    #ifdef DEBUG_AXPY
+    printf("\nZAXPY Called\n");
+    #endif
+    /* Only the scalar and the element type are set here, on a zeroed argument block. */
+    memset(&axpyArgs, 0, sizeof(axpyArgs));
+    axpyArgs.alpha.argDoubleComplex = alpha;
+    axpyArgs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doAxpy(&axpyArgs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xcopy.c b/src/library/blas/xcopy.c
new file mode 100644
index 0000000..e0ea2a0
--- /dev/null
+++ b/src/library/blas/xcopy.c
@@ -0,0 +1,228 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#define DEBUG_COPY
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+/* doCopy: back end shared by clblas{S,D,C,Z}copy. Validates the arguments, stores them in kargs
+ * and runs the CLBLAS_COPY solution sequence; returns the first failing status, if any. */
+clblasStatus
+doCopy(
+    CLBlasKargs *kargs,
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments; diagnostics are printed only in DEBUG_COPY builds */
+    retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_COPY
+        printf("Invalid mem object..\n");
+        #endif
+        return retCode;
+    }
+
+    /* Check whether enough memory was allocated */
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
+        #ifdef DEBUG_COPY
+        printf("Invalid Size for X\n");
+        #endif
+        return retCode;
+    }
+    if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
+        #ifdef DEBUG_COPY
+        printf("Invalid Size for Y\n");
+        #endif
+        return retCode;
+    }
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+
+    /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */
+    numCommandQueues = 1;
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    kargs->N = N;
+    kargs->A = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;   /* ldb.vector is used to carry incx */
+    kargs->B = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;   /* ldc.vector is used to carry incy */
+
+    #ifdef DEBUG_COPY
+    printf("Calling makeSolutionSeq from DoCopy: COPY\n");
+    #endif
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_COPY, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    /* seq is freed on every path that reaches this point, success or failure */
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+
+
+
+
+/* clblasScopy: COPY entry point for single-precision real data; defers all work to doCopy. */
+clblasStatus
+clblasScopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs copyArgs;
+
+    #ifdef DEBUG_COPY
+    printf("\nSCOPY Called\n");
+    #endif
+    /* The element type is the only field set here, on a zeroed argument block. */
+    memset(&copyArgs, 0, sizeof(copyArgs));
+    copyArgs.dtype = TYPE_FLOAT;
+
+    return doCopy(&copyArgs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+/* clblasDcopy: COPY entry point for double-precision real data; defers all work to doCopy. */
+clblasStatus
+clblasDcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs copyArgs;
+
+    #ifdef DEBUG_COPY
+    printf("\nDCOPY Called\n");
+    #endif
+    /* The element type is the only field set here, on a zeroed argument block. */
+    memset(&copyArgs, 0, sizeof(copyArgs));
+    copyArgs.dtype = TYPE_DOUBLE;
+
+    return doCopy(&copyArgs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+/* clblasCcopy: COPY entry point for single-precision complex data; defers all work to doCopy. */
+clblasStatus
+clblasCcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int  incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs copyArgs;
+
+    #ifdef DEBUG_COPY
+    printf("\nCCOPY Called\n");
+    #endif
+    /* The element type is the only field set here, on a zeroed argument block. */
+    memset(&copyArgs, 0, sizeof(copyArgs));
+    copyArgs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doCopy(&copyArgs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+/* clblasZcopy: COPY entry point for double-precision complex data; defers all work to doCopy. */
+clblasStatus
+clblasZcopy(
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs copyArgs;
+
+    #ifdef DEBUG_COPY
+    printf("\nZCOPY Called\n");
+    #endif
+    /* The element type is the only field set here, on a zeroed argument block. */
+    memset(&copyArgs, 0, sizeof(copyArgs));
+    copyArgs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doCopy(&copyArgs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xdot.c b/src/library/blas/xdot.c
new file mode 100644
index 0000000..f29cdb6
--- /dev/null
+++ b/src/library/blas/xdot.c
@@ -0,0 +1,350 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+/* doDot: back end shared by the DOT entry points. Runs the CLBLAS_DOT pass, then a reduction pass. */
+clblasStatus
+doDot(
+    CLBlasKargs *kargs,
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    int doConj,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq, seq2;
+    clblasStatus retCode = clblasSuccess;
+    cl_event firstDotCall;
+    CLBlasKargs redctnArgs;
+    ListNode *listNodePtr;
+    SolutionStep *step;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments. The second check runs only when the first one passed, so the
+     * status returned is a single error code and never two codes OR-ed together. */
+    retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (retCode == clblasSuccess) {
+        retCode = checkMemObjects(scratchBuff, dotProduct, X, false, X_VEC_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    }
+    if (retCode) {
+        #ifdef DEBUG_DOT
+        printf("Invalid mem object..\n");
+        #endif
+        return retCode;
+    }
+
+    /* Check whether enough memory was allocated */
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
+        #ifdef DEBUG_DOT
+        printf("Invalid Size for X\n");
+        #endif
+        return retCode;
+    }
+    if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
+        #ifdef DEBUG_DOT
+        printf("Invalid Size for Y\n");
+        #endif
+        return retCode;
+    }
+    /* Minimum size of scratchBuff is N */
+    if ((retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET))) {
+        #ifdef DEBUG_DOT
+        printf("Insufficient ScratchBuff\n");
+        #endif
+        return retCode;
+    }
+    if ((retCode = checkVectorSizes(kargs->dtype, 1, dotProduct, offDP, 1, Y_VEC_ERRSET))) {
+        #ifdef DEBUG_DOT
+        printf("Invalid Size for dotProduct\n");
+        #endif
+        return retCode;
+    }
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+
+    /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */
+    numCommandQueues = 1;
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    kargs->N = N;
+    kargs->A = dotProduct;
+    kargs->offA = offDP;
+    kargs->offa = offDP;
+    kargs->B = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;   /* ldb.vector is used to carry incx */
+    kargs->C = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;   /* ldc.vector is used to carry incy */
+    kargs->D = scratchBuff;
+    kargs->redctnType = REDUCE_BY_SUM;
+    kargs->K = (size_t)doConj;  /* the doConj flag travels in the K field */
+    memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs));    /* snapshot for the second pass */
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_DOT, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, &firstDotCall, &seq);
+    if (err == CL_SUCCESS) {
+        /* The second kernel call needs to know the number of work-groups used in the
+         * first kernel call; it is read back from the first step of the executed
+         * sequence and passed as N to the reduction kernel. */
+        err = executeSolutionSeq(&seq);
+        if (err == CL_SUCCESS) {
+            listNodePtr = listNodeFirst(&seq);        /* Get the node */
+            step = container_of(listNodePtr, node, SolutionStep);
+            redctnArgs.N = step->pgran.numWGSpawned[0];     /* 1D block was used */
+            listInitHead(&seq2);
+            err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues,
+                                  commandQueues, 1, &firstDotCall, events, &seq2);
+            if (err == CL_SUCCESS) {
+                err = executeSolutionSeq(&seq2);
+            }
+            freeSolutionSeq(&seq2);
+        }
+    }
+    /* NOTE(review): firstDotCall is not released anywhere in this function - confirm ownership. */
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+/* clblasSdot: DOT entry point for single-precision real data; forwards to doDot. */
+clblasStatus
+clblasSdot(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs dotArgs;
+
+    #ifdef DEBUG_DOT
+    printf("SDOT Called\n");
+    #endif
+    /* Begin with a zeroed argument block, then tag the routine and the element type. */
+    memset(&dotArgs, 0, sizeof(dotArgs));
+    dotArgs.pigFuncID = CLBLAS_DOT;
+    dotArgs.dtype = TYPE_FLOAT;
+
+    /* The literal 0 after scratchBuff is doDot's doConj flag, cleared for this routine. */
+    return doDot(&dotArgs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, 0,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* clblasDdot: DOT entry point for double-precision real data; forwards to doDot. */
+clblasStatus
+clblasDdot(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs dotArgs;
+
+    #ifdef DEBUG_DOT
+    printf("DDOT called\n");
+    #endif
+    /* Begin with a zeroed argument block, then tag the routine and the element type. */
+    memset(&dotArgs, 0, sizeof(dotArgs));
+    dotArgs.pigFuncID = CLBLAS_DOT;
+    dotArgs.dtype = TYPE_DOUBLE;
+
+    /* The literal 0 after scratchBuff is doDot's doConj flag, cleared for this routine. */
+    return doDot(&dotArgs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, 0,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* clblasCdotu: DOT entry point for single-precision complex data, doConj cleared. */
+clblasStatus
+clblasCdotu(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs dotArgs;
+
+    #ifdef DEBUG_DOT
+    printf("CDOTU Called\n");
+    #endif
+    /* Begin with a zeroed argument block, then tag the element type and the routine. */
+    memset(&dotArgs, 0, sizeof(dotArgs));
+    dotArgs.dtype = TYPE_COMPLEX_FLOAT;
+    dotArgs.pigFuncID = CLBLAS_DOT;
+
+    /* The literal 0 after scratchBuff is doDot's doConj flag, cleared for this routine. */
+    return doDot(&dotArgs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, 0,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* clblasZdotu: DOT entry point for double-precision complex data, doConj cleared. */
+clblasStatus
+clblasZdotu(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs dotArgs;
+
+    #ifdef DEBUG_DOT
+    printf("ZDOTU Called\n");
+    #endif
+    /* Begin with a zeroed argument block, then tag the element type and the routine. */
+    memset(&dotArgs, 0, sizeof(dotArgs));
+    dotArgs.dtype = TYPE_COMPLEX_DOUBLE;
+    dotArgs.pigFuncID = CLBLAS_DOT;
+
+    /* The literal 0 after scratchBuff is doDot's doConj flag, cleared for this routine. */
+    return doDot(&dotArgs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, 0,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* clblasCdotc: DOT entry point for single-precision complex data with the doConj flag set. */
+clblasStatus
+clblasCdotc(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    int doConj;
+    #ifdef DEBUG_DOT
+    printf("CDOTC Called\n");
+    #endif
+    /* Zeroed argument block tagged with routine and element type; doConj = 1 goes to doDot. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_DOT;
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+    doConj = 1;
+
+    return doDot(&kargs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, doConj,
+                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* clblasZdotc: DOT entry point for double-precision complex data with the doConj flag set. */
+clblasStatus
+clblasZdotc(
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    int doConj;
+    #ifdef DEBUG_DOT
+    printf("ZDOTC Called\n");
+    #endif
+    /* Zeroed argument block tagged with routine and element type; doConj = 1 goes to doDot. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_DOT;
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+    doConj = 1;
+
+    return doDot(&kargs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, doConj,
+                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xgbmv.c b/src/library/blas/xgbmv.c
new file mode 100644
index 0000000..205f8eb
--- /dev/null
+++ b/src/library/blas/xgbmv.c
@@ -0,0 +1,276 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+/* doGbmv: back end shared by clblas{S,D,C,Z}gbmv (validate, fill kargs, run CLBLAS_GBMV). */
+static clblasStatus
+doGbmv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    /* Element counts used to size-check x and y; they swap when A is transposed. */
+    const size_t lenX = (transA == clblasNoTrans) ? N : M;
+    const size_t lenY = (transA == clblasNoTrans) ? M : N;
+    clblasStatus status = clblasSuccess;
+    ListHead steps;
+    cl_int rc;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Queue and wait-list sanity checks come before any buffer is inspected. */
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return clblasInvalidValue;
+    }
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Validate arguments: memory objects first, then the sizes of A, x and y in that order. */
+    status = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (status) {
+        return status;
+    }
+    /* The banded-size check is always handed clblasNoTrans, never transA. */
+    status = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans,
+                                    M, N, KL, KU, A, offa, lda, A_MAT_ERRSET);
+    if (status) {
+        return status;
+    }
+    status = checkVectorSizes(kargs->dtype, lenX, x, offx, incx, X_VEC_ERRSET);
+    if (status) {
+        return status;
+    }
+    status = checkVectorSizes(kargs->dtype, lenY, y, offy, incy, Y_VEC_ERRSET);
+    if (status) {
+        return status;
+    }
+
+    /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */
+    numCommandQueues = 1;
+
+    kargs->order = order;
+    kargs->transA = transA;
+    kargs->M = M;
+    kargs->N = N;
+    kargs->KL = KL;
+    kargs->KU = KU;
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->offA = offa;
+    kargs->offa = offa;
+    /* Vectors: x travels in the B slot, y in the C slot; increments go in ldb/ldc. */
+    kargs->B = x;
+    kargs->ldb.vector = incx;
+    kargs->offBX = offx;
+    kargs->C = y;
+    kargs->ldc.vector = incy;
+    kargs->offCY = offy;
+
+    listInitHead(&steps);
+    rc = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues,
+                         numEventsInWaitList, eventWaitList, events, &steps);
+    if (rc == CL_SUCCESS) {
+        rc = executeSolutionSeq(&steps);
+    }
+    freeSolutionSeq(&steps);
+
+    return (clblasStatus)rc;
+}
+/* clblasSgbmv: GBMV entry point for single-precision real data; forwards to doGbmv. */
+clblasStatus
+clblasSgbmv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs gbmvArgs;
+    /* Zeroed argument block carrying the two scalars, the routine id and the element type. */
+    memset(&gbmvArgs, 0, sizeof(gbmvArgs));
+    gbmvArgs.alpha.argFloat = alpha;
+    gbmvArgs.beta.argFloat = beta;
+    gbmvArgs.pigFuncID = CLBLAS_GBMV;
+    gbmvArgs.dtype = TYPE_FLOAT;
+
+    return doGbmv(&gbmvArgs, order, transA, M, N, KL, KU,
+                  A, offa, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* clblasDgbmv: GBMV entry point for double-precision real data; forwards to doGbmv. */
+clblasStatus
+clblasDgbmv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs gbmvArgs;
+    /* Zeroed argument block carrying the two scalars, the routine id and the element type. */
+    memset(&gbmvArgs, 0, sizeof(gbmvArgs));
+    gbmvArgs.alpha.argDouble = alpha;
+    gbmvArgs.beta.argDouble = beta;
+    gbmvArgs.pigFuncID = CLBLAS_GBMV;
+    gbmvArgs.dtype = TYPE_DOUBLE;
+
+    return doGbmv(&gbmvArgs, order, transA, M, N, KL, KU,
+                  A, offa, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* clblasCgbmv: GBMV entry point for single-precision complex data; forwards to doGbmv. */
+clblasStatus
+clblasCgbmv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float2 beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs gbmvArgs;
+    /* Zeroed argument block carrying the two scalars, the routine id and the element type. */
+    memset(&gbmvArgs, 0, sizeof(gbmvArgs));
+    gbmvArgs.alpha.argFloatComplex = alpha;
+    gbmvArgs.beta.argFloatComplex = beta;
+    gbmvArgs.pigFuncID = CLBLAS_GBMV;
+    gbmvArgs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doGbmv(&gbmvArgs, order, transA, M, N, KL, KU,
+                  A, offa, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* clblasZgbmv: GBMV entry point for double-precision complex data; forwards to doGbmv. */
+clblasStatus
+clblasZgbmv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double2 beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs gbmvArgs;
+    /* Zeroed argument block carrying the two scalars, the routine id and the element type. */
+    memset(&gbmvArgs, 0, sizeof(gbmvArgs));
+    gbmvArgs.alpha.argDoubleComplex = alpha;
+    gbmvArgs.beta.argDoubleComplex = beta;
+    gbmvArgs.pigFuncID = CLBLAS_GBMV;
+    gbmvArgs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doGbmv(&gbmvArgs, order, transA, M, N, KL, KU,
+                  A, offa, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+
+
diff --git a/src/library/blas/xgemm.c b/src/library/blas/xgemm.c
new file mode 100644
index 0000000..895c99d
--- /dev/null
+++ b/src/library/blas/xgemm.c
@@ -0,0 +1,259 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common GEMM driver behind the four precision-specific entry points.
+ * Checks the memory objects and matrix extents, records the problem in
+ * *kargs, then builds and executes the CLBLAS_GEMM solution sequence.
+ * Returns clblasNotInitialized or the first failed check, else the sequence status. */
+static clblasStatus doGemm(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus status;
+    ListHead steps;
+    cl_int rc;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validation stops at the first failing check; A and B are skipped when K is 0. */
+    status = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET);
+    if (!status && K != 0) {
+        status = checkMatrixSizes(kargs->dtype, order, transA, M, K,
+                                  A, offA, lda, A_MAT_ERRSET);
+        if (!status) {
+            status = checkMatrixSizes(kargs->dtype, order, transB, K, N,
+                                      B, offB, ldb, B_MAT_ERRSET);
+        }
+    }
+    if (!status) {
+        status = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N,
+                                  C, offC, ldc, C_MAT_ERRSET);
+    }
+    if (status) {
+        return status;
+    }
+
+#ifdef DEBUG_2
+    printf("DoGemm being called...\n");
+#endif
+    /* Record the problem in the argument block. */
+    kargs->order = order;
+    kargs->transA = transA;
+    kargs->transB = transB;
+    kargs->M = M;
+    kargs->N = N;
+    kargs->K = K;
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = B;
+    kargs->offBX = offB;
+    kargs->ldb.matrix = ldb;
+    kargs->C = C;
+    kargs->offCY = offC;
+    kargs->ldc.matrix = ldc;
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+    kargs->scimage[0] = 0;
+    kargs->scimage[1] = 0;
+
+    listInitHead(&steps);
+    rc = makeSolutionSeq(CLBLAS_GEMM, kargs, numCommandQueues, commandQueues,
+                         numEventsInWaitList, eventWaitList, events, &steps);
+    if (rc == CL_SUCCESS) {
+        rc = executeSolutionSeq(&steps);
+    }
+    freeSolutionSeq(&steps);
+    return (clblasStatus)rc;
+}
+
+/* clblasSgemm: single-precision real GEMM entry point.
+ * Sets the element type and scalars in a zeroed record, then defers to doGemm(). */
+clblasStatus clblasSgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    /* Start from an all-zero record; only the fields below are set here. */
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_FLOAT;
+    args.beta.argFloat = beta;
+    args.alpha.argFloat = alpha;
+    return doGemm(&args, order, transA, transB, M, N, K,
+                  A, offA, lda, B, offB, ldb, C, offC, ldc,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* clblasDgemm: double-precision real GEMM entry point.
+ * Sets the element type and scalars in a zeroed record, then defers to doGemm(). */
+clblasStatus clblasDgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    /* Start from an all-zero record; only the fields below are set here. */
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_DOUBLE;
+    args.beta.argDouble = beta;
+    args.alpha.argDouble = alpha;
+    return doGemm(&args, order, transA, transB, M, N, K,
+                  A, offA, lda, B, offB, ldb, C, offC, ldc,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* clblasCgemm: single-precision complex GEMM entry point.
+ * Sets the element type and scalars in a zeroed record, then defers to doGemm(). */
+clblasStatus clblasCgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    FloatComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    /* Start from an all-zero record; only the fields below are set here. */
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_FLOAT;
+    args.beta.argFloatComplex = beta;
+    args.alpha.argFloatComplex = alpha;
+    return doGemm(&args, order, transA, transB, M, N, K,
+                  A, offA, lda, B, offB, ldb, C, offC, ldc,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* clblasZgemm: double-precision complex GEMM entry point.
+ * Sets the element type and scalars in a zeroed record, then defers to doGemm(). */
+clblasStatus clblasZgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    /* Start from an all-zero record; only the fields below are set here. */
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+    args.beta.argDoubleComplex = beta;
+    args.alpha.argDoubleComplex = alpha;
+    return doGemm(&args, order, transA, transB, M, N, K,
+                  A, offA, lda, B, offB, ldb, C, offC, ldc,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xgemm2.c b/src/library/blas/xgemm2.c
new file mode 100644
index 0000000..0a5ae43
--- /dev/null
+++ b/src/library/blas/xgemm2.c
@@ -0,0 +1,542 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+//#define DEBUG_GEMM_2
+
+/*
+ * M-direction tail test.  For column-major data with a non-transposed A the
+ * result is M % vecLen; for a transposed A it is 0.  Row-major data is not
+ * handled: a diagnostic is printed and 0 is returned.  transB is ignored.
+ */
+int
+gemmHasMTail(size_t M,  int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB)
+{
+    (void)transB;   /* intentionally unused */
+
+    if (order != clblasColumnMajor) {
+        printf("gemmHasMTail: Not handling Row Major - FIXME\n");
+        return 0;
+    }
+
+    return (transA == clblasNoTrans) ? (int)(M % vecLen) : 0;
+}
+
+/*
+ * N-direction tail test.  For column-major data the result is 0 when B is
+ * not transposed and N % vecLen when it is; transA does not change the
+ * outcome.  Row-major data is not handled: a diagnostic is printed and 0 is
+ * returned.
+ */
+int
+gemmHasNTail(size_t N, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB)
+{
+    int tail = 0;
+
+    (void)transA;   /* both transA cases reduce to the same test on transB */
+
+    switch (order) {
+    case clblasColumnMajor:
+        if (transB != clblasNoTrans) {
+            tail = (int)(N % vecLen);
+        }
+        break;
+    default:
+        /* Any order other than column-major is reported and treated as "no tail". */
+        printf("gemmHasNTail: Not handling Row Major - FIXME\n");
+        break;
+    }
+    return tail;
+}
+
+/*
+ * Combined M/N tail test for column-major GEMM; non-zero means a tail exists.
+ *   A not transposed, B not transposed : M % vecLen
+ *   A not transposed, B transposed     : (M % vecLen) || (N % vecLen)
+ *   A transposed,     B not transposed : 0
+ *   A transposed,     B transposed     : N % vecLen
+ * Row-major data is not handled: a diagnostic is printed and 0 is returned.
+ * K is ignored.
+ */
+int
+gemmHasTails(size_t M,  size_t N, size_t K, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB)
+{
+    const int aPlain = (transA == clblasNoTrans);
+    const int bPlain = (transB == clblasNoTrans);
+
+    (void)K;
+    if (order != clblasColumnMajor) {
+        printf("gemmHasTails: Not handling Row Major - FIXME\n");
+        return 0;
+    }
+    if (aPlain) {
+        return bPlain ? (int)(M % vecLen)
+                      : ((M % vecLen) || (N % vecLen));
+    }
+    if (bPlain) {
+        /* Vectoring on A is on the K dimension; the kernel handles that tail directly. */
+        return 0;
+    }
+    return (int)(N % vecLen);
+}
+
+/*
+ * Runs the CLBLAS_GEMM2 solution sequence for *kargs and, when gemmHasTails()
+ * reports a remainder for the kernel's vector length, a CLBLAS_GEMM_TAIL
+ * sequence starting at tailStartM/tailStartN.
+ * Without tails the first step reports completion through `events`.  With
+ * tails the main sequence is given the local `nontail` event instead; the
+ * tail sequence waits on it and is given `events`.
+ * Returns the status of the first call that failed, CL_SUCCESS otherwise.
+ */
+clblasStatus executeGEMM( CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList,
+                             const cl_event *eventWaitList, cl_event *events)
+{
+    cl_int err = CL_SUCCESS;
+    ListHead seq, tailSeq;
+    cl_event nontail = NULL;      /* stays NULL unless the main sequence sets it */
+    size_t tailStartM = 0, tailStartN = 0;
+    bool processTails = false;
+    size_t M = kargs->M, N = kargs->N, K = kargs->K;
+
+#ifdef DEBUG_GEMM_2
+    printf("executeGEMM Called\n");
+#endif
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_GEMM2, kargs, numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, &nontail, &seq);
+    if (err == CL_SUCCESS) {
+        ListNode *f = listNodeFirst(&seq);
+        SolutionStep *gemm2 = container_of(f, node, SolutionStep);
+        CLBLASKernExtra *kextra = gemm2->kernels[CLBLAS_COMPUTING_KERNEL]->extra;
+        cl_uint gemmVeclen = kextra->vecLen;
+
+        if (gemmHasTails(M, N, K, gemmVeclen, kargs->order, kargs->transA, kargs->transB) == 0) {
+#ifdef DEBUG_GEMM_2
+            printf("No M or N Tails to process..\n");
+#endif
+            /* Single pass: point the step at the caller's event instead of `nontail`. */
+            gemm2->event = events;
+        } else {
+            processTails = true;
+            /* A dimension without a tail gets a tail start equal to its full size. */
+            tailStartM = gemmHasMTail(M, gemmVeclen, kargs->order, kargs->transA, kargs->transB)
+                             ? M - (M % gemmVeclen) : M;
+            tailStartN = gemmHasNTail(N, gemmVeclen, kargs->order, kargs->transA, kargs->transB)
+                             ? N - (N % gemmVeclen) : N;
+        }
+        err = executeSolutionSeq(&seq);
+        if ((err == CL_SUCCESS) && processTails) {
+            CLBlasKargs targs;
+
+            memcpy(&targs, &gemm2->args, sizeof(CLBlasKargs));
+            targs.tailStartM = tailStartM;
+            targs.tailStartN = tailStartN;
+#ifdef DEBUG_GEMM_2
+            printf("Processing Tails\n");
+#endif
+            listInitHead(&tailSeq);
+            err = makeSolutionSeq(CLBLAS_GEMM_TAIL, &targs, numCommandQueues, commandQueues,
+                                  1, &nontail, events, &tailSeq);
+            if (err == CL_SUCCESS) {
+                err = executeSolutionSeq(&tailSeq);
+            }
+            freeSolutionSeq(&tailSeq);
+        }
+    }
+    freeSolutionSeq(&seq);
+    /*
+     * `nontail` is never handed to the caller, so this function owns the only
+     * user reference; it used to be leaked on every call that processed tails.
+     */
+    if (processTails && nontail != NULL) {
+        (void)clReleaseEvent(nontail);
+    }
+    return (clblasStatus) err;
+}
+
+/*
+ * GEMM2 flavour of the common GEMM driver.  Checks the memory objects and
+ * matrix extents, records the problem in *kargs (tagged CLBLAS_GEMM2) and
+ * hands it to executeGEMM().
+ * Returns clblasNotInitialized or the first failed check, else executeGEMM()'s status. */
+static clblasStatus doGemm(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus status;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validation stops at the first failing check; A and B are skipped when K is 0. */
+    status = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET);
+    if (!status && K != 0) {
+        status = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET );
+        if (!status) {
+            status = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET );
+        }
+    }
+    if (!status) {
+        status = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET );
+    }
+    if (status) {
+        return status;
+    }
+
+    /* The caller's queue count is overridden: executeGEMM() is always given 1. */
+    numCommandQueues = 1;
+#ifdef DEBUG_2
+    printf("DoGemm being called...\n");
+#endif
+    kargs->pigFuncID = CLBLAS_GEMM2;
+    kargs->order = order;
+    kargs->transA = transA;
+    kargs->transB = transB;
+    kargs->M = M;
+    kargs->N = N;
+    kargs->K = K;
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->offa = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = B;
+    kargs->offBX = offB;
+    kargs->ldb.matrix = ldb;
+    kargs->C = C;
+    kargs->offCY = offC;
+    kargs->ldc.matrix = ldc;
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+    kargs->scimage[0] = 0;
+    kargs->scimage[1] = 0;
+
+    return executeGEMM(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+clblasStatus
+clblasSgemmV2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem B,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_FLOAT;
+    kargs.alpha.argFloat = alpha;
+    kargs.beta.argFloat = beta;
+
+    return doGemm(&kargs, order, transA, transB, M, N, K, A, 0, lda, B, 0, ldb,
+                  C, 0, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+clblasStatus
+clblasDgemmV2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem B,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_DOUBLE;
+    kargs.alpha.argDouble = alpha;
+    kargs.beta.argDouble = beta;
+
+    return doGemm(&kargs, order, transA, transB, M, N, K, A, 0, lda, B, 0, ldb,
+                  C, 0, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+clblasStatus
+clblasCgemmV2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem B,
+    size_t ldb,
+    FloatComplex beta,
+    cl_mem C,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+    kargs.alpha.argFloatComplex = alpha;
+    kargs.beta.argFloatComplex = beta;
+
+    return doGemm(&kargs, order, transA, transB, M, N, K, A, 0, lda, B, 0, ldb,
+                  C, 0, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+clblasStatus
+clblasZgemmV2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t lda,
+    const cl_mem B,
+    size_t ldb,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+    kargs.alpha.argDoubleComplex = alpha;
+    kargs.beta.argDoubleComplex = beta;
+
+    return doGemm(&kargs, order, transA, transB, M, N, K, A, 0, lda, B, 0, ldb,
+                  C, 0, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+clblasStatus
+clblasSgemmExV2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+	size_t offA,
+    size_t lda,
+    const cl_mem B,
+	size_t offB,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+	size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_FLOAT;
+    kargs.alpha.argFloat = alpha;
+    kargs.beta.argFloat = beta;
+
+    return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda, B, offB, ldb,
+                  C, offC, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+clblasStatus
+clblasDgemmExV2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+	size_t offA,
+    size_t lda,
+    const cl_mem B,
+	size_t offB,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+	size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_DOUBLE;
+    kargs.alpha.argDouble = alpha;
+    kargs.beta.argDouble = beta;
+
+    return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda, B, offB, ldb,
+                  C, offC, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+clblasStatus
+clblasCgemmExV2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+	size_t offA,
+    size_t lda,
+    const cl_mem B,
+	size_t offB,
+    size_t ldb,
+    FloatComplex beta,
+    cl_mem C,
+	size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+    kargs.alpha.argFloatComplex = alpha;
+    kargs.beta.argFloatComplex = beta;
+
+    return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda, B, offB, ldb,
+                  C, offC, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+clblasStatus
+clblasZgemmExV2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+	size_t offA,
+    size_t lda,
+    const cl_mem B,
+	size_t offB,
+    size_t ldb,
+    DoubleComplex beta,
+    cl_mem C,
+	size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+    kargs.alpha.argDoubleComplex = alpha;
+    kargs.beta.argDoubleComplex = beta;
+
+    return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda, B, offB, ldb,
+                  C, offC, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+*/
diff --git a/src/library/blas/xgemv.c b/src/library/blas/xgemv.c
new file mode 100644
index 0000000..5999979
--- /dev/null
+++ b/src/library/blas/xgemv.c
@@ -0,0 +1,243 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common GEMV driver behind the four precision-specific entry points.
+ * Checks the memory objects, the M x N matrix A and both vectors, records the
+ * problem in *kargs, then builds and executes the CLBLAS_GEMV sequence.
+ * Returns clblasNotInitialized or the first failed check, else the sequence status. */
+static clblasStatus doGemv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const int noTrans = (transA == clblasNoTrans);
+    const size_t xLen = noTrans ? N : M;    /* element count checked for x */
+    const size_t yLen = noTrans ? M : N;    /* element count checked for y */
+    clblasStatus status;
+    ListHead steps;
+    cl_int rc;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validation stops at the first failing check. */
+    status = checkMemObjects( A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET );
+    if (!status) {
+        /* A is always checked as an M x N matrix, whatever transA says. */
+        status = checkMatrixSizes(kargs->dtype, order, clblasNoTrans,
+                                  M, N, A, offA, lda, A_MAT_ERRSET );
+    }
+    if (!status) {
+        status = checkVectorSizes(kargs->dtype, xLen, x, offx, incx, X_VEC_ERRSET );
+    }
+    if (!status) {
+        status = checkVectorSizes(kargs->dtype, yLen, y, offy, incy, Y_VEC_ERRSET);
+    }
+    if (status) {
+        return status;
+    }
+
+    kargs->order = order;
+    kargs->transA = transA;
+    kargs->M = M;
+    kargs->N = N;
+    /* FIXME (from the original): K holds the row count of op(A); it deserves its own field. */
+    kargs->K = yLen;
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = x;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;
+    kargs->C = y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;
+
+    listInitHead(&steps);
+    rc = makeSolutionSeq(CLBLAS_GEMV, kargs, numCommandQueues, commandQueues,
+                         numEventsInWaitList, eventWaitList, events, &steps);
+    if (rc == CL_SUCCESS) {
+        rc = executeSolutionSeq(&steps);
+    }
+    freeSolutionSeq(&steps);
+    return (clblasStatus)rc;
+}
+
+/* clblasSgemv: single-precision real GEMV entry point.
+ * Sets the element type and scalars in a zeroed record, then defers to doGemv(). */
+clblasStatus clblasSgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    /* Start from an all-zero record; only the fields below are set here. */
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_FLOAT;
+    args.beta.argFloat = beta;
+    args.alpha.argFloat = alpha;
+    return doGemv(&args, order, transA, M, N,
+                  A, offA, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* clblasDgemv: double-precision real GEMV entry point.
+ * Sets the element type and scalars in a zeroed record, then defers to doGemv(). */
+clblasStatus clblasDgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    /* Start from an all-zero record; only the fields below are set here. */
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_DOUBLE;
+    args.beta.argDouble = beta;
+    args.alpha.argDouble = alpha;
+    return doGemv(&args, order, transA, M, N,
+                  A, offA, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* clblasCgemv: single-precision complex GEMV entry point.
+ * Sets the element type and scalars in a zeroed record, then defers to doGemv(). */
+clblasStatus clblasCgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    /* Start from an all-zero record; only the fields below are set here. */
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_FLOAT;
+    args.beta.argFloatComplex = beta;
+    args.alpha.argFloatComplex = alpha;
+    return doGemv(&args, order, transA, M, N,
+                  A, offA, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* clblasZgemv: double-precision complex GEMV entry point.
+ * Sets the element type and scalars in a zeroed record, then defers to doGemv(). */
+clblasStatus clblasZgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    /* Start from an all-zero record; only the fields below are set here. */
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+    args.beta.argDoubleComplex = beta;
+    args.alpha.argDoubleComplex = alpha;
+    return doGemv(&args, order, transA, M, N,
+                  A, offA, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xger.c b/src/library/blas/xger.c
new file mode 100644
index 0000000..92d4b31
--- /dev/null
+++ b/src/library/blas/xger.c
@@ -0,0 +1,368 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#define DEBUG_GER
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+
+/*
+ * Shared driver behind the clblas{S,D}ger, clblas{C,Z}geru and clblas{C,Z}gerc
+ * entry points: validates the arguments, copies them into *kargs (dtype and
+ * alpha are already set by the caller) and runs a CLBLAS_GER solution sequence.
+ * Returns clblasNotInitialized, the status of the first failing check, or the
+ * makeSolutionSeq()/executeSolutionSeq() result cast to clblasStatus.
+ */
+clblasStatus
+doGer(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    int doConj,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead steps;
+    cl_int status;
+    clblasStatus check;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Buffer handles first; every failed check is also reported on stdout. */
+    check = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (check) {
+        printf("Invalid mem object..\n");
+        return check;
+    }
+
+    /* Size checks: A as an M x N matrix, X with M elements, Y with N elements. */
+    check = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET);
+    if (check) {
+        printf("Invalid Size for A %d\n", check);
+        return check;
+    }
+    check = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET);
+    if (check) {
+        printf("Invalid Size for X\n");
+        return check;
+    }
+    check = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET);
+    if (check) {
+        printf("Invalid Size for Y\n");
+        return check;
+    }
+
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return clblasInvalidValue;
+    }
+
+    /* No multi-GPU support yet, so the queue count is pinned to one. */
+    numCommandQueues = 1;
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Matrix A. */
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->offa = offa;
+    kargs->offA = offa;
+    /* X rides in the B slot; ldb.vector holds incx. */
+    kargs->B = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;
+    /* Y rides in the C slot; ldc.vector holds incy. */
+    kargs->C = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;
+    /* Problem shape; K is reused to carry the doConj flag. */
+    kargs->order = order;
+    kargs->M = M;
+    kargs->N = N;
+    kargs->K = (size_t)doConj;
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+    kargs->scimage[0] = 0;
+    kargs->scimage[1] = 0;
+
+#ifdef DEBUG_GER
+    printf("Calling makeSolutionSeq from DoGer: GER\n");
+#endif
+
+    listInitHead(&steps);
+    status = makeSolutionSeq(CLBLAS_GER, kargs, numCommandQueues, commandQueues,
+                             numEventsInWaitList, eventWaitList, events, &steps);
+    if (status == CL_SUCCESS) {
+        status = executeSolutionSeq(&steps);
+    }
+    freeSolutionSeq(&steps);
+
+    return (clblasStatus)status;
+}
+
+
+
+
+
+/* Float GER wrapper: zeroes a CLBlasKargs, stores dtype/alpha, and calls doGer() with doConj = 0. */
+clblasStatus
+clblasSger(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    const int doConj = 0;
+
+#ifdef DEBUG_GER
+    printf("\nSGER Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argFloat = alpha;
+    args.dtype = TYPE_FLOAT;
+
+    return doGer(&args, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Double GER wrapper: zeroes a CLBlasKargs, stores dtype/alpha, and calls doGer() with doConj = 0. */
+clblasStatus
+clblasDger(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    const int doConj = 0;
+
+#ifdef DEBUG_GER
+    printf("\nDGER Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argDouble = alpha;
+    args.dtype = TYPE_DOUBLE;
+
+    return doGer(&args, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-float GERU wrapper: stores dtype/alpha in zeroed kernel args and calls doGer() with doConj = 0. */
+clblasStatus
+clblasCgeru(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    const int doConj = 0;
+
+#ifdef DEBUG_GER
+    printf("\nCGERU Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argFloatComplex = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doGer(&args, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-double GERU wrapper: stores dtype/alpha in zeroed kernel args and calls doGer() with doConj = 0. */
+clblasStatus
+clblasZgeru(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    const int doConj = 0;
+
+#ifdef DEBUG_GER
+    printf("\nZGERU Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doGer(&args, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-float GERC wrapper: stores dtype/alpha in zeroed kernel args and calls doGer() with doConj = 1. */
+clblasStatus
+clblasCgerc(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    const int doConj = 1;
+
+#ifdef DEBUG_GER
+    printf("\nCGERC Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argFloatComplex = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doGer(&args, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-double GERC wrapper: stores dtype/alpha in zeroed kernel args and calls doGer() with doConj = 1. */
+clblasStatus
+clblasZgerc(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    const int doConj = 1;
+
+#ifdef DEBUG_GER
+    printf("\nZGERC Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doGer(&args, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+
diff --git a/src/library/blas/xhemm.c b/src/library/blas/xhemm.c
new file mode 100644
index 0000000..2c6efe1
--- /dev/null
+++ b/src/library/blas/xhemm.c
@@ -0,0 +1,114 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+extern clblasStatus    /* prototype only: xhemm.c itself contains no definition of doSymm */
+doSymm( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasSide side,    /* note: uplo comes before side here */
+        size_t M, size_t N,
+        const cl_mem A, size_t offa, size_t lda,
+        const cl_mem B, size_t offb, size_t ldb,
+        cl_mem C, size_t offc, size_t ldc,
+        cl_uint numCommandQueues, cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+        cl_event *events,
+        BlasFunctionID symm_or_hemm);    /* both wrappers in this file pass CLBLAS_HEMM */
+
+
+/* Complex-float HEMM wrapper: stores dtype/alpha/beta in zeroed kernel args and calls doSymm() tagged CLBLAS_HEMM. */
+clblasStatus
+clblasChemm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_HEMM
+    printf("Chemm called\n");
+#endif
+    memset(&args, 0, sizeof(args));
+    args.beta.argFloatComplex = beta;
+    args.alpha.argFloatComplex = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+    return doSymm(&args, order, uplo, side, M, N, A, offa, lda, B, offb, ldb, C, offc, ldc,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, CLBLAS_HEMM);
+}
+
+/* Complex-double HEMM wrapper: stores dtype/alpha/beta in zeroed kernel args and calls doSymm() tagged CLBLAS_HEMM. */
+clblasStatus
+clblasZhemm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_HEMM
+    printf("Zhemm called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argDoubleComplex = beta;
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+    return doSymm(&args, order, uplo, side, M, N, A, offa, lda, B, offb, ldb, C, offc, ldc,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, CLBLAS_HEMM);
+}
+
diff --git a/src/library/blas/xhemv.c b/src/library/blas/xhemv.c
new file mode 100644
index 0000000..0db6a8f
--- /dev/null
+++ b/src/library/blas/xhemv.c
@@ -0,0 +1,190 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared driver behind clblasChemv() and clblasZhemv(). Validates the arguments,
+ * fills *kargs (dtype, alpha and beta are set by the caller) and runs two chained
+ * CLBLAS_HEMV solution sequences: first with transA = clblasNoTrans and
+ * diag = clblasNonUnit, then with transA = clblasConjTrans and diag = clblasUnit.
+ * The second pass waits on the first pass's event and reports through `events`;
+ * the intermediate event is released here once the second pass has been handled.
+ */
+static clblasStatus
+doHemv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq1, seq2;
+    cl_event first_event = NULL;    /* stays NULL unless the first pass hands back an event */
+    clblasStatus retCode;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
+        return retCode;
+    }
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offA, lda, A_MAT_ERRSET))) {
+        return retCode;
+    }
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
+        return retCode;
+    }
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) {
+        return retCode;
+    }
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    numCommandQueues = 1;           /* the queue count handed on is always one */
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->N = N;
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->offa = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = x;                   /* x rides in the B slot, y in the C slot */
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;
+    kargs->C = y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;
+    kargs->transA = clblasNoTrans;
+    kargs->diag = clblasNonUnit;
+
+    listInitHead(&seq1);
+    err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, &first_event, &seq1);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq1);
+        if (err == CL_SUCCESS) {
+            listInitHead(&seq2);
+            kargs->transA = clblasConjTrans;
+            kargs->diag = clblasUnit;
+            err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues,
+                                  1, &first_event, events, &seq2);
+            if (err == CL_SUCCESS) {
+                err = executeSolutionSeq(&seq2);
+            }
+            freeSolutionSeq(&seq2);
+            if (first_event != NULL) {
+                clReleaseEvent(first_event);    /* drop our reference; callers only ever see `events` */
+            }
+        }
+    }
+    freeSolutionSeq(&seq1);
+    return (clblasStatus)err;
+}
+
+/* Complex-float HEMV wrapper: zeroes a CLBlasKargs, stores dtype/alpha/beta, and forwards to doHemv(). */
+clblasStatus
+clblasChemv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argFloatComplex = beta;
+    args.alpha.argFloatComplex = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doHemv(&args, order, uplo, N, A, offa, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-double HEMV wrapper: zeroes a CLBlasKargs, stores dtype/alpha/beta, and forwards to doHemv(). */
+clblasStatus
+clblasZhemv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argDoubleComplex = beta;
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doHemv(&args, order, uplo, N, A, offa, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xher.c b/src/library/blas/xher.c
new file mode 100644
index 0000000..af36962
--- /dev/null
+++ b/src/library/blas/xher.c
@@ -0,0 +1,244 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+//#define DEBUG_HER
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared driver behind clblasCher(), clblasZher(), clblasChpr() and clblasZhpr().
+ * Validates the arguments, copies them into *kargs (the caller has already set
+ * dtype, alpha and pigFuncID) and runs a CLBLAS_HER solution sequence. For
+ * row-major input the uplo flag stored in *kargs is flipped. Failed buffer
+ * checks are also reported on stdout.
+ */
+clblasStatus
+doher(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead steps;
+    cl_int status;
+    clblasStatus check;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+#ifdef DEBUG_HER
+    printf("doher called\n");
+#endif
+
+    /* Only A and X are checked as buffers; the third buffer slot is passed as 0. */
+    check = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET);
+    if (check) {
+        printf("Invalid mem object..\n");
+        return check;
+    }
+
+    /* A is checked as an N x N matrix, X as a vector of N elements. */
+    check = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET);
+    if (check) {
+        printf("Invalid Size for A\n");
+        return check;
+    }
+    check = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (check) {
+        printf("Invalid Size for X\n");
+        return check;
+    }
+
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return clblasInvalidValue;
+    }
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Row-major callers get the opposite triangle selected. */
+    if (order != clblasRowMajor) {
+        kargs->uplo = uplo;
+    } else if (uplo == clblasUpper) {
+        kargs->uplo = clblasLower;
+    } else {
+        kargs->uplo = clblasUpper;
+    }
+    kargs->order = order;
+    kargs->N = N;
+    kargs->A = A;
+    kargs->offa = offa;
+    kargs->offA = offa;
+    kargs->lda.matrix = lda;
+    kargs->B = X;                /* X rides in the B slot; ldb.vector holds incx */
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;
+
+#ifdef DEBUG_HER
+    printf("Calling makeSolutionSeq : HER\n");
+#endif
+
+    /* The queue count is pinned to one: no multi-GPU / multi-queue support yet. */
+    numCommandQueues = 1;
+
+    listInitHead(&steps);
+    status = makeSolutionSeq(CLBLAS_HER, kargs, numCommandQueues, commandQueues,
+                             numEventsInWaitList, eventWaitList, events, &steps);
+    if (status == CL_SUCCESS) {
+        status = executeSolutionSeq(&steps);
+    }
+    freeSolutionSeq(&steps);
+
+    return (clblasStatus)status;
+}
+
+
+/* Complex-float HER wrapper: alpha is a real scalar (stored in argFloat); tags the call CLBLAS_HER and calls doher(). */
+clblasStatus
+clblasCher(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HER;
+    args.alpha.argFloat = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+#ifdef DEBUG_HER
+    printf("CHER called\n");
+#endif
+    return doher(&args, order, uplo, N, X, offx, incx, A, offa, lda,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-double HER wrapper: alpha is a real scalar (stored in argDouble); tags the call CLBLAS_HER and calls doher(). */
+clblasStatus
+clblasZher(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HER;
+    args.alpha.argDouble = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+#ifdef DEBUG_HER
+    printf("ZHER called\n");
+#endif
+    return doher(&args, order, uplo, N, X, offx, incx, A, offa, lda,
+                 numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-float HPR wrapper: real alpha in argFloat, call tagged CLBLAS_HPR; forwards AP to doher() with lda = 0. */
+clblasStatus
+clblasChpr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HPR;
+    args.alpha.argFloat = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+    return doher(&args, order, uplo, N, X, offx, incx, AP, offa, 0, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-double HPR wrapper: real alpha in argDouble, call tagged CLBLAS_HPR; forwards AP to doher() with lda = 0. */
+clblasStatus
+clblasZhpr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HPR;
+    args.alpha.argDouble = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+    return doher(&args, order, uplo, N, X, offx, incx, AP, offa, 0, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xher2.c b/src/library/blas/xher2.c
new file mode 100644
index 0000000..cb67659
--- /dev/null
+++ b/src/library/blas/xher2.c
@@ -0,0 +1,275 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+//#define DEBUG_HER2
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared driver behind clblasCher2(), clblasZher2(), clblasChpr2() and clblasZhpr2().
+ * Validates the arguments, copies them into *kargs (the caller has already set
+ * dtype, alpha and pigFuncID) and runs a CLBLAS_HER2 solution sequence. Row-major
+ * input is handled by flipping uplo and swapping the roles of X and Y. Failed
+ * buffer checks are also reported on stdout.
+ */
+clblasStatus
+doHer2(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead steps;
+    cl_int status;
+    clblasStatus check;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+#ifdef DEBUG_HER2
+    printf("doHer2 called\n");
+#endif
+
+    /* All three buffers are checked as memory objects. */
+    check = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (check) {
+        printf("Invalid mem object..\n");
+        return check;
+    }
+
+    /* A is checked as an N x N matrix, X and Y as vectors of N elements. */
+    check = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET);
+    if (check) {
+        printf("Invalid Size for A\n");
+        return check;
+    }
+    check = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (check) {
+        printf("Invalid Size for X\n");
+        return check;
+    }
+    check = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET);
+    if (check) {
+        printf("Invalid Size for Y\n");
+        return check;
+    }
+
+    if ((numCommandQueues == 0) || (commandQueue == NULL)) {
+        return clblasInvalidValue;
+    }
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    kargs->order = order;
+    kargs->N = N;
+    kargs->A = A;
+    kargs->offa = offa;
+    kargs->offA = offa;
+    kargs->lda.matrix = lda;
+    if (order != clblasRowMajor) {
+        /* Not row-major: X goes in the B slot, Y in the C slot, uplo as given. */
+        kargs->uplo = uplo;
+        kargs->B = X;
+        kargs->offBX = offx;
+        kargs->ldb.vector = incx;
+        kargs->C = Y;
+        kargs->offCY = offy;
+        kargs->ldc.vector = incy;
+    } else {
+        /* Row-major: swap X and Y between the slots and flip uplo. */
+        kargs->uplo = (uplo == clblasUpper) ? clblasLower : clblasUpper;
+        kargs->B = Y;
+        kargs->offBX = offy;
+        kargs->ldb.vector = incy;
+        kargs->C = X;
+        kargs->offCY = offx;
+        kargs->ldc.vector = incx;
+    }
+
+#ifdef DEBUG_HER2
+    printf("Calling makeSolutionSeq : HER2\n");
+#endif
+
+    /* The queue count is pinned to one: no multi-GPU / multi-queue support yet. */
+    numCommandQueues = 1;
+
+    listInitHead(&steps);
+    status = makeSolutionSeq(CLBLAS_HER2, kargs, numCommandQueues, commandQueue,
+                             numEventsInWaitList, eventWaitList, events, &steps);
+    if (status == CL_SUCCESS) {
+        status = executeSolutionSeq(&steps);
+    }
+    freeSolutionSeq(&steps);
+
+    return (clblasStatus)status;
+}
+
+/* Complex-float HER2 wrapper: stores dtype/alpha, tags the call CLBLAS_HER2, and forwards to doHer2(). */
+clblasStatus
+clblasCher2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HER2;
+    args.alpha.argFloatComplex = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+#ifdef DEBUG_HER2
+    printf("Cher2 called\n");
+#endif
+    return doHer2(&args, order, uplo, N, X, offx, incx, Y, offy, incy, A, offa, lda,
+                  numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-double HER2 wrapper: stores dtype/alpha, tags the call CLBLAS_HER2, and forwards to doHer2(). */
+clblasStatus
+clblasZher2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HER2;
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+#ifdef DEBUG_HER2
+    printf("Zher2 called\n");
+#endif
+    return doHer2(&args, order, uplo, N, X, offx, incx, Y, offy, incy, A, offa, lda,
+                  numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-float HPR2 wrapper: stores dtype/alpha, tags the call CLBLAS_HPR2, and forwards AP to doHer2() with lda = 0. */
+clblasStatus
+clblasChpr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HPR2;
+    args.alpha.argFloatComplex = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+    return doHer2(&args, order, uplo, N, X, offx, incx, Y, offy, incy, AP, offa, 0,
+                  numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/* Complex-double HPR2 wrapper: stores dtype/alpha, tags the call CLBLAS_HPR2, and forwards AP to doHer2() with lda = 0. */
+clblasStatus
+clblasZhpr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HPR2;
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+    return doHer2(&args, order, uplo, N, X, offx, incx, Y, offy, incy, AP, offa, 0,
+                  numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xher2k.c b/src/library/blas/xher2k.c
new file mode 100644
index 0000000..302a648
--- /dev/null
+++ b/src/library/blas/xher2k.c
@@ -0,0 +1,246 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+//#define DEBUG_HER2K
+/* Defined outside this file; doHer2k() below calls it twice with a prepared CLBlasKargs. */
+extern clblasStatus executeGEMM( CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList,
+                                    const cl_event *eventWaitList, cl_event *events);
+/*
+ * Shared HER2K driver. Validates the arguments, maps a row-major request
+ * onto the equivalent column-major problem and issues two executeGEMM()
+ * passes: first with (A, B) and the caller's alpha/beta, then with (B, A),
+ * conj(alpha) and beta forced to 1, the second waiting on the first's event.
+ * Returns the first validation error, else the last executeGEMM() status.
+ */
+clblasStatus
+doHer2k(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus err;
+    clblasUplo fUplo;
+    clblasTranspose fTransA;
+    cl_event firstHerkCall = NULL;  /* links pass 1 to pass 2; released below */
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    if (numCommandQueues == 0 || commandQueues == NULL) {
+        return clblasInvalidValue;
+    }
+    numCommandQueues = 1;   /* the queue count handed on is always 1 */
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    // Validate arguments
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET ))) {
+        return retCode;
+    }
+
+    if (transA == clblasTrans) {    /* plain transpose is not accepted */
+        return clblasInvalidValue;
+    }
+
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET ))) {
+        return retCode;
+    }
+
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET ))) {
+        return retCode;
+    }
+
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET ))) {
+        return retCode;
+    }
+
+    /* Row-major is handled as column-major with uplo and transA flipped. */
+    fUplo = (order == clblasRowMajor) ? ((uplo == clblasLower) ? clblasUpper : clblasLower) : uplo;
+    fTransA = (order == clblasRowMajor) ? ((transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans) : transA;
+    kargs->order = (order == clblasRowMajor) ? clblasColumnMajor : order;
+    kargs->transA = fTransA;
+    kargs->transB = (fTransA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;
+    kargs->uplo = fUplo;
+    kargs->M = N;
+    kargs->N = N;
+    kargs->K = K;
+    kargs->A = A;
+    kargs->offA = offa;
+    kargs->offa = offa;
+    kargs->lda.matrix = lda;
+    kargs->B = B;
+    kargs->offBX = offb;
+    kargs->ldb.matrix = ldb;
+    kargs->C = C;
+    kargs->offCY = offc;
+    kargs->ldc.matrix = ldc;
+    kargs->pigFuncID = CLBLAS_HERK;
+
+    err = executeGEMM(kargs,  numCommandQueues, commandQueues,
+                            numEventsInWaitList, eventWaitList, &firstHerkCall);
+
+    if (err == CL_SUCCESS) {
+        /* Second pass: A and B trade places. */
+        kargs->A = B;
+        kargs->offA = offb;
+        kargs->offa = offb;
+        kargs->lda.matrix = ldb;
+        kargs->B = A;
+        kargs->offBX = offa;
+        kargs->ldb.matrix = lda;
+
+        /* Pass 2 uses conj(alpha) and beta = 1 + 0i. */
+        if (kargs->dtype == TYPE_COMPLEX_FLOAT) {
+            CIMAG( kargs->alpha.argFloatComplex ) *= -1.0;
+            CREAL( kargs->beta.argFloatComplex ) = 1.0;
+            CIMAG( kargs->beta.argFloatComplex ) = 0.0;
+        } else {
+            CIMAG( kargs->alpha.argDoubleComplex ) *= -1.0;
+            CREAL( kargs->beta.argDoubleComplex ) = 1.0;
+            CIMAG( kargs->beta.argDoubleComplex ) = 0.0;
+        }
+        err = executeGEMM(kargs,  numCommandQueues, commandQueues, 1, &firstHerkCall, events);
+    }
+
+    /* The intermediate event is private to this routine; drop our reference. */
+    if (firstHerkCall != NULL) {
+        clReleaseEvent(firstHerkCall);
+    }
+    return err;
+}
+/* CHER2K: packs alpha and the real beta into a zeroed CLBlasKargs (complex-float dtype) and calls doHer2k(). */
+clblasStatus
+clblasCher2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    FloatComplex fBeta;
+
+    memset(&kargs, 0, sizeof(kargs));
+    /* beta arrives as a real scalar; store it as a complex value with zero imaginary part. */
+    CREAL(fBeta)  = beta;
+    CIMAG(fBeta)  = 0.0f;
+
+    kargs.alpha.argFloatComplex = alpha;
+    kargs.beta.argFloatComplex = fBeta;
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+    /* Row-major requests are forwarded with alpha conjugated (imaginary part negated). */
+    if( order == clblasRowMajor )
+    {
+        CIMAG( kargs.alpha.argFloatComplex ) *= -1.0;
+    }
+
+    return doHer2k(&kargs, order, uplo, trans, N, K, A, offa, lda, B, offb, ldb,
+                    C, offc, ldc, numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+}
+/* ZHER2K: packs alpha and the real beta into a zeroed CLBlasKargs (complex-double dtype) and calls doHer2k(). */
+clblasStatus
+clblasZher2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    DoubleComplex fBeta;
+
+    memset(&kargs, 0, sizeof(kargs));
+    /* beta arrives as a real scalar; store it as a complex value with zero imaginary part. */
+    CREAL(fBeta)  = beta;
+    CIMAG(fBeta)  = 0.0f;
+
+    kargs.alpha.argDoubleComplex = alpha;
+    kargs.beta.argDoubleComplex = fBeta;
+
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+    /* Row-major requests are forwarded with alpha conjugated (imaginary part negated). */
+    if( order == clblasRowMajor )
+    {
+        CIMAG( kargs.alpha.argDoubleComplex ) *= -1.0;
+    }
+
+    return doHer2k(&kargs, order, uplo, trans, N, K, A, offa, lda, B, offb, ldb,
+                    C, offc, ldc, numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xherk.c b/src/library/blas/xherk.c
new file mode 100644
index 0000000..18d1fb4
--- /dev/null
+++ b/src/library/blas/xherk.c
@@ -0,0 +1,211 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+/* Defined outside this file; doHerk() below hands it a prepared CLBlasKargs. */
+extern clblasStatus executeGEMM( CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList,
+                                    const cl_event *eventWaitList, cl_event *events);
+/*
+ * Shared HERK driver. Validates the arguments, maps a row-major request
+ * onto the equivalent column-major problem and hands the update to
+ * executeGEMM() with B aliased to A (same buffer, offset and leading
+ * dimension) and pigFuncID tagged as CLBLAS_HERK.
+ *
+ * kargs must already carry dtype, alpha and beta; every other field used
+ * here is filled in by this routine.
+ *
+ * Returns clblasNotInitialized, clblasInvalidValue or
+ * clblasInvalidEventWaitList for the checks made locally, any non-zero code
+ * reported by checkMemObjects()/checkMatrixSizes(), and otherwise the
+ * status of executeGEMM().
+ */
+clblasStatus
+doHerk(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus err;
+    clblasUplo fUplo;
+    clblasTranspose fTransA;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    if (numCommandQueues == 0 || commandQueues == NULL) {
+        return clblasInvalidValue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    // Validate arguments
+    if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET ))) {
+        return retCode;
+    }
+
+    if (transA == clblasTrans) {
+        return clblasInvalidValue;
+    }
+
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET ))) {
+        return retCode;
+    }
+
+    // C is always checked as an untransposed N x N matrix.
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offC, ldc, C_MAT_ERRSET ))) {
+        return retCode;
+    }
+
+    // Row-major is handled as column-major with uplo and transA flipped.
+    fUplo = (order == clblasRowMajor) ? ((uplo == clblasLower) ? clblasUpper : clblasLower) : uplo;
+    fTransA = (order == clblasRowMajor) ? ((transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans) : transA;
+    kargs->order = (order == clblasRowMajor) ? clblasColumnMajor : order;
+    kargs->transA = fTransA;
+    kargs->transB = (fTransA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;
+    kargs->uplo = fUplo;
+    kargs->M = N;
+    kargs->N = N;
+    kargs->K = K;
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->offa = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = A;
+    kargs->offBX = offA;
+    kargs->ldb.matrix = lda;
+    kargs->C = C;
+    kargs->offCY = offC;
+    kargs->ldc.matrix = ldc;
+    kargs->pigFuncID = CLBLAS_HERK;
+
+    #ifdef DEBUG_HERK
+        printf("doHerk called\n");
+    #endif
+
+    // Only a queue count of 1 is passed on, whatever the caller supplied.
+    numCommandQueues = 1;
+    // Call GEMM to handle HERK.
+    err = executeGEMM(kargs,  numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+
+    return err;
+}
+/* CHERK: widens the real alpha and beta to complex-float values in a zeroed CLBlasKargs and calls doHerk(). */
+clblasStatus
+clblasCherk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    float beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    FloatComplex fAlpha, fBeta;
+
+    memset(&kargs, 0, sizeof(kargs));
+    /* Both scalars arrive as reals; store them as complex values with zero imaginary part. */
+    CREAL(fAlpha) = alpha;
+    CIMAG(fAlpha) = 0.0f;
+    CREAL(fBeta)  = beta;
+    CIMAG(fBeta)  = 0.0f;
+
+    kargs.alpha.argFloatComplex = fAlpha;
+    kargs.beta.argFloatComplex = fBeta;
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doHerk(&kargs, order, uplo, transA, N, K, A, offA, lda,
+                    C, offC, ldc, numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+}
+/* ZHERK: widens the real alpha and beta to complex-double values in a zeroed CLBlasKargs and calls doHerk(). */
+clblasStatus
+clblasZherk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    double beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    DoubleComplex fAlpha, fBeta;
+
+    memset(&kargs, 0, sizeof(kargs));
+    /* Both scalars arrive as reals; store them as complex values with zero imaginary part. */
+    CREAL(fAlpha) = alpha;
+    CIMAG(fAlpha) = 0.0f;
+    CREAL(fBeta)  = beta;
+    CIMAG(fBeta)  = 0.0f;
+
+    kargs.alpha.argDoubleComplex = fAlpha;
+    kargs.beta.argDoubleComplex = fBeta;
+
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doHerk(&kargs, order, uplo, transA, N, K, A, offA, lda,
+                    C, offC, ldc, numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xhpmv.c b/src/library/blas/xhpmv.c
new file mode 100644
index 0000000..991819c
--- /dev/null
+++ b/src/library/blas/xhpmv.c
@@ -0,0 +1,185 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+/*
+ * Shared HPMV driver. Validates the arguments, fills kargs for a packed
+ * matrix (lda = 0) and runs two CLBLAS_TRMV sequences tagged CLBLAS_HPMV:
+ * NoTrans/NonUnit first, then ConjTrans/Unit waiting on the first's event.
+ * Returns the first validation error, else the last sequence status.
+ */
+static clblasStatus
+doHpmv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq1, seq2;
+    cl_event first_event = NULL;    /* links pass 1 to pass 2; released below */
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments */
+    if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
+        return retCode;
+    }
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP,
+                         offa, 0, A_MAT_ERRSET ))) {
+        return retCode;
+    }
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET ))) {
+        return retCode;
+    }
+    if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET ))) {
+        return retCode;
+    }
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    numCommandQueues = 1;
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->N = N;
+    kargs->A = AP;
+    kargs->offA = offa;
+    kargs->offa = offa;
+    kargs->lda.matrix = 0;      // Set lda as zero for packed matrices
+    kargs->B = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;
+    kargs->C = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;
+    kargs->transA = clblasNoTrans;
+    kargs->diag = clblasNonUnit;
+    kargs->pigFuncID = CLBLAS_HPMV;
+    listInitHead(&seq1);
+    err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues,
+                            numEventsInWaitList, eventWaitList, &first_event, &seq1);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq1);
+        if (err == CL_SUCCESS) {
+            listInitHead(&seq2);
+            kargs->transA = clblasConjTrans;
+            kargs->diag   = clblasUnit;
+            err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues,
+                                        1, &first_event, events, &seq2);
+            if (err == CL_SUCCESS) {
+                err = executeSolutionSeq(&seq2);
+            }
+            freeSolutionSeq(&seq2);
+        }
+    }
+    if (first_event != NULL) {
+        clReleaseEvent(first_event);    /* intermediate event is private here */
+    }
+    freeSolutionSeq(&seq1);
+    return (clblasStatus)err;
+}
+/* CHPMV: stores alpha and beta in a zeroed CLBlasKargs (complex-float dtype) and calls doHpmv(). */
+clblasStatus
+clblasChpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem AP,
+	size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+    kargs.alpha.argFloatComplex = alpha;
+    kargs.beta.argFloatComplex = beta;
+    /* All validation and the remaining kargs fields are handled inside doHpmv(). */
+    return doHpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx,
+                  Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+/* ZHPMV: stores alpha and beta in a zeroed CLBlasKargs (complex-double dtype) and calls doHpmv(). */
+clblasStatus
+clblasZhpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+    kargs.alpha.argDoubleComplex = alpha;
+    kargs.beta.argDoubleComplex = beta;
+    /* All validation and the remaining kargs fields are handled inside doHpmv(). */
+    return doHpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx,
+                  Y, offy, incy, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
\ No newline at end of file
diff --git a/src/library/blas/xnrm2.c b/src/library/blas/xnrm2.c
new file mode 100644
index 0000000..833d855
--- /dev/null
+++ b/src/library/blas/xnrm2.c
@@ -0,0 +1,361 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+//#define USE_HYPOT
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+/*
+ * NRM2 with REDUCE_BY_HYPOT: runs the CLBLAS_NRM2 sequence, then a
+ * CLBLAS_REDUCTION_EPILOGUE sequence that waits on it and takes as N the
+ * number of work-groups recorded by the first step of the first sequence.
+ */
+clblasStatus
+doNrm2_hypot(CLBlasKargs *kargs,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq, seq2;
+    cl_event firstNrmCall = NULL;   // links the two sequences; released below
+    CLBlasKargs redctnArgs;
+    ListNode *listNodePtr;
+    SolutionStep *step;
+
+    // Scratch buffer will be of %PTYPE
+    // Result of complex nrm2 is scalar
+    DataType nrmType = (kargs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT :
+                       ((kargs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE : (kargs->dtype));
+
+    kargs->redctnType = REDUCE_BY_HYPOT;
+    memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs));
+    redctnArgs.dtype = nrmType;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_NRM2, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, &firstNrmCall, &seq);
+    if (err == CL_SUCCESS) {
+        /** The second kernel call needs to know the number of work-groups used
+            in the first kernel call. This number of work-groups is calculated here
+            and passed as N to second reduction kernel
+        **/
+        err = executeSolutionSeq(&seq);
+        if (err == CL_SUCCESS) {
+            listNodePtr = listNodeFirst(&seq);        // Get the node
+            step = container_of(listNodePtr, node, SolutionStep);
+            redctnArgs.N = step->pgran.numWGSpawned[0];     // 1D block was used
+            listInitHead(&seq2);
+            err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues,
+                      1, &firstNrmCall, events, &seq2);
+            if (err == CL_SUCCESS) {
+                err = executeSolutionSeq(&seq2);
+            }
+            freeSolutionSeq(&seq2);
+        }
+    }
+
+    if (firstNrmCall != NULL) {
+        clReleaseEvent(firstNrmCall);   // intermediate event is private here
+    }
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+/*
+ * NRM2 with REDUCE_BY_SSQ: runs the CLBLAS_NRM2 sequence, then a
+ * CLBLAS_REDUCTION_EPILOGUE sequence that waits on it and takes as N the
+ * number of work-groups recorded by the first step of the first sequence.
+ */
+clblasStatus
+doNrm2_ssq(CLBlasKargs *kargs,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq, seq2;
+    cl_event firstNrmCall = NULL;   // links the two sequences; released below
+    CLBlasKargs redctnArgs;
+    ListNode *listNodePtr;
+    SolutionStep *step;
+
+    // Scratch buffer will be of %PTYPE
+    // Result of complex nrm2 is scalar
+    DataType nrmType = (kargs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT :
+                       ((kargs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE : (kargs->dtype));
+
+    kargs->redctnType = REDUCE_BY_SSQ;
+    memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs));
+    redctnArgs.dtype = nrmType;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_NRM2, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, &firstNrmCall, &seq);
+    if (err == CL_SUCCESS) {
+        /** The second kernel call needs to know the number of work-groups used
+            in the first kernel call. This number of work-groups is calculated here
+            and passed as N to second reduction kernel
+        **/
+        err = executeSolutionSeq(&seq);
+        if (err == CL_SUCCESS) {
+            listNodePtr = listNodeFirst(&seq);        // Get the node
+            step = container_of(listNodePtr, node, SolutionStep);
+            redctnArgs.N = step->pgran.numWGSpawned[0];     // 1D block was used
+            listInitHead(&seq2);
+            err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues,
+                      1, &firstNrmCall, events, &seq2);
+            if (err == CL_SUCCESS) {
+                err = executeSolutionSeq(&seq2);
+            }
+            freeSolutionSeq(&seq2);
+        }
+    }
+
+    if (firstNrmCall != NULL) {
+        clReleaseEvent(firstNrmCall);   // intermediate event is private here
+    }
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+/* Common NRM2 driver: validates the buffers, queues and wait list, loads kargs (A = NRM2, B = X,
+ * D = scratchBuff) and dispatches to doNrm2_hypot() when useHypot is true, else to doNrm2_ssq(). */
+clblasStatus
+doNrm2(
+    bool useHypot,
+	CLBlasKargs *kargs,
+	size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus retCode = clblasSuccess;
+    /* The NRM2 output is size-checked with the real counterpart of a complex dtype. */
+    DataType nrmType = (kargs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT :
+                       ((kargs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE : (kargs->dtype));
+
+	if (!clblasInitialized) {
+        return clblasNotInitialized;
+	}
+
+	/* Validate arguments */
+	/* NOTE(review): each failed check below also prints a message to stdout via printf(). */
+	retCode = checkMemObjects(X, NRM2, scratchBuff, true, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET );
+	if (retCode) {
+		printf("Invalid mem object..\n");
+        return retCode;
+	}
+
+	// Check whether enough memory was allocated
+    retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET );
+	if (retCode) {
+		printf("Invalid Size for X\n");
+        return retCode;
+	}
+	// Minimum size of scratchBuff is 2*N
+	retCode = checkVectorSizes(kargs->dtype, (2*N), scratchBuff, 0, 1, X_VEC_ERRSET );
+    if (retCode) {
+		printf("Insufficient ScratchBuff\n");
+        return retCode;
+	}
+	/* NRM2 must hold one element of nrmType at offNRM2. */
+    retCode = checkVectorSizes(nrmType, 1, NRM2, offNRM2, 1, Y_VEC_ERRSET );
+	if (retCode) {
+		printf("Invalid Size for NRM2\n");
+        return retCode;
+	}
+	///////////////////////////////////////////////////////////////
+
+	if ((commandQueues == NULL) || (numCommandQueues == 0))
+	{
+		return clblasInvalidValue;
+	}
+
+	/* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */
+	numCommandQueues = 1;
+	if (commandQueues[0] == NULL)
+	{
+		return clblasInvalidCommandQueue;
+	}
+
+	if ((numEventsInWaitList !=0) && (eventWaitList == NULL))
+	{
+		return clblasInvalidEventWaitList;
+	}
+	/* Load the kernel arguments: A is the result buffer, B the input vector, D the scratch buffer. */
+	kargs->N = N;
+	kargs->A = NRM2;
+    kargs->offA = offNRM2;
+    kargs->offa = offNRM2;
+	kargs->B = X;
+	kargs->offBX = offx;
+	kargs->ldb.vector = incx;
+    if(incx < 1) {              // According to netlib, if incx<1, NRM2 will be zero
+        kargs->N = 1;           // Making it launch only 1 work-group
+    }
+    kargs->D = scratchBuff;
+
+    if(useHypot)
+    {
+        return doNrm2_hypot(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+    else
+    {
+        return doNrm2_ssq(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+}
+/* SNRM2: zeroes a CLBlasKargs, sets dtype to TYPE_FLOAT and calls doNrm2(). */
+clblasStatus
+clblasSnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    bool useHypot;
+    CLBlasKargs kargs;
+    /* The reduction variant is fixed at compile time by the USE_HYPOT macro. */
+    #ifdef USE_HYPOT
+        useHypot = true;
+    #else
+        useHypot = false;
+    #endif
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_FLOAT;
+
+    return doNrm2(useHypot, &kargs, N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* DNRM2: zeroes a CLBlasKargs, sets dtype to TYPE_DOUBLE and calls doNrm2(). */
+clblasStatus
+clblasDnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    bool useHypot;
+    CLBlasKargs kargs;
+    /* The reduction variant is fixed at compile time by the USE_HYPOT macro. */
+    #ifdef USE_HYPOT
+        useHypot = true;
+    #else
+        useHypot = false;
+    #endif
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_DOUBLE;
+
+    return doNrm2(useHypot, &kargs, N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* SCNRM2: zeroes a CLBlasKargs, sets dtype to TYPE_COMPLEX_FLOAT and calls doNrm2(). */
+clblasStatus
+clblasScnrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    bool useHypot;
+    CLBlasKargs kargs;
+    /* The reduction variant is fixed at compile time by the USE_HYPOT macro. */
+    #ifdef USE_HYPOT
+        useHypot = true;
+    #else
+        useHypot = false;
+    #endif
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doNrm2(useHypot, &kargs, N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+/* DZNRM2: zeroes a CLBlasKargs, sets dtype to TYPE_COMPLEX_DOUBLE and calls doNrm2(). */
+clblasStatus
+clblasDznrm2(
+    size_t N,
+    cl_mem NRM2,
+    size_t offNRM2,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    bool useHypot;
+    CLBlasKargs kargs;
+    /* The reduction variant is fixed at compile time by the USE_HYPOT macro. */
+    #ifdef USE_HYPOT
+        useHypot = true;
+    #else
+        useHypot = false;
+    #endif
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doNrm2(useHypot, &kargs, N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xrot.c b/src/library/blas/xrot.c
new file mode 100644
index 0000000..7fd981b
--- /dev/null
+++ b/src/library/blas/xrot.c
@@ -0,0 +1,224 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+
+/*
+ * Shared implementation behind the clblas*rot() entry points.
+ *
+ * Validates the X and Y buffers and the queue/event arguments, records the
+ * vector description in *kargs, then builds and executes a solution
+ * sequence. The caller sets kargs->dtype (and the alpha/beta fields) before
+ * calling; this function only writes the fields assigned below.
+ *
+ * Returns:
+ *   clblasNotInitialized        - the library has not been initialized
+ *   status of checkMemObjects() / checkVectorSizes() when they fail
+ *   clblasInvalidValue          - no command queues were supplied
+ *   clblasInvalidCommandQueue   - commandQueues[0] is NULL
+ *   clblasInvalidEventWaitList  - non-zero wait count with a NULL list
+ *   otherwise the result of makeSolutionSeq() / executeSolutionSeq()
+ */
+clblasStatus
+doRot(
+    CLBlasKargs *kargs,
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead seq;
+    cl_int err;
+    clblasStatus status;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate the memory objects. */
+    status = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid mem object..\n");
+        return status;
+    }
+
+    /* Check whether enough memory was allocated for each vector. */
+    status = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for X\n");
+        return status;
+    }
+    status = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for Y\n");
+        return status;
+    }
+
+    /* Validate the queue and event arguments. */
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return clblasInvalidValue;
+    }
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Only one queue is used for now: there is no multi-GPU support. */
+    numCommandQueues = 1;
+
+    kargs->N = N;
+    kargs->A = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;       /* ldb carries incx */
+    kargs->B = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;       /* ldc carries incy */
+    kargs->pigFuncID = CLBLAS_ROT;
+
+    /*
+     * ROT reuses the ROTM kernel (the two are similar), so the sequence is
+     * requested for CLBLAS_ROTM while pigFuncID records CLBLAS_ROT.
+     */
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_ROTM, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+    freeSolutionSeq(&seq);
+
+    return (clblasStatus)err;
+}
+
+
+
+
+
+/*
+ * ROT entry point for TYPE_FLOAT vectors.
+ *
+ * C and S travel to doRot() in the alpha and beta slots of a freshly zeroed
+ * CLBlasKargs; all other arguments are forwarded unchanged.
+ *
+ * Returns whatever doRot() returns.
+ */
+clblasStatus
+clblasSrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_float C,
+    cl_float S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotArgs;
+
+    memset(&rotArgs, 0, sizeof(rotArgs));
+    rotArgs.alpha.argFloat = C;
+    rotArgs.beta.argFloat = S;
+    rotArgs.dtype = TYPE_FLOAT;
+
+    return doRot(&rotArgs, N, X, offx, incx, Y, offy, incy,
+                 numCommandQueues, commandQueues,
+                 numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ROT entry point for TYPE_DOUBLE vectors.
+ *
+ * C and S travel to doRot() in the alpha and beta slots of a freshly zeroed
+ * CLBlasKargs; all other arguments are forwarded unchanged.
+ *
+ * Returns whatever doRot() returns.
+ */
+clblasStatus
+clblasDrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_double C,
+    cl_double S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotArgs;
+
+    memset(&rotArgs, 0, sizeof(rotArgs));
+    rotArgs.alpha.argDouble = C;
+    rotArgs.beta.argDouble = S;
+    rotArgs.dtype = TYPE_DOUBLE;
+
+    return doRot(&rotArgs, N, X, offx, incx, Y, offy, incy,
+                 numCommandQueues, commandQueues,
+                 numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ROT entry point for TYPE_COMPLEX_FLOAT vectors with real (float) C and S.
+ *
+ * C and S travel to doRot() in the argFloat members of the alpha and beta
+ * slots of a freshly zeroed CLBlasKargs; all other arguments are forwarded
+ * unchanged.
+ *
+ * Returns whatever doRot() returns.
+ */
+clblasStatus
+clblasCsrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    float C,
+    float S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotArgs;
+
+    memset(&rotArgs, 0, sizeof(rotArgs));
+    rotArgs.alpha.argFloat = C;
+    rotArgs.beta.argFloat = S;
+    rotArgs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doRot(&rotArgs, N, X, offx, incx, Y, offy, incy,
+                 numCommandQueues, commandQueues,
+                 numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ROT entry point for TYPE_COMPLEX_DOUBLE vectors with real (double) C and S.
+ *
+ * C and S travel to doRot() in the argDouble members of the alpha and beta
+ * slots of a freshly zeroed CLBlasKargs; all other arguments are forwarded
+ * unchanged.
+ *
+ * Returns whatever doRot() returns.
+ */
+clblasStatus
+clblasZdrot(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    double C,
+    double S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotArgs;
+
+    memset(&rotArgs, 0, sizeof(rotArgs));
+    rotArgs.alpha.argDouble = C;
+    rotArgs.beta.argDouble = S;
+    rotArgs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doRot(&rotArgs, N, X, offx, incx, Y, offy, incy,
+                 numCommandQueues, commandQueues,
+                 numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xrotg.c b/src/library/blas/xrotg.c
new file mode 100644
index 0000000..fb9c8e1
--- /dev/null
+++ b/src/library/blas/xrotg.c
@@ -0,0 +1,234 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/***********************************************************************
+**  Copyright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
+***********************************************************************/
+
+//#define DEBUG_ROTG
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+
+/*
+ * Shared implementation behind the clblas*rotg() entry points.
+ *
+ * Each of A, B, C and S is validated as a memory object and size-checked
+ * with checkVectorSizes() for a single element (N = 1, increment 1).
+ * A, B and S are checked against kargs->dtype; C is checked against the
+ * corresponding real type, because C is real even for complex data.
+ * The buffers and offsets are then stored in *kargs and the CLBLAS_ROTG
+ * solution sequence is built and executed.
+ *
+ * Returns:
+ *   clblasNotInitialized        - the library has not been initialized
+ *   status of checkMemObjects() / checkVectorSizes() when they fail
+ *   clblasInvalidValue          - no command queues were supplied
+ *   clblasInvalidCommandQueue   - commandQueues[0] is NULL
+ *   clblasInvalidEventWaitList  - non-zero wait count with a NULL list
+ *   otherwise the result of makeSolutionSeq() / executeSolutionSeq()
+ */
+clblasStatus
+doRotg(
+    CLBlasKargs *kargs,
+    cl_mem A,
+    size_t offA,
+    cl_mem B,
+    size_t offB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead seq;
+    cl_int err;
+    clblasStatus status;
+    /* C is of real type even for complex numbers. */
+    DataType realType = kargs->dtype;
+
+    if (realType == TYPE_COMPLEX_FLOAT) {
+        realType = TYPE_FLOAT;
+    }
+    else if (realType == TYPE_COMPLEX_DOUBLE) {
+        realType = TYPE_DOUBLE;
+    }
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate the memory objects: first A and B, then C and S. */
+    status = checkMemObjects(A, B, A, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid mem object..\n");
+        return status;
+    }
+    status = checkMemObjects(C, S, C, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid mem object..\n");
+        return status;
+    }
+
+    /* Check whether enough memory was allocated for each scalar buffer. */
+    status = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for A\n");
+        return status;
+    }
+    status = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for B\n");
+        return status;
+    }
+    status = checkVectorSizes(realType, 1, C, offC, 1, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for C\n");
+        return status;
+    }
+    status = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for S\n");
+        return status;
+    }
+
+    /* Validate the queue and event arguments. */
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return clblasInvalidValue;
+    }
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Only one queue is used for now: there is no multi-GPU support. */
+    numCommandQueues = 1;
+
+    kargs->A = A;
+    kargs->offa = offA;
+    kargs->B = B;
+    kargs->offb = offB;
+    kargs->C = C;
+    kargs->offc = offC;
+    kargs->D = S;
+    kargs->offd = offS;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_ROTG, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+    freeSolutionSeq(&seq);
+
+    return (clblasStatus)err;
+}
+
+
+
+/*
+ * ROTG entry point for TYPE_FLOAT data.
+ *
+ * Hands a zeroed CLBlasKargs tagged with the data type, plus every caller
+ * argument, to doRotg() and returns its status.
+ */
+clblasStatus
+clblasSrotg(
+    cl_mem SA,
+    size_t offSA,
+    cl_mem SB,
+    size_t offSB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotgArgs;
+
+    memset(&rotgArgs, 0, sizeof(rotgArgs));
+    rotgArgs.dtype = TYPE_FLOAT;
+
+    return doRotg(&rotgArgs, SA, offSA, SB, offSB, C, offC, S, offS,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ROTG entry point for TYPE_DOUBLE data.
+ *
+ * Hands a zeroed CLBlasKargs tagged with the data type, plus every caller
+ * argument, to doRotg() and returns its status.
+ */
+clblasStatus
+clblasDrotg(
+    cl_mem DA,
+    size_t offDA,
+    cl_mem DB,
+    size_t offDB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotgArgs;
+
+    memset(&rotgArgs, 0, sizeof(rotgArgs));
+    rotgArgs.dtype = TYPE_DOUBLE;
+
+    return doRotg(&rotgArgs, DA, offDA, DB, offDB, C, offC, S, offS,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ROTG entry point for TYPE_COMPLEX_FLOAT data.
+ *
+ * Hands a zeroed CLBlasKargs tagged with the data type, plus every caller
+ * argument, to doRotg() and returns its status.
+ */
+clblasStatus
+clblasCrotg(
+    cl_mem CA,
+    size_t offCA,
+    cl_mem CB,
+    size_t offCB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotgArgs;
+
+    memset(&rotgArgs, 0, sizeof(rotgArgs));
+    rotgArgs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doRotg(&rotgArgs, CA, offCA, CB, offCB, C, offC, S, offS,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ROTG entry point for TYPE_COMPLEX_DOUBLE data.
+ *
+ * Hands a zeroed CLBlasKargs tagged with the data type, plus every caller
+ * argument, to doRotg() and returns its status.
+ */
+clblasStatus
+clblasZrotg(
+    cl_mem CA,
+    size_t offCA,
+    cl_mem CB,
+    size_t offCB,
+    cl_mem C,
+    size_t offC,
+    cl_mem S,
+    size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotgArgs;
+
+    memset(&rotgArgs, 0, sizeof(rotgArgs));
+    rotgArgs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doRotg(&rotgArgs, CA, offCA, CB, offCB, C, offC, S, offS,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xrotm.c b/src/library/blas/xrotm.c
new file mode 100644
index 0000000..fcdfcb0
--- /dev/null
+++ b/src/library/blas/xrotm.c
@@ -0,0 +1,173 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+
+/*
+ * Shared implementation behind the clblas*rotm() entry points.
+ *
+ * Validates X, Y and the PARAM buffer (PARAM must hold at least 5 elements),
+ * checks the queue/event arguments, records the vector description in
+ * *kargs, then builds and executes the CLBLAS_ROTM solution sequence.
+ * The caller sets kargs->dtype before calling.
+ *
+ * Returns:
+ *   clblasNotInitialized        - the library has not been initialized
+ *   status of checkMemObjects() / checkVectorSizes() when they fail
+ *   clblasInvalidValue          - no command queues were supplied
+ *   clblasInvalidCommandQueue   - commandQueues[0] is NULL
+ *   clblasInvalidEventWaitList  - non-zero wait count with a NULL list
+ *   otherwise the result of makeSolutionSeq() / executeSolutionSeq()
+ */
+clblasStatus
+doRotm(
+    CLBlasKargs *kargs,
+    size_t N,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem param,
+    size_t offParam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead seq;
+    cl_int err;
+    clblasStatus status;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate the memory objects. */
+    status = checkMemObjects(X, Y, param, true, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid mem object..\n");
+        return status;
+    }
+
+    /* Check whether enough memory was allocated for each buffer. */
+    status = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for X\n");
+        return status;
+    }
+    status = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for Y\n");
+        return status;
+    }
+    /* PARAM is of minimum length 5. */
+    status = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for PARAM\n");
+        return status;
+    }
+
+    /* Validate the queue and event arguments. */
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return clblasInvalidValue;
+    }
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Only one queue is used for now: there is no multi-GPU support. */
+    numCommandQueues = 1;
+
+    kargs->N = N;
+    kargs->A = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;       /* ldb carries incx */
+    kargs->B = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;       /* ldc carries incy */
+    kargs->D = param;
+    kargs->offd = offParam;
+    kargs->pigFuncID = CLBLAS_ROTM;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_ROTM, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+    freeSolutionSeq(&seq);
+
+    return (clblasStatus)err;
+}
+
+
+
+
+
+/*
+ * ROTM entry point for TYPE_FLOAT vectors.
+ *
+ * Hands a zeroed CLBlasKargs tagged with the data type, plus every caller
+ * argument (SPARAM/offSparam as the parameter buffer), to doRotm() and
+ * returns its status.
+ */
+clblasStatus
+clblasSrotm(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    const cl_mem SPARAM,
+    size_t offSparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotmArgs;
+
+    memset(&rotmArgs, 0, sizeof(rotmArgs));
+    rotmArgs.dtype = TYPE_FLOAT;
+
+    return doRotm(&rotmArgs, N, X, offx, incx, Y, offy, incy, SPARAM, offSparam,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ROTM entry point for TYPE_DOUBLE vectors.
+ *
+ * Hands a zeroed CLBlasKargs tagged with the data type, plus every caller
+ * argument (DPARAM/offDparam as the parameter buffer), to doRotm() and
+ * returns its status.
+ */
+clblasStatus
+clblasDrotm(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    const cl_mem DPARAM,
+    size_t offDparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotmArgs;
+
+    memset(&rotmArgs, 0, sizeof(rotmArgs));
+    rotmArgs.dtype = TYPE_DOUBLE;
+
+    return doRotm(&rotmArgs, N, X, offx, incx, Y, offy, incy, DPARAM, offDparam,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xrotmg.c b/src/library/blas/xrotmg.c
new file mode 100644
index 0000000..b3c2229
--- /dev/null
+++ b/src/library/blas/xrotmg.c
@@ -0,0 +1,189 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/***********************************************************************
+**  Copyright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
+***********************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+
+/*
+ * Shared implementation behind the clblas*rotmg() entry points.
+ *
+ * Validates the D1, D2, X1, Y1 and PARAM buffers, checks the queue/event
+ * arguments, stores the buffers and offsets in *kargs, then builds and
+ * executes the CLBLAS_ROTMG solution sequence. The caller sets kargs->dtype
+ * before calling.
+ *
+ * D1, D2, X1 and Y1 are size-checked for one element each. PARAM is
+ * size-checked for 5 elements: the BLAS ROTMG parameter array has five
+ * entries, and doRotm(), which consumes the same array, enforces the same
+ * minimum length.
+ *
+ * Returns:
+ *   clblasNotInitialized        - the library has not been initialized
+ *   status of checkMemObjects() / checkVectorSizes() when they fail
+ *   clblasInvalidValue          - no command queues were supplied
+ *   clblasInvalidCommandQueue   - commandQueues[0] is NULL
+ *   clblasInvalidEventWaitList  - non-zero wait count with a NULL list
+ *   otherwise the result of makeSolutionSeq() / executeSolutionSeq()
+ */
+clblasStatus
+doRotmg(
+    CLBlasKargs *kargs,
+    cl_mem D1,
+    size_t offD1,
+    cl_mem D2,
+    size_t offD2,
+    cl_mem X1,
+    size_t offX1,
+    cl_mem Y1,
+    size_t offY1,
+    cl_mem param,
+    size_t offParam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate the memory objects: first D1, D2 and X1, then Y1 and PARAM. */
+    retCode = checkMemObjects(D1, D2, X1, true, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (retCode) {
+        printf("Invalid mem object..\n");
+        return retCode;
+    }
+    retCode = checkMemObjects(Y1, param, Y1, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (retCode) {
+        printf("Invalid mem object..\n");
+        return retCode;
+    }
+
+    /* Check whether enough memory was allocated for each buffer. */
+    retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET);
+    if (retCode) {
+        printf("Invalid Size for D1\n");
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET);
+    if (retCode) {
+        printf("Invalid Size for D2\n");
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET);
+    if (retCode) {
+        printf("Invalid Size for X1\n");
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET);
+    if (retCode) {
+        printf("Invalid Size for Y1\n");
+        return retCode;
+    }
+    /* PARAM is of minimum length 5 (previously only 1 element was required). */
+    retCode = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET);
+    if (retCode) {
+        printf("Invalid Size for PARAM\n");
+        return retCode;
+    }
+
+    /* Validate the queue and event arguments. */
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+
+    /* numCommandQueues is hardcoded to 1 for now: no multi-GPU support. */
+    numCommandQueues = 1;
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    kargs->A = D1;
+    kargs->B = D2;
+    kargs->C = X1;
+    kargs->D = Y1;
+    kargs->E = param;
+    kargs->offa = offD1;
+    kargs->offb = offD2;
+    kargs->offc = offX1;
+    kargs->offd = offY1;
+    kargs->offe = offParam;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_ROTMG, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    freeSolutionSeq(&seq);
+
+    return (clblasStatus)err;
+}
+
+
+
+/*
+ * ROTMG entry point for TYPE_FLOAT data.
+ *
+ * Hands a zeroed CLBlasKargs tagged with the data type, plus every caller
+ * argument, to doRotmg() and returns its status.
+ */
+clblasStatus
+clblasSrotmg(
+    cl_mem SD1,
+    size_t offSD1,
+    cl_mem SD2,
+    size_t offSD2,
+    cl_mem SX1,
+    size_t offSX1,
+    const cl_mem SY1,
+    size_t offSY1,
+    cl_mem SPARAM,
+    size_t offSparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotmgArgs;
+
+    memset(&rotmgArgs, 0, sizeof(rotmgArgs));
+    rotmgArgs.dtype = TYPE_FLOAT;
+
+    return doRotmg(&rotmgArgs, SD1, offSD1, SD2, offSD2, SX1, offSX1,
+                   SY1, offSY1, SPARAM, offSparam,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ROTMG entry point for TYPE_DOUBLE data.
+ *
+ * Hands a zeroed CLBlasKargs tagged with the data type, plus every caller
+ * argument, to doRotmg() and returns its status.
+ */
+clblasStatus
+clblasDrotmg(
+    cl_mem DD1,
+    size_t offDD1,
+    cl_mem DD2,
+    size_t offDD2,
+    cl_mem DX1,
+    size_t offDX1,
+    const cl_mem DY1,
+    size_t offDY1,
+    cl_mem DPARAM,
+    size_t offDparam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs rotmgArgs;
+
+    memset(&rotmgArgs, 0, sizeof(rotmgArgs));
+    rotmgArgs.dtype = TYPE_DOUBLE;
+
+    return doRotmg(&rotmgArgs, DD1, offDD1, DD2, offDD2, DX1, offDX1,
+                   DY1, offDY1, DPARAM, offDparam,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xscal.c b/src/library/blas/xscal.c
new file mode 100644
index 0000000..6722383
--- /dev/null
+++ b/src/library/blas/xscal.c
@@ -0,0 +1,277 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#define DEBUG_SCAL
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+
+/*
+ * Shared implementation behind the clblas*scal() entry points.
+ *
+ * Validates the X buffer and the queue/event arguments and records the
+ * vector description in *kargs. A negative incx is not treated as an error:
+ * once validation has passed the function returns clblasSuccess without
+ * building a solution sequence. Otherwise the CLBLAS_SCAL sequence is built
+ * and executed. The caller sets kargs->dtype and kargs->alpha beforehand.
+ *
+ * Returns:
+ *   clblasNotInitialized        - the library has not been initialized
+ *   status of checkMemObjects() / checkVectorSizes() when they fail
+ *   clblasInvalidValue          - no command queues were supplied
+ *   clblasInvalidCommandQueue   - commandQueues[0] is NULL
+ *   clblasInvalidEventWaitList  - non-zero wait count with a NULL list
+ *   clblasSuccess               - incx is negative (nothing is enqueued)
+ *   otherwise the result of makeSolutionSeq() / executeSolutionSeq()
+ */
+clblasStatus
+doScal(
+    CLBlasKargs *kargs,
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead seq;
+    cl_int err;
+    clblasStatus status;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate the memory object. */
+    status = checkMemObjects(X, X, X, false, X_VEC_ERRSET, X_VEC_ERRSET, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid mem object..\n");
+        return status;
+    }
+
+    /* Check whether enough memory was allocated for the vector. */
+    status = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (status != 0) {
+        printf("Invalid Size for X\n");
+        return status;
+    }
+
+    /* Validate the queue and event arguments. */
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return clblasInvalidValue;
+    }
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Only one queue is used for now: there is no multi-GPU support. */
+    numCommandQueues = 1;
+
+    kargs->N = N;
+    kargs->A = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;       /* ldb carries incx */
+
+    /* According to Netlib, a negative incx means there is nothing to do. */
+    if (incx < 0) {
+        return clblasSuccess;
+    }
+
+#ifdef DEBUG_SCAL
+    printf("Calling makeSolutionSeq from DoScal: SCAL\n");
+#endif
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_SCAL, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+    freeSolutionSeq(&seq);
+
+    return (clblasStatus)err;
+}
+
+
+
+
+
+/*
+ * SCAL entry point for TYPE_FLOAT vectors.
+ *
+ * Stores alpha in a freshly zeroed CLBlasKargs and forwards the remaining
+ * arguments to doScal(). Prints a trace line when DEBUG_SCAL is defined.
+ *
+ * Returns whatever doScal() returns.
+ */
+clblasStatus
+clblasSscal(
+    size_t N,
+    float alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs scalArgs;
+
+#ifdef DEBUG_SCAL
+    printf("\nSSCAL Called\n");
+#endif
+
+    memset(&scalArgs, 0, sizeof(scalArgs));
+    scalArgs.alpha.argFloat = alpha;
+    scalArgs.dtype = TYPE_FLOAT;
+
+    return doScal(&scalArgs, N, X, offx, incx,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * SCAL entry point for TYPE_DOUBLE vectors.
+ *
+ * Stores alpha in a freshly zeroed CLBlasKargs and forwards the remaining
+ * arguments to doScal(). Prints a trace line when DEBUG_SCAL is defined.
+ *
+ * Returns whatever doScal() returns.
+ */
+clblasStatus
+clblasDscal(
+    size_t N,
+    double alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs scalArgs;
+
+#ifdef DEBUG_SCAL
+    printf("\nDSCAL Called\n");
+#endif
+
+    memset(&scalArgs, 0, sizeof(scalArgs));
+    scalArgs.alpha.argDouble = alpha;
+    scalArgs.dtype = TYPE_DOUBLE;
+
+    return doScal(&scalArgs, N, X, offx, incx,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * SCAL entry point for TYPE_COMPLEX_FLOAT vectors with a complex alpha.
+ *
+ * Stores alpha in a freshly zeroed CLBlasKargs and forwards the remaining
+ * arguments to doScal(). Prints a trace line when DEBUG_SCAL is defined.
+ *
+ * Returns whatever doScal() returns.
+ */
+clblasStatus
+clblasCscal(
+    size_t N,
+    cl_float2 alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs scalArgs;
+
+#ifdef DEBUG_SCAL
+    printf("\nCSCAL Called\n");
+#endif
+
+    memset(&scalArgs, 0, sizeof(scalArgs));
+    scalArgs.alpha.argFloatComplex = alpha;
+    scalArgs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doScal(&scalArgs, N, X, offx, incx,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * SCAL entry point for TYPE_COMPLEX_DOUBLE vectors with a complex alpha.
+ *
+ * Stores alpha in a freshly zeroed CLBlasKargs and forwards the remaining
+ * arguments to doScal(). Prints a trace line when DEBUG_SCAL is defined.
+ *
+ * Returns whatever doScal() returns.
+ */
+clblasStatus
+clblasZscal(
+    size_t N,
+    cl_double2 alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs scalArgs;
+
+#ifdef DEBUG_SCAL
+    printf("\nZSCAL Called\n");
+#endif
+
+    memset(&scalArgs, 0, sizeof(scalArgs));
+    scalArgs.alpha.argDoubleComplex = alpha;
+    scalArgs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doScal(&scalArgs, N, X, offx, incx,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * SCAL entry point for TYPE_COMPLEX_FLOAT vectors with a real (float) alpha.
+ *
+ * The real alpha is widened to a FloatComplex with a zero imaginary part,
+ * stored in a freshly zeroed CLBlasKargs, and the remaining arguments are
+ * forwarded to doScal().
+ *
+ * The debug trace is enabled by DEBUG_SCAL, the switch used by the rest of
+ * this file; the former DEBUG_SSCAL spelling is still honoured.
+ *
+ * Returns whatever doScal() returns.
+ */
+clblasStatus
+clblasCsscal(
+    size_t N,
+    float alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    FloatComplex fAlpha;
+
+#if defined(DEBUG_SCAL) || defined(DEBUG_SSCAL)
+    printf("\nCSSCAL Called\n");
+#endif
+
+    /* Widen the real scale factor to complex: (alpha, 0). */
+    CREAL(fAlpha) = alpha;
+    CIMAG(fAlpha) = 0.0f;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.alpha.argFloatComplex = fAlpha;
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doScal(&kargs, N, X, offx, incx,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * SCAL entry point for TYPE_COMPLEX_DOUBLE vectors with a real (double) alpha.
+ *
+ * The real alpha is widened to a DoubleComplex with a zero imaginary part,
+ * stored in a freshly zeroed CLBlasKargs, and the remaining arguments are
+ * forwarded to doScal().
+ *
+ * The debug trace is enabled by DEBUG_SCAL, the switch used by the rest of
+ * this file; the former DEBUG_SSCAL spelling is still honoured.
+ *
+ * Returns whatever doScal() returns.
+ */
+clblasStatus
+clblasZdscal(
+    size_t N,
+    double alpha,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    DoubleComplex fAlpha;
+
+#if defined(DEBUG_SCAL) || defined(DEBUG_SSCAL)
+    printf("\nZDSCAL Called\n");
+#endif
+
+    /* Widen the real scale factor to complex: (alpha, 0). */
+    CREAL(fAlpha) = alpha;
+    CIMAG(fAlpha) = 0.0;    /* double literal to match the double component */
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.alpha.argDoubleComplex = fAlpha;
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doScal(&kargs, N, X, offx, incx,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xshbmv.c b/src/library/blas/xshbmv.c
new file mode 100644
index 0000000..e0a5087
--- /dev/null
+++ b/src/library/blas/xshbmv.c
@@ -0,0 +1,264 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared back end of the SBMV / HBMV entry points defined in this file.
+ *
+ * The caller supplies a CLBlasKargs that already carries dtype, pigFuncID,
+ * alpha and beta; this routine validates the remaining arguments, copies
+ * them into kargs and runs a CLBLAS_GBMV solution sequence.
+ *
+ * Returns clblasNotInitialized when the library is not initialised,
+ * clblasInvalidValue / clblasInvalidCommandQueue / clblasInvalidEventWaitList
+ * for bad queue or wait-list arguments, whatever the check* helpers report
+ * for bad buffers, and otherwise the status of building / executing the
+ * solution sequence.
+ */
+static clblasStatus
+doSHbmv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem x,
+    size_t offx,
+    int incx,
+    cl_mem y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus status = clblasSuccess;
+    ListHead solution;
+    cl_int rc;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Queue and wait-list checks run before any buffer is inspected. */
+    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
+        return clblasInvalidValue;
+    }
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+    if ((eventWaitList == NULL) && (numEventsInWaitList != 0)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Buffer validation: memory objects first, then their sizes. */
+    status = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (status) {
+        return status;
+    }
+
+    /* The band matrix is N x N with K sub- and (last-but-size arg) 0. */
+    status = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans,
+                                    N, N, K, 0, A, offa, lda, A_MAT_ERRSET);
+    if (status) {
+        return status;
+    }
+
+    status = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET);
+    if (status) {
+        return status;
+    }
+
+    status = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET);
+    if (status) {
+        return status;
+    }
+
+    /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */
+    numCommandQueues = 1;
+
+    /* Problem description. */
+    kargs->order  = order;
+    kargs->uplo   = uplo;
+    kargs->transA = clblasNoTrans;
+    kargs->M      = N;
+    kargs->N      = N;
+    kargs->KL     = K;
+    kargs->KU     = K;
+
+    /* Matrix A. */
+    kargs->A          = A;
+    kargs->offA       = offa;
+    kargs->offa       = offa;
+    kargs->lda.matrix = lda;
+
+    /* Vector x travels in the B slot, vector y in the C slot. */
+    kargs->B          = x;
+    kargs->offBX      = offx;
+    kargs->ldb.vector = incx;
+    kargs->C          = y;
+    kargs->offCY      = offy;
+    kargs->ldc.vector = incy;
+
+    /* The banded kernel family is GBMV; pigFuncID set by the caller rides along. */
+    listInitHead(&solution);
+    rc = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues,
+                         numEventsInWaitList, eventWaitList, events, &solution);
+    if (rc == CL_SUCCESS) {
+        rc = executeSolutionSeq(&solution);
+    }
+
+    /* Released on both the success and the failure path. */
+    freeSolutionSeq(&solution);
+
+    return (clblasStatus)rc;
+}
+
+/*
+ * SSBMV entry point (float data).
+ *
+ * Prepares a zero-filled CLBlasKargs tagged TYPE_FLOAT / CLBLAS_SBMV with the
+ * given alpha and beta, then defers everything else to doSHbmv(), whose
+ * status is returned unchanged.
+ */
+clblasStatus
+clblasSsbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_SBMV;
+    args.dtype = TYPE_FLOAT;
+    args.beta.argFloat = beta;
+    args.alpha.argFloat = alpha;
+
+    return doSHbmv(&args, order, uplo, N, K,
+                   A, offa, lda,
+                   X, offx, incx,
+                   Y, offy, incy,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * DSBMV entry point (double data).
+ *
+ * Prepares a zero-filled CLBlasKargs tagged TYPE_DOUBLE / CLBLAS_SBMV with
+ * the given alpha and beta, then defers everything else to doSHbmv(), whose
+ * status is returned unchanged.
+ */
+clblasStatus
+clblasDsbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_SBMV;
+    args.dtype = TYPE_DOUBLE;
+    args.beta.argDouble = beta;
+    args.alpha.argDouble = alpha;
+
+    return doSHbmv(&args, order, uplo, N, K,
+                   A, offa, lda,
+                   X, offx, incx,
+                   Y, offy, incy,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * CHBMV entry point (complex float data).
+ *
+ * Prepares a zero-filled CLBlasKargs tagged TYPE_COMPLEX_FLOAT / CLBLAS_HBMV
+ * with the given alpha and beta, then defers everything else to doSHbmv(),
+ * whose status is returned unchanged.
+ */
+clblasStatus
+clblasChbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HBMV;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+    args.beta.argFloatComplex = beta;
+    args.alpha.argFloatComplex = alpha;
+
+    return doSHbmv(&args, order, uplo, N, K,
+                   A, offa, lda,
+                   X, offx, incx,
+                   Y, offy, incy,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ZHBMV entry point (complex double data).
+ *
+ * Prepares a zero-filled CLBlasKargs tagged TYPE_COMPLEX_DOUBLE / CLBLAS_HBMV
+ * with the given alpha and beta, then defers everything else to doSHbmv(),
+ * whose status is returned unchanged.
+ */
+clblasStatus
+clblasZhbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double2 beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_HBMV;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+    args.beta.argDoubleComplex = beta;
+    args.alpha.argDoubleComplex = alpha;
+
+    return doSHbmv(&args, order, uplo, N, K,
+                   A, offa, lda,
+                   X, offx, incx,
+                   Y, offy, incy,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xspmv.c b/src/library/blas/xspmv.c
new file mode 100644
index 0000000..d522ba8
--- /dev/null
+++ b/src/library/blas/xspmv.c
@@ -0,0 +1,187 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared back end of clblasSspmv() / clblasDspmv().
+ *
+ * The caller supplies a CLBlasKargs that already carries dtype, alpha and
+ * beta. After validating the arguments this routine fills in the rest of
+ * kargs (pigFuncID = CLBLAS_SPMV, lda = 0 for the packed matrix) and issues
+ * two CLBLAS_TRMV solution sequences back to back:
+ *   1. transA = clblasNoTrans, diag = clblasNonUnit, waiting on the caller's
+ *      event list and signalling a private intermediate event;
+ *   2. transA = clblasTrans, diag = clblasUnit, waiting on that intermediate
+ *      event and signalling the caller's `events`.
+ *
+ * Returns clblasNotInitialized, a check* helper status, clblasInvalidValue,
+ * clblasInvalidCommandQueue or clblasInvalidEventWaitList on bad arguments;
+ * otherwise the status of the first failing make/execute step, or success.
+ */
+static clblasStatus
+doSpmv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq1, seq2;
+    /*
+     * Links the two passes. Initialised so it never holds an indeterminate
+     * value.
+     * NOTE(review): this event is not released anywhere in this function;
+     * confirm whether the solution sequence takes ownership of it or whether
+     * a clReleaseEvent() is owed once the second pass has been enqueued.
+     */
+    cl_event first_event = NULL;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments */
+
+    retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    /* lda is passed as 0 because AP is a packed matrix. */
+    retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP,
+                               offa, 0, A_MAT_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    if ((commandQueues == NULL) || (numCommandQueues == 0))
+    {
+        return clblasInvalidValue;
+    }
+    /*
+     * Only the first queue is used below, so reject a NULL entry up front,
+     * the same way the banded (xshbmv.c) and swap (xswap.c) back ends do.
+     */
+    if (commandQueues[0] == NULL)
+    {
+        return clblasInvalidCommandQueue;
+    }
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL))
+    {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Single queue only. */
+    numCommandQueues = 1;
+
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->N = N;
+    kargs->A = AP;
+    kargs->offA = offa;
+    kargs->offa = offa;
+    kargs->lda.matrix = 0;      // Set lda as zero for packed matrices
+    kargs->B = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;
+    kargs->C = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;
+
+    /* Settings for the first TRMV pass. */
+    kargs->transA = clblasNoTrans;
+    kargs->diag = clblasNonUnit;
+
+    kargs->pigFuncID = CLBLAS_SPMV;
+
+    listInitHead(&seq1);
+    err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, &first_event, &seq1);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq1);
+        if (err == CL_SUCCESS)
+        {
+            /* Second pass: flipped transpose/diag, ordered after the first. */
+            listInitHead(&seq2);
+            kargs->transA = clblasTrans;
+            kargs->diag   = clblasUnit;
+
+            err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues,
+                                  1, &first_event, events, &seq2);
+            if (err == CL_SUCCESS)
+            {
+                err = executeSolutionSeq(&seq2);
+            }
+            freeSolutionSeq(&seq2);
+        }
+    }
+
+    freeSolutionSeq(&seq1);
+    return (clblasStatus)err;
+}
+
+/*
+ * SSPMV entry point (float data).
+ *
+ * Builds a zero-filled CLBlasKargs carrying TYPE_FLOAT, alpha and beta and
+ * passes every other argument straight through to doSpmv(); the status it
+ * returns is handed back to the caller.
+ */
+clblasStatus
+clblasSspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argFloat = beta;
+    args.alpha.argFloat = alpha;
+    args.dtype = TYPE_FLOAT;
+
+    return doSpmv(&args, order, uplo, N,
+                  AP, offa,
+                  X, offx, incx,
+                  Y, offy, incy,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * DSPMV entry point (double data).
+ *
+ * Builds a zero-filled CLBlasKargs carrying TYPE_DOUBLE, alpha and beta and
+ * passes every other argument straight through to doSpmv(); the status it
+ * returns is handed back to the caller.
+ */
+clblasStatus
+clblasDspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argDouble = beta;
+    args.alpha.argDouble = alpha;
+    args.dtype = TYPE_DOUBLE;
+
+    return doSpmv(&args, order, uplo, N,
+                  AP, offa,
+                  X, offx, incx,
+                  Y, offy, incy,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
\ No newline at end of file
diff --git a/src/library/blas/xswap.c b/src/library/blas/xswap.c
new file mode 100644
index 0000000..3806618
--- /dev/null
+++ b/src/library/blas/xswap.c
@@ -0,0 +1,228 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#define DEBUG_SWAP
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+
+/*
+ * Shared back end of the clblas{S,D,C,Z}swap() entry points.
+ *
+ * The caller supplies a CLBlasKargs whose dtype is already set. This routine
+ * validates both vectors and the queue / wait-list arguments, records the
+ * operands in kargs and runs a CLBLAS_SWAP solution sequence.
+ *
+ * Returns clblasNotInitialized, a check* helper status, clblasInvalidValue,
+ * clblasInvalidCommandQueue or clblasInvalidEventWaitList on bad arguments;
+ * otherwise the status of building / executing the solution sequence.
+ *
+ * Diagnostics for rejected arguments are printed only when DEBUG_SWAP is
+ * defined, so a release build of the library does not write to stdout.
+ */
+clblasStatus
+doSwap(
+    CLBlasKargs *kargs,
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments */
+
+    /* Only two buffers take part; X is passed again to fill the third slot. */
+    retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_SWAP
+        printf("Invalid mem object..\n");
+        #endif
+        return retCode;
+    }
+
+    /* Check whether enough memory was allocated for each vector. */
+    retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_SWAP
+        printf("Invalid Size for X\n");
+        #endif
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_SWAP
+        printf("Invalid Size for Y\n");
+        #endif
+        return retCode;
+    }
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0))
+    {
+        return clblasInvalidValue;
+    }
+
+    /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */
+    numCommandQueues = 1;
+    if (commandQueues[0] == NULL)
+    {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL))
+    {
+        return clblasInvalidEventWaitList;
+    }
+
+    /*
+     * X goes in the A slot and Y in the B slot, while their offsets and
+     * increments use the BX/ldb and CY/ldc fields respectively.
+     * NOTE(review): presumably this is the layout the SWAP kernel side
+     * expects; confirm against the kernel generator before changing it.
+     */
+    kargs->N = N;
+    kargs->A = X;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;	// Will be using this as incx
+    kargs->B = Y;
+    kargs->offCY = offy;
+    kargs->ldc.vector = incy;	// Will be using this as incy
+
+    #ifdef DEBUG_SWAP
+    printf("Calling makeSolutionSeq from DoSwap: SWAP\n");
+    #endif
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_SWAP, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    /* Released on both the success and the failure path. */
+    freeSolutionSeq(&seq);
+
+    return (clblasStatus)err;
+}
+
+
+
+
+
+/*
+ * SSWAP entry point (float data).
+ *
+ * Tags a zero-filled CLBlasKargs with TYPE_FLOAT and lets doSwap() do the
+ * rest; the status doSwap() returns is passed back unchanged.
+ */
+clblasStatus
+clblasSswap(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_SWAP
+    printf("\nSSWAP Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_FLOAT;
+
+    return doSwap(&args, N,
+                  X, offx, incx,
+                  Y, offy, incy,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * DSWAP entry point (double data).
+ *
+ * Tags a zero-filled CLBlasKargs with TYPE_DOUBLE and lets doSwap() do the
+ * rest; the status doSwap() returns is passed back unchanged.
+ */
+clblasStatus
+clblasDswap(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_SWAP
+    printf("\nDSWAP Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_DOUBLE;
+
+    return doSwap(&args, N,
+                  X, offx, incx,
+                  Y, offy, incy,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * CSWAP entry point (complex float data).
+ *
+ * Tags a zero-filled CLBlasKargs with TYPE_COMPLEX_FLOAT and lets doSwap()
+ * do the rest; the status doSwap() returns is passed back unchanged.
+ */
+clblasStatus
+clblasCswap(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int  incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_SWAP
+    printf("\nCSWAP Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doSwap(&args, N,
+                  X, offx, incx,
+                  Y, offy, incy,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * ZSWAP entry point (complex double data).
+ *
+ * Tags a zero-filled CLBlasKargs with TYPE_COMPLEX_DOUBLE and lets doSwap()
+ * do the rest; the status doSwap() returns is passed back unchanged.
+ */
+clblasStatus
+clblasZswap(
+    size_t N,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_SWAP
+    printf("\nZSWAP Called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doSwap(&args, N,
+                  X, offx, incx,
+                  Y, offy, incy,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xsymm.c b/src/library/blas/xsymm.c
new file mode 100644
index 0000000..e61a33f
--- /dev/null
+++ b/src/library/blas/xsymm.c
@@ -0,0 +1,436 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * SYMM_USING_GEMM selects, at compile time, the doSymm() path that issues
+ * three chained executeGEMM() calls instead of building a CLBLAS_SYMM
+ * solution sequence.
+ */
+#define SYMM_USING_GEMM
+/* Uncomment to enable the DEBUG_SYMM trace printf()s in this file. */
+//#define DEBUG_SYMM
+/*
+ * Defined outside this file. doSymm() calls it with a fully populated
+ * CLBlasKargs plus the usual queue / wait-list / output-event arguments and
+ * treats CL_SUCCESS as the success status.
+ */
+extern clblasStatus executeGEMM( CLBlasKargs *kargs, cl_uint numCommandQueues,
+                                    cl_command_queue *commandQueues,
+                                    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+                                    cl_event *events);
+
+/*
+ * Common implementation behind the clblas{S,D,C,Z}symm() entry points; the
+ * symm_or_hemm argument lets a HEMM caller share it (CLBLAS_HEMM selects the
+ * conjugate transpose and the HEMM diagonal pigFuncID below).
+ *
+ * kargs must arrive with dtype, alpha and beta already set. The routine
+ *   - validates the three buffers and the queue / wait-list arguments,
+ *   - records the problem in kargs, converting a row-major request into the
+ *     equivalent column-major one (M/N swapped, side and uplo flipped),
+ *   - and, when SYMM_USING_GEMM is defined, issues three chained
+ *     executeGEMM() calls: one with kargs as recorded, one with A (left) or
+ *     B (right) transposed and beta forced to 1, and one "diagonal" call
+ *     (CLBLAS_SYMM_DIAGONAL / CLBLAS_HEMM_DIAGONAL) with beta forced to 1.
+ *     Without SYMM_USING_GEMM a single CLBLAS_SYMM solution sequence is run.
+ *
+ * Returns clblasNotInitialized, a check* helper status, clblasInvalidValue,
+ * clblasInvalidCommandQueue or clblasInvalidEventWaitList on bad arguments;
+ * otherwise the status of the first failing step, or success.
+ *
+ * Diagnostics for rejected arguments are printed only when DEBUG_SYMM is
+ * defined, so a release build of the library does not write to stdout.
+ */
+clblasStatus
+doSymm(	CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasSide side,
+		size_t M, size_t N,
+		const cl_mem A, size_t offa, size_t lda,
+		const cl_mem B, size_t offb, size_t ldb,
+		cl_mem C, size_t offc, size_t ldc,
+		cl_uint numCommandQueues, cl_command_queue *commandQueues,
+		cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+		cl_event *events,
+        BlasFunctionID symm_or_hemm)
+{
+    cl_int err;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments */
+
+    retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_SYMM
+        printf("SYMM:- Invalid mem object..\n");
+        #endif
+        return retCode;
+    }
+
+    /* B and C are both M x N. */
+    retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offb, ldb, B_MAT_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_SYMM
+        printf("Invalid Size for B\n");
+        #endif
+        return retCode;
+    }
+
+    retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offc, ldc, C_MAT_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_SYMM
+        printf("Invalid Size for C\n");
+        #endif
+        return retCode;
+    }
+
+    if (side == clblasLeft)
+    {
+        // MxM x MxN
+        retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, M, A, offa, lda, A_MAT_ERRSET);
+    } else {
+        // MxN x NxN
+        retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET);
+    }
+    if (retCode) {
+        #ifdef DEBUG_SYMM
+        printf("Invalid Size for A\n");
+        #endif
+        return retCode;
+    }
+
+    #ifdef DEBUG_SYMM
+    printf("DoSymm being called...\n");
+    #endif
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0))
+    {
+        return clblasInvalidValue;
+    }
+
+    /*
+     * Only the first queue is used below, so reject a NULL entry up front,
+     * the same way the swap back end (xswap.c) does.
+     */
+    if (commandQueues[0] == NULL)
+    {
+        return clblasInvalidCommandQueue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL))
+    {
+        return clblasInvalidEventWaitList;
+    }
+
+    /* Single queue only. */
+    numCommandQueues = 1;
+
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->side = side;
+    kargs->pigFuncID = symm_or_hemm;
+    kargs->M = M;
+    /* The inner dimension is the order of the square matrix A. */
+    if (kargs->side == clblasLeft)
+    {
+        kargs->K = M;
+    } else {
+        kargs->K = N;
+    }
+    kargs->N = N;
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->B = B;
+    kargs->ldb.matrix = ldb;
+    kargs->C = C;
+    kargs->ldc.matrix = ldc;
+    kargs->offA = offa;
+    kargs->offa = offa;
+    kargs->offBX = offb;
+    kargs->offCY = offc;
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+    //kargs->offsetK = 0;   FIXME: not found offsetK in new AMD structure!
+    kargs->scimage[0] = 0;
+    kargs->scimage[1] = 0;
+
+    /*
+     * Re-express a row-major problem as a column-major one: swap M and N and
+     * flip both side and uplo. K was chosen above from the caller's side and
+     * is deliberately left untouched.
+     */
+    if (kargs->order == clblasRowMajor)
+    {
+        kargs->order = clblasColumnMajor;
+        kargs->M = N;
+        kargs->N = M;
+
+        if (kargs->side == clblasLeft)
+        {
+            kargs->side = clblasRight;
+        } else {
+            kargs->side = clblasLeft;
+        }
+
+        if (kargs->uplo == clblasUpper)
+        {
+            kargs->uplo = clblasLower;
+        } else {
+            kargs->uplo = clblasUpper;
+        }
+    }
+
+#ifndef SYMM_USING_GEMM
+    #ifdef DEBUG_SYMM
+    printf("Calling makeSolutionSeq : SYMM \n");
+    #endif
+    {
+        ListHead seq;
+
+        listInitHead(&seq);
+        err = makeSolutionSeq(CLBLAS_SYMM, kargs, numCommandQueues, commandQueues,
+                              numEventsInWaitList, eventWaitList, events, &seq);
+        if (err == CL_SUCCESS) {
+            err = executeSolutionSeq(&seq);
+        }
+        freeSolutionSeq(&seq);
+    }
+#else
+    //
+    // SYMM_USING_GEMM
+    //
+    {
+        CLBlasKargs GEMMNArgs, GEMMTArgs, GEMMDArgs;
+        /*
+         * Chain the three GEMM calls.
+         * NOTE(review): neither intermediate event is released in this
+         * function; confirm whether a clReleaseEvent() is owed for them.
+         */
+        cl_event gemmNEvent, gemmTEvent ;
+        FloatComplex cBeta;
+        DoubleComplex zBeta;
+        clblasTranspose transposeFunction = clblasTrans;
+
+        /* All three calls start from the same recorded problem. */
+        memcpy(&GEMMNArgs, kargs, sizeof(CLBlasKargs));
+        memcpy(&GEMMTArgs, kargs, sizeof(CLBlasKargs));
+        memcpy(&GEMMDArgs, kargs, sizeof(CLBlasKargs));
+
+        //
+        // It is the diagonal piggy back for GEMMD. For others, it is just
+        // the caller's symm_or_hemm value.
+        //
+        switch(symm_or_hemm)
+        {
+            case CLBLAS_SYMM:
+                transposeFunction = clblasTrans;
+                GEMMDArgs.pigFuncID = CLBLAS_SYMM_DIAGONAL;
+                break;
+
+            case CLBLAS_HEMM:
+                transposeFunction = clblasConjTrans;
+                GEMMDArgs.pigFuncID = CLBLAS_HEMM_DIAGONAL;
+                break;
+
+            default:
+                printf("WARNING: doSymm():  Neither SYMM nor HEMM is calling this function.");
+                break;
+        }
+
+        //
+        // Set the Transpose for GEMM'T' and GEMM'D'
+        // The other two do not have transpose by default
+        //
+        switch(kargs->side)
+        {
+            case clblasLeft:
+                GEMMTArgs.transA = transposeFunction;
+                if (kargs->uplo == clblasUpper)
+                {
+                    //
+                    // Left/Upper counterpart of the Right/Lower TAIL handling
+                    // below: the diagonal call gets the transposed operand too.
+                    // For all other cases, NN kernel is good enough to handle tails
+                    //
+                    GEMMDArgs.transA = transposeFunction;
+                }
+                break;
+
+            case clblasRight:
+                GEMMTArgs.transB = transposeFunction;
+                if (kargs->uplo == clblasLower)
+                {
+                    //
+                    // This is for proper TAIL handling for Right Lower case alone
+                    // For all other cases, NN kernel is good enough to handle tails
+                    //
+                    GEMMDArgs.transB = transposeFunction;
+                }
+                break;
+
+            default:
+                break;
+        }
+
+        //
+        // Set the BETA multiplier to 1 for GEMMT and GEMMD, so that they
+        // accumulate into C rather than rescaling it a second time.
+        //
+        memset(&GEMMTArgs.beta, 0, sizeof(GEMMTArgs.beta));
+        memset(&GEMMDArgs.beta, 0, sizeof(GEMMDArgs.beta));
+        switch(kargs->dtype)
+        {
+            case TYPE_FLOAT:
+                GEMMTArgs.beta.argFloat = 1.0f;
+                GEMMDArgs.beta.argFloat = 1.0f;
+                break;
+
+            case TYPE_DOUBLE:
+                GEMMTArgs.beta.argDouble = 1.0;
+                GEMMDArgs.beta.argDouble = 1.0;
+                break;
+
+            case TYPE_COMPLEX_FLOAT:
+                CREAL(cBeta) = 1.0f;
+                CIMAG(cBeta) = 0.0f;
+                GEMMTArgs.beta.argFloatComplex = cBeta;
+                GEMMDArgs.beta.argFloatComplex = cBeta;
+                break;
+
+            case TYPE_COMPLEX_DOUBLE:
+                CREAL(zBeta) = 1.0;
+                CIMAG(zBeta) = 0.0;
+                GEMMTArgs.beta.argDoubleComplex = zBeta;
+                GEMMDArgs.beta.argDoubleComplex = zBeta;
+                break;
+
+            default:
+                /* Unknown dtype: beta stays zero-filled, as before. */
+                break;
+        }
+
+        //
+        // GEMM Handler will notice the "pigFuncID" and set appropriate flags
+        //
+        err = executeGEMM(&GEMMNArgs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &gemmNEvent);
+        if (err == CL_SUCCESS)
+        {
+            err = executeGEMM(&GEMMTArgs, numCommandQueues, commandQueues, 1, &gemmNEvent, &gemmTEvent);
+            if (err == CL_SUCCESS)
+            {
+                err = executeGEMM(&GEMMDArgs, numCommandQueues, commandQueues, 1, &gemmTEvent, events);
+            }
+        }
+    }
+#endif
+    return (clblasStatus)err;
+}
+
+/*
+ * SSYMM entry point (float data).
+ *
+ * Fills a zeroed CLBlasKargs with TYPE_FLOAT, alpha and beta and calls
+ * doSymm() with CLBLAS_SYMM. Note the argument order differs: this function
+ * takes (side, uplo) while doSymm() takes (uplo, side).
+ */
+clblasStatus
+clblasSsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argFloat = beta;
+    args.alpha.argFloat = alpha;
+    args.dtype = TYPE_FLOAT;
+
+#ifdef DEBUG_SYMM
+    printf("Ssymm called\n");
+#endif
+
+    return doSymm(&args, order, uplo, side, M, N,
+                  A, offa, lda,
+                  B, offb, ldb,
+                  C, offc, ldc,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events,
+                  CLBLAS_SYMM);
+}
+
+/*
+ * DSYMM entry point (double data).
+ *
+ * Fills a zeroed CLBlasKargs with TYPE_DOUBLE, alpha and beta and calls
+ * doSymm() with CLBLAS_SYMM. Note the argument order differs: this function
+ * takes (side, uplo) while doSymm() takes (uplo, side).
+ */
+clblasStatus
+clblasDsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_SYMM
+    printf("Dsymm called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argDouble = beta;
+    args.alpha.argDouble = alpha;
+    args.dtype = TYPE_DOUBLE;
+
+    return doSymm(&args, order, uplo, side, M, N,
+                  A, offa, lda,
+                  B, offb, ldb,
+                  C, offc, ldc,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events,
+                  CLBLAS_SYMM);
+}
+
+/*
+ * CSYMM entry point (complex float data).
+ *
+ * Fills a zeroed CLBlasKargs with TYPE_COMPLEX_FLOAT, alpha and beta and
+ * calls doSymm() with CLBLAS_SYMM. Note the argument order differs: this
+ * function takes (side, uplo) while doSymm() takes (uplo, side).
+ */
+clblasStatus
+clblasCsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_float2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_float2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_SYMM
+    printf("Csymm called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argFloatComplex = beta;
+    args.alpha.argFloatComplex = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doSymm(&args, order, uplo, side, M, N,
+                  A, offa, lda,
+                  B, offb, ldb,
+                  C, offc, ldc,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events,
+                  CLBLAS_SYMM);
+}
+
+/*
+ * ZSYMM entry point (complex double data).
+ *
+ * Fills a zeroed CLBlasKargs with TYPE_COMPLEX_DOUBLE, alpha and beta and
+ * calls doSymm() with CLBLAS_SYMM. Note the argument order differs: this
+ * function takes (side, uplo) while doSymm() takes (uplo, side).
+ */
+clblasStatus
+clblasZsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    cl_double2 alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    cl_double2 beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+#ifdef DEBUG_SYMM
+    printf("Zsymm called\n");
+#endif
+
+    memset(&args, 0, sizeof(args));
+    args.beta.argDoubleComplex = beta;
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doSymm(&args, order, uplo, side, M, N,
+                  A, offa, lda,
+                  B, offb, ldb,
+                  C, offc, ldc,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events,
+                  CLBLAS_SYMM);
+}
+
diff --git a/src/library/blas/xsymv.c b/src/library/blas/xsymv.c
new file mode 100644
index 0000000..55b23e8
--- /dev/null
+++ b/src/library/blas/xsymv.c
@@ -0,0 +1,201 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#define USE_SYMV
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common worker behind the clblas?symv entry points.
+ *
+ * Returns clblasNotInitialized when the library has not been set up.  The
+ * buffers and the sizes of A (N x N), x and y are then validated, and the
+ * status of the first failing check is returned.  Otherwise the problem
+ * description is stored in *kargs, a solution sequence is built, executed
+ * and released, and the resulting cl_int is returned cast to clblasStatus.
+ */
+static clblasStatus
+doSymv(
+    CLBlasKargs *kargs,
+    clblasOrder order, clblasUplo uplo, size_t N,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem x, size_t offx, int incx,
+    cl_mem y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int status;
+    ListHead solutionSeq;
+    clblasStatus checkResult = clblasSuccess;
+#ifdef USE_SYMV
+    ListHead secondSeq;
+    ListNode *secondNode;
+    cl_event firstEvent;
+#endif
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Argument validation: bail out on the first check reporting an error. */
+    checkResult = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (checkResult) {
+        return checkResult;
+    }
+    checkResult = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                   A, offA, lda, A_MAT_ERRSET);
+    if (checkResult) {
+        return checkResult;
+    }
+    checkResult = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET);
+    if (checkResult) {
+        return checkResult;
+    }
+    checkResult = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET);
+    if (checkResult) {
+        return checkResult;
+    }
+
+    /* Problem geometry. */
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->N = N;
+    kargs->K = N;               /* keeps a copy of the original N */
+    /* Matrix A; both offset fields receive the same value. */
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->offA = offA;
+    kargs->offa = offA;
+
+    /* Vector x goes in the B slot, vector y in the C slot. */
+    kargs->B = x;
+    kargs->ldb.vector = incx;
+    kargs->offBX = offx;
+    kargs->C = y;
+    kargs->ldc.vector = incy;
+    kargs->offCY = offy;
+
+    listInitHead(&solutionSeq);
+
+#ifdef USE_SYMV
+    /* Alternative path: two chained CLBLAS_HEMV steps on a single queue. */
+    numCommandQueues = 1;
+    kargs->transA = clblasNoTrans;
+    kargs->diag = clblasNonUnit;
+    status = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues,
+                             numEventsInWaitList, eventWaitList, &firstEvent, &solutionSeq);
+    if (status == CL_SUCCESS) {
+        listInitHead(&secondSeq);
+
+        kargs->transA = clblasTrans;
+        kargs->diag = clblasUnit;
+        status = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues,
+                                 1, &firstEvent, events, &secondSeq);
+        if (status == CL_SUCCESS) {
+            /* Move the second step onto the main sequence, then run both. */
+            secondNode = listNodeFirst(&secondSeq);
+            listAddToTail(&solutionSeq, secondNode);
+            status = executeSolutionSeq(&solutionSeq);
+        }
+    }
+#else
+    status = makeSolutionSeq(CLBLAS_SYMV, kargs, numCommandQueues, commandQueues,
+                             numEventsInWaitList, eventWaitList, events, &solutionSeq);
+    if (status == CL_SUCCESS) {
+        status = executeSolutionSeq(&solutionSeq);
+    }
+#endif
+
+    freeSolutionSeq(&solutionSeq);
+    return (clblasStatus)status;
+}
+
+/*
+ * Single-precision SYMV entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_FLOAT together with the alpha and
+ * beta scalars, and hands all remaining arguments unchanged to doSymv(),
+ * whose status is returned.
+ */
+clblasStatus
+clblasSsymv(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem x, size_t offx, int incx,
+    cl_float beta,
+    cl_mem y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs symvArgs;
+
+    memset(&symvArgs, 0, sizeof symvArgs);
+    symvArgs.beta.argFloat = beta;
+    symvArgs.alpha.argFloat = alpha;
+    symvArgs.dtype = TYPE_FLOAT;
+
+    return doSymv(&symvArgs, order, uplo, N,
+                  A, offA, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision SYMV entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_DOUBLE together with the alpha and
+ * beta scalars, and hands all remaining arguments unchanged to doSymv(),
+ * whose status is returned.
+ */
+clblasStatus
+clblasDsymv(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem x, size_t offx, int incx,
+    cl_double beta,
+    cl_mem y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs symvArgs;
+
+    memset(&symvArgs, 0, sizeof symvArgs);
+    symvArgs.beta.argDouble = beta;
+    symvArgs.alpha.argDouble = alpha;
+    symvArgs.dtype = TYPE_DOUBLE;
+
+    return doSymv(&symvArgs, order, uplo, N,
+                  A, offA, lda, x, offx, incx, y, offy, incy,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xsyr.c b/src/library/blas/xsyr.c
new file mode 100644
index 0000000..d2d1ae7
--- /dev/null
+++ b/src/library/blas/xsyr.c
@@ -0,0 +1,248 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common worker behind clblas{S,D}syr and clblas{S,D}spr (which pass lda = 0).
+ *
+ * Returns clblasNotInitialized, the status of the first failing buffer or
+ * size check, clblasInvalidValue (no command queue) or
+ * clblasInvalidEventWaitList (non-empty wait list passed as NULL);
+ * otherwise the result of building and executing the solution sequence.
+ * Argument diagnostics are printed only under DEBUG_SYR, like the other
+ * trace output here, so regular builds never write to the caller's stdout.
+ */
+clblasStatus
+doSyr(
+    CLBlasKargs *kargs,
+    clblasOrder order, clblasUplo uplo, size_t N,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    cl_int err;
+    ListHead seq;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+#ifdef DEBUG_SYR
+    printf("doSyr called\n");
+#endif
+
+    /* Validate arguments; the first failing check decides the status. */
+    retCode = checkMemObjects(A, X, NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET);
+    if (retCode) {
+#ifdef DEBUG_SYR
+        printf("Invalid mem object..\n");
+#endif
+        return retCode;
+    }
+    /* NOTE(review): an old PENDING remark said checkMatrixSizes() ignores offa - confirm. */
+    retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET);
+    if (retCode) {
+#ifdef DEBUG_SYR
+        printf("Invalid Size for A\n");
+#endif
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (retCode) {
+#ifdef DEBUG_SYR
+        printf("Invalid Size for X\n");
+#endif
+        return retCode;
+    }
+
+    if ((commandQueue == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    if (order == clblasRowMajor) {
+        /* Row-major input is handled as column-major with the other triangle. */
+        kargs->order = clblasColumnMajor;
+        kargs->uplo = (uplo == clblasUpper) ? clblasLower : clblasUpper;
+    } else {
+        kargs->order = order;
+        kargs->uplo = uplo;
+    }
+    kargs->N = N;
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->B = X;
+    kargs->ldb.vector = incx;
+    kargs->offBX = offx;
+    kargs->offa = offa;
+    kargs->offA = offa;
+
+#ifdef DEBUG_SYR
+    printf("Calling makeSolutionSeq : SYR\n");
+#endif
+
+    /*
+     * Always use command queue 0.  PENDING: no multi-GPU / multi-queue
+     * support yet; this could be optimized to use the queue with the
+     * higher memory bandwidth that supports the data type and the LDA.
+     */
+    numCommandQueues = 1;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_SYR, kargs, numCommandQueues, commandQueue,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+
+
+/*
+ * Single-precision SYR entry point: zero-fills a CLBlasKargs, records
+ * TYPE_FLOAT, alpha and the CLBLAS_SYR function id, then forwards the
+ * remaining arguments to doSyr() and returns its status.
+ */
+clblasStatus
+clblasSsyr(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_float alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    CLBlasKargs syrArgs;
+
+    memset(&syrArgs, 0, sizeof syrArgs);
+    syrArgs.pigFuncID = CLBLAS_SYR;
+    syrArgs.alpha.argFloat = alpha;
+    syrArgs.dtype = TYPE_FLOAT;
+
+#ifdef DEBUG_SYR
+    printf("Ssyr called\n");
+#endif
+
+    return doSyr(&syrArgs, order, uplo, N, X, offx, incx, A, offa, lda, numCommandQueues,
+                 commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision SYR entry point: zero-fills a CLBlasKargs, records
+ * TYPE_DOUBLE, alpha and the CLBLAS_SYR function id, then forwards the
+ * remaining arguments to doSyr() and returns its status.
+ */
+clblasStatus
+clblasDsyr(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_double alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    CLBlasKargs syrArgs;
+
+    memset(&syrArgs, 0, sizeof syrArgs);
+    syrArgs.pigFuncID = CLBLAS_SYR;
+    syrArgs.alpha.argDouble = alpha;
+    syrArgs.dtype = TYPE_DOUBLE;
+
+#ifdef DEBUG_SYR
+    printf("Dsyr called\n");
+#endif
+
+    return doSyr(&syrArgs, order, uplo, N, X, offx, incx, A, offa, lda, numCommandQueues,
+                 commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Single-precision SPR entry point: zero-fills a CLBlasKargs, records
+ * TYPE_FLOAT, alpha and CLBLAS_SPR, and calls doSyr() with lda fixed to 0.
+ */
+clblasStatus
+clblasSspr(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_float alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem AP, size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    CLBlasKargs sprArgs;
+
+    memset(&sprArgs, 0, sizeof sprArgs);
+    sprArgs.pigFuncID = CLBLAS_SPR;
+    sprArgs.alpha.argFloat = alpha;
+    sprArgs.dtype = TYPE_FLOAT;
+
+    return doSyr(&sprArgs, order, uplo, N, X, offx, incx, AP, offa, 0, numCommandQueues,
+                 commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision SPR entry point: zero-fills a CLBlasKargs, records
+ * TYPE_DOUBLE, alpha and CLBLAS_SPR, and calls doSyr() with lda fixed to 0.
+ */
+clblasStatus
+clblasDspr(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    cl_double alpha,
+    const cl_mem X, size_t offx, int incx,
+    cl_mem AP, size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    CLBlasKargs sprArgs;
+
+    memset(&sprArgs, 0, sizeof sprArgs);
+    sprArgs.pigFuncID = CLBLAS_SPR;
+    sprArgs.alpha.argDouble = alpha;
+    sprArgs.dtype = TYPE_DOUBLE;
+
+    return doSyr(&sprArgs, order, uplo, N, X, offx, incx, AP, offa, 0, numCommandQueues,
+                 commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xsyr2.c b/src/library/blas/xsyr2.c
new file mode 100644
index 0000000..2f0a185
--- /dev/null
+++ b/src/library/blas/xsyr2.c
@@ -0,0 +1,270 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common worker behind clblas{S,D}syr2 and clblas{S,D}spr2 (which pass lda = 0).
+ *
+ * Returns clblasNotInitialized, the status of the first failing check,
+ * clblasInvalidValue (no command queue) or clblasInvalidEventWaitList
+ * (NULL wait list with a non-zero count); otherwise the cast result of the
+ * solution sequence.  Argument diagnostics print only under DEBUG_SYR2.
+ */
+clblasStatus
+doSyr2(
+    CLBlasKargs *kargs,
+    clblasOrder order, clblasUplo uplo, size_t N,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    cl_int err;
+    ListHead seq;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+#ifdef DEBUG_SYR2
+    printf("doSyr2 called\n");
+#endif
+    /* Validate arguments; the first failing check decides the status. */
+    retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (retCode) {
+#ifdef DEBUG_SYR2
+        printf("Invalid mem object..\n");
+#endif
+        return retCode;
+    }
+    retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET);
+    if (retCode) {
+#ifdef DEBUG_SYR2
+        printf("Invalid Size for A\n");
+#endif
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET);
+    if (retCode) {
+#ifdef DEBUG_SYR2
+        printf("Invalid Size for X\n");
+#endif
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET);
+    if (retCode) {
+#ifdef DEBUG_SYR2
+        printf("Invalid Size for Y\n");
+#endif
+        return retCode;
+    }
+
+    if ((commandQueue == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+
+    if (order == clblasRowMajor) {
+        /* Row-major input is handled as column-major with the other triangle. */
+        kargs->order = clblasColumnMajor;
+        kargs->uplo = (uplo == clblasUpper) ? clblasLower : clblasUpper;
+    } else {
+        kargs->order = order;
+        kargs->uplo = uplo;
+    }
+    kargs->N = N;
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->B = X;
+    kargs->ldb.vector = incx;
+    kargs->offBX = offx;
+    kargs->C = Y;
+    kargs->ldc.vector = incy;
+    kargs->offCY = offy;
+    kargs->offa = offa;
+    kargs->offA = offa;
+
+#ifdef DEBUG_SYR2
+    printf("Calling makeSolutionSeq : SYR2\n");
+#endif
+
+    /*
+     * Always use command queue 0.  PENDING: no multi-GPU / multi-queue support;
+     * could use the queue with the higher memory bandwidth for this type and LDA.
+     */
+    numCommandQueues = 1;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_SYR2, kargs, numCommandQueues, commandQueue,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+
+/*
+ * Single-precision SYR2 entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_FLOAT, alpha and the CLBLAS_SYR2
+ * function id, then forwards the remaining arguments unchanged to
+ * doSyr2().  Whatever status doSyr2() reports is returned to the
+ * caller.
+ */
+clblasStatus
+clblasSsyr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    float alpha,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    CLBlasKargs syr2Args;
+
+    memset(&syr2Args, 0, sizeof syr2Args);
+    syr2Args.pigFuncID = CLBLAS_SYR2;
+    syr2Args.alpha.argFloat = alpha;
+    syr2Args.dtype = TYPE_FLOAT;
+
+#ifdef DEBUG_SYR2
+    printf("Ssyr2 called\n");
+#endif
+
+    return doSyr2(&syr2Args, order, uplo, N, X, offx, incx, Y, offy, incy, A, offa, lda,
+                  numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision SYR2 entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_DOUBLE, alpha and the CLBLAS_SYR2
+ * function id, then forwards the remaining arguments unchanged to
+ * doSyr2().  Whatever status doSyr2() reports is returned to the
+ * caller.
+ */
+clblasStatus
+clblasDsyr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    double alpha,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem A, size_t offa, size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    CLBlasKargs syr2Args;
+
+    memset(&syr2Args, 0, sizeof syr2Args);
+    syr2Args.pigFuncID = CLBLAS_SYR2;
+    syr2Args.alpha.argDouble = alpha;
+    syr2Args.dtype = TYPE_DOUBLE;
+
+#ifdef DEBUG_SYR2
+    printf("Dsyr2 called\n");
+#endif
+
+    return doSyr2(&syr2Args, order, uplo, N, X, offx, incx, Y, offy, incy, A, offa, lda,
+                  numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Single-precision SPR2 entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_FLOAT, alpha and the CLBLAS_SPR2
+ * function id, then calls doSyr2() with lda fixed to 0 and returns its
+ * status.
+ */
+clblasStatus
+clblasSspr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    float alpha,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem AP, size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    CLBlasKargs spr2Args;
+
+    memset(&spr2Args, 0, sizeof spr2Args);
+    spr2Args.pigFuncID = CLBLAS_SPR2;
+    spr2Args.alpha.argFloat = alpha;
+    spr2Args.dtype = TYPE_FLOAT;
+
+    return doSyr2(&spr2Args, order, uplo, N, X, offx, incx, Y, offy, incy, AP, offa, 0,
+                  numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision SPR2 entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_DOUBLE, alpha and the CLBLAS_SPR2
+ * function id, then calls doSyr2() with lda fixed to 0 and returns its
+ * status.
+ */
+clblasStatus
+clblasDspr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    double alpha,
+    const cl_mem X, size_t offx, int incx,
+    const cl_mem Y, size_t offy, int incy,
+    cl_mem AP, size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue* commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event* eventWaitList,
+    cl_event* events)
+{
+    CLBlasKargs spr2Args;
+
+    memset(&spr2Args, 0, sizeof spr2Args);
+    spr2Args.pigFuncID = CLBLAS_SPR2;
+    spr2Args.alpha.argDouble = alpha;
+    spr2Args.dtype = TYPE_DOUBLE;
+
+    return doSyr2(&spr2Args, order, uplo, N, X, offx, incx, Y, offy, incy, AP, offa, 0,
+                  numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events);
+}
+
+
diff --git a/src/library/blas/xsyr2k.c b/src/library/blas/xsyr2k.c
new file mode 100644
index 0000000..e99a617
--- /dev/null
+++ b/src/library/blas/xsyr2k.c
@@ -0,0 +1,250 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common worker behind the clblas?syr2k entry points.
+ *
+ * Returns clblasNotInitialized, clblasInvalidValue (no command queue, or a
+ * complex data type combined with clblasConjTrans) or the status of the
+ * first failing buffer/size check; otherwise the result of building and
+ * executing the solution sequence, cast to clblasStatus.
+ */
+clblasStatus
+doSyr2k(
+    CLBlasKargs *kargs,
+    clblasOrder order, clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+    if (numCommandQueues == 0 || commandQueues == NULL) {
+        return clblasInvalidValue;
+    }
+
+    /* Validate arguments; the first failing check decides the status. */
+    retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    if (isComplexType(kargs->dtype) && transAB == clblasConjTrans) {
+        return clblasInvalidValue;
+    }
+    retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, A, offA, lda, A_MAT_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, B, offB, ldb, B_MAT_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    /* C is checked untransposed: pass the enumerator, not a boolean literal. */
+    retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offC, ldc, C_MAT_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+
+    kargs->order = order;
+    kargs->transA = transAB;
+    kargs->transB = transAB;
+    kargs->uplo = uplo;
+    kargs->M = N;
+    kargs->N = N;
+    kargs->K = K;
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = B;
+    kargs->offBX = offB;
+    kargs->ldb.matrix = ldb;
+    kargs->C = C;
+    kargs->offCY = offC;
+    kargs->ldc.matrix = ldc;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_SYR2K, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+
+/*
+ * Single-precision SYR2K entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_FLOAT together with the alpha
+ * and beta scalars, and forwards all remaining arguments unchanged to
+ * doSyr2k().  Whatever status doSyr2k() reports is returned to the
+ * caller.
+ */
+clblasStatus
+clblasSsyr2k(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    cl_float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    cl_float beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs syr2kArgs;
+
+    memset(&syr2kArgs, 0, sizeof syr2kArgs);
+    syr2kArgs.dtype = TYPE_FLOAT;
+    syr2kArgs.alpha.argFloat = alpha;
+    syr2kArgs.beta.argFloat = beta;
+
+    return doSyr2k(&syr2kArgs, order, uplo, transAB, N, K,
+                   A, offA, lda, B, offB, ldb, C, offC, ldc,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision SYR2K entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_DOUBLE together with the alpha
+ * and beta scalars, and forwards all remaining arguments unchanged to
+ * doSyr2k().  Whatever status doSyr2k() reports is returned to the
+ * caller.
+ */
+clblasStatus
+clblasDsyr2k(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    cl_double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    cl_double beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs syr2kArgs;
+
+    memset(&syr2kArgs, 0, sizeof syr2kArgs);
+    syr2kArgs.dtype = TYPE_DOUBLE;
+    syr2kArgs.alpha.argDouble = alpha;
+    syr2kArgs.beta.argDouble = beta;
+
+    return doSyr2k(&syr2kArgs, order, uplo, transAB, N, K,
+                   A, offA, lda, B, offB, ldb, C, offC, ldc,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Single-precision complex SYR2K entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_COMPLEX_FLOAT together with the
+ * alpha and beta scalars, and forwards all remaining arguments unchanged
+ * to doSyr2k().  Whatever status doSyr2k() reports is returned to the
+ * caller.
+ */
+clblasStatus
+clblasCsyr2k(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    FloatComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs syr2kArgs;
+
+    memset(&syr2kArgs, 0, sizeof syr2kArgs);
+    syr2kArgs.dtype = TYPE_COMPLEX_FLOAT;
+    syr2kArgs.alpha.argFloatComplex = alpha;
+    syr2kArgs.beta.argFloatComplex = beta;
+
+    return doSyr2k(&syr2kArgs, order, uplo, transAB, N, K,
+                   A, offA, lda, B, offB, ldb, C, offC, ldc,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision complex SYR2K entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_COMPLEX_DOUBLE together with the
+ * alpha and beta scalars, and forwards all remaining arguments unchanged
+ * to doSyr2k().  Whatever status doSyr2k() reports is returned to the
+ * caller.
+ */
+clblasStatus
+clblasZsyr2k(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    DoubleComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs syr2kArgs;
+
+    memset(&syr2kArgs, 0, sizeof syr2kArgs);
+    syr2kArgs.dtype = TYPE_COMPLEX_DOUBLE;
+    syr2kArgs.alpha.argDoubleComplex = alpha;
+    syr2kArgs.beta.argDoubleComplex = beta;
+
+    return doSyr2k(&syr2kArgs, order, uplo, transAB, N, K,
+                   A, offA, lda, B, offB, ldb, C, offC, ldc,
+                   numCommandQueues, commandQueues,
+                   numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xsyrk.c b/src/library/blas/xsyrk.c
new file mode 100644
index 0000000..4157d5e
--- /dev/null
+++ b/src/library/blas/xsyrk.c
@@ -0,0 +1,233 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+#include <devinfo.h>
+
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common worker behind the clblas?syrk entry points.
+ *
+ * Returns clblasNotInitialized, clblasInvalidValue (no command queue, or a
+ * complex data type with clblasConjTrans) or the status of the first failing
+ * check; otherwise the cast result of building/executing the solution sequence.
+ */
+clblasStatus
+doSyrk(
+    CLBlasKargs *kargs,
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+    if (numCommandQueues == 0 || commandQueues == NULL) {
+        return clblasInvalidValue;
+    }
+
+    /* Validate arguments; the first failing check decides the status. */
+    retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    if (isComplexType(kargs->dtype) && transA == clblasConjTrans) {
+        return clblasInvalidValue;
+    }
+    retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+    /* C is checked untransposed: pass the enumerator, not a boolean literal. */
+    retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offC, ldc, C_MAT_ERRSET);
+    if (retCode) {
+        return retCode;
+    }
+
+    kargs->order = order;
+    kargs->transA = transA;
+    kargs->transB = transA;
+    kargs->uplo = uplo;
+    kargs->M = N;
+    kargs->N = N;
+    kargs->K = K;
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = A;               /* the B slot is filled with A as well */
+    kargs->offBX = offA;
+    kargs->ldb.matrix = lda;
+    kargs->C = C;
+    kargs->offCY = offC;
+    kargs->ldc.matrix = ldc;
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_SYRK, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    freeSolutionSeq(&seq);
+    return (clblasStatus)err;
+}
+
+/*
+ * Single-precision SYRK entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_FLOAT together with the alpha
+ * and beta scalars, and forwards all remaining arguments unchanged to
+ * doSyrk(), whose status is returned.
+ */
+clblasStatus
+clblasSsyrk(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    cl_float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_float beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs syrkArgs;
+
+    memset(&syrkArgs, 0, sizeof syrkArgs);
+    syrkArgs.dtype = TYPE_FLOAT;
+    syrkArgs.alpha.argFloat = alpha;
+    syrkArgs.beta.argFloat = beta;
+
+    return doSyrk(&syrkArgs, order, uplo, transA, N, K,
+                  A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision SYRK entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_DOUBLE together with the alpha
+ * and beta scalars, and forwards all remaining arguments unchanged to
+ * doSyrk(), whose status is returned.
+ */
+clblasStatus
+clblasDsyrk(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    cl_double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_double beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs syrkArgs;
+
+    memset(&syrkArgs, 0, sizeof syrkArgs);
+    syrkArgs.dtype = TYPE_DOUBLE;
+    syrkArgs.alpha.argDouble = alpha;
+    syrkArgs.beta.argDouble = beta;
+
+    return doSyrk(&syrkArgs, order, uplo, transA, N, K,
+                  A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Single-precision complex SYRK entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_COMPLEX_FLOAT together with the
+ * alpha and beta scalars, and forwards all remaining arguments unchanged
+ * to doSyrk(), whose status is returned.
+ */
+clblasStatus
+clblasCsyrk(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    FloatComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs syrkArgs;
+
+    memset(&syrkArgs, 0, sizeof syrkArgs);
+    syrkArgs.dtype = TYPE_COMPLEX_FLOAT;
+    syrkArgs.alpha.argFloatComplex = alpha;
+    syrkArgs.beta.argFloatComplex = beta;
+
+    return doSyrk(&syrkArgs, order, uplo, transA, N, K,
+                  A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision complex SYRK entry point.
+ *
+ * Zero-fills a CLBlasKargs, records TYPE_COMPLEX_DOUBLE together with the
+ * alpha and beta scalars, and forwards all remaining arguments unchanged
+ * to doSyrk(), whose status is returned.
+ */
+clblasStatus
+clblasZsyrk(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    DoubleComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs syrkArgs;
+
+    memset(&syrkArgs, 0, sizeof syrkArgs);
+    syrkArgs.dtype = TYPE_COMPLEX_DOUBLE;
+    syrkArgs.alpha.argDoubleComplex = alpha;
+    syrkArgs.beta.argDoubleComplex = beta;
+
+    return doSyrk(&syrkArgs, order, uplo, transA, N, K,
+                  A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xtbmv.c b/src/library/blas/xtbmv.c
new file mode 100644
index 0000000..8f59bc9
--- /dev/null
+++ b/src/library/blas/xtbmv.c
@@ -0,0 +1,297 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+//#define DEBUG_TBMV
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared driver behind the clblas{S,D,C,Z}tbmv entry points.
+ *
+ * The current contents of x are first copied into the scratch buffer y; the
+ * product is then issued as a CLBLAS_GBMV solution sequence that reads A and
+ * y and writes into x (x = A * y).  The band width K becomes KU for an upper
+ * triangular matrix and KL for a lower one.
+ *
+ * kargs must already carry dtype (and pigFuncID); the remaining fields are
+ * filled in here.  Only commandQueues[0] is used.
+ *
+ * Returns clblasNotInitialized, the status of a failed argument check,
+ * clblasInvalidValue / clblasInvalidEventWaitList for bad queue or wait-list
+ * arguments, clblasOutOfHostMemory, or the status produced by the buffer
+ * copy / solution sequence.
+ */
+clblasStatus
+doTbmv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem x,
+    size_t offx,
+    int incx,
+    cl_mem y, // Scratch Buffer
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq;
+    size_t sizeOfVector;
+    cl_event *newEventWaitList;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /*
+     * Validate arguments.  Diagnostics are compiled in only with DEBUG_TBMV
+     * so that a library call never writes to stdout in a normal build.
+     */
+
+    retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (retCode != clblasSuccess) {
+        #ifdef DEBUG_TBMV
+        printf("Invalid mem object..\n");
+        #endif
+        return retCode;
+    }
+
+    retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET);
+    if (retCode != clblasSuccess) {
+        #ifdef DEBUG_TBMV
+        printf("Invalid Size for A\n");
+        #endif
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET);
+    if (retCode != clblasSuccess) {
+        #ifdef DEBUG_TBMV
+        printf("Invalid Size for X\n");
+        #endif
+        return retCode;
+    }
+    /* The scratch buffer is used from offset 0 with the same stride as x. */
+    retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET);
+    if (retCode != clblasSuccess) {
+        #ifdef DEBUG_TBMV
+        printf("Invalid Size for scratch vector\n");
+        #endif
+        return retCode;
+    }
+
+    #ifdef DEBUG_TBMV
+    printf("DoTbmv being called...\n");
+    #endif
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0))
+    {
+        return clblasInvalidValue;
+    }
+    numCommandQueues = 1;
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL))
+    {
+        return clblasInvalidEventWaitList;
+    }
+
+    /*
+     * Extended wait list: the caller's events followed by one extra slot for
+     * the event of the x -> y copy.  The size is computed in size_t so the
+     * "+ 1" cannot wrap in cl_uint arithmetic.
+     */
+    newEventWaitList = malloc(((size_t)numEventsInWaitList + 1) * sizeof(cl_event));
+    if (newEventWaitList == NULL)
+    {
+        return clblasOutOfHostMemory;
+    }
+    if (numEventsInWaitList != 0)
+    {
+        memcpy(newEventWaitList, eventWaitList, numEventsInWaitList * sizeof(cl_event));
+    }
+
+    /*
+     * ASSUMPTION:
+     * doTBMV assumes "commandQueue" of 0. The same is reflected in
+     * "makeSolutionSeq" as well. If either of them changes in future,
+     * this code needs to be revisited.
+     */
+    sizeOfVector = (1 + (N-1)*abs(incx)) * dtypeSize(kargs->dtype);
+    err = clEnqueueCopyBuffer(commandQueues[0], x, y, offx*dtypeSize(kargs->dtype), 0, sizeOfVector,
+                              numEventsInWaitList, eventWaitList, &newEventWaitList[numEventsInWaitList]);
+    if (err != CL_SUCCESS)
+    {
+        /* No event was produced, so only the array itself has to go. */
+        free(newEventWaitList);
+        return (clblasStatus)err;
+    }
+
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->transA = trans;
+    kargs->diag = diag;
+    kargs->M = N;
+    kargs->N = N;
+    if (uplo == clblasUpper)
+    {
+        kargs->KL = 0;
+        kargs->KU = K;
+    }
+    else
+    {
+        kargs->KL = K;
+        kargs->KU = 0;
+    }
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->B = y;       // Now it becomes x = A * y
+    kargs->ldb.vector = incx;
+    kargs->C = x;
+    kargs->ldc.vector = incx;
+    kargs->offBX = 0;           // Not used by assignKargs(); Just for clarity
+    kargs->offCY = offx;
+    kargs->offa = offa;
+    kargs->offA = offa;
+
+    #ifdef DEBUG_TBMV
+    printf("Calling makeSolutionSeq : TBMV\n");
+    #endif
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList+1, newEventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    freeSolutionSeq(&seq);
+
+    /*
+     * The copy event only existed to order the GBMV step after the copy.
+     * Drop our reference; the OpenCL runtime keeps the event alive for as
+     * long as an enqueued command still waits on it.
+     */
+    clReleaseEvent(newEventWaitList[numEventsInWaitList]);
+    free(newEventWaitList);
+    return (clblasStatus)err;
+}
+
+/*
+ * Single-precision real TBMV entry point.
+ *
+ * Tags a zeroed CLBlasKargs with TYPE_FLOAT / CLBLAS_TBMV and hands all
+ * arguments, including the caller-supplied scratch buffer, to doTbmv().
+ */
+clblasStatus
+clblasStbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs tbmvArgs;
+    clblasStatus status;
+
+#ifdef DEBUG_TBMV
+    printf("STBMV Called\n");
+#endif
+
+    memset(&tbmvArgs, 0, sizeof(tbmvArgs));
+    tbmvArgs.pigFuncID = CLBLAS_TBMV;
+    tbmvArgs.dtype = TYPE_FLOAT;
+
+    status = doTbmv(&tbmvArgs, order, uplo, trans, diag, N, K,
+                    A, offa, lda,
+                    X, offx, incx,
+                    scratchBuff,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * Double-precision real TBMV entry point.
+ *
+ * Tags a zeroed CLBlasKargs with TYPE_DOUBLE / CLBLAS_TBMV and hands all
+ * arguments, including the caller-supplied scratch buffer, to doTbmv().
+ */
+clblasStatus
+clblasDtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs tbmvArgs;
+    clblasStatus status;
+
+#ifdef DEBUG_TBMV
+    printf("DTBMV called\n");
+#endif
+
+    memset(&tbmvArgs, 0, sizeof(tbmvArgs));
+    tbmvArgs.pigFuncID = CLBLAS_TBMV;
+    tbmvArgs.dtype = TYPE_DOUBLE;
+
+    status = doTbmv(&tbmvArgs, order, uplo, trans, diag, N, K,
+                    A, offa, lda,
+                    X, offx, incx,
+                    scratchBuff,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+
+/*
+ * Single-precision complex TBMV entry point.
+ *
+ * Tags a zeroed CLBlasKargs with TYPE_COMPLEX_FLOAT / CLBLAS_TBMV and hands
+ * all arguments, including the caller-supplied scratch buffer, to doTbmv().
+ */
+clblasStatus
+clblasCtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs tbmvArgs;
+    clblasStatus status;
+
+#ifdef DEBUG_TBMV
+    printf("CTBMV called\n");
+#endif
+
+    memset(&tbmvArgs, 0, sizeof(tbmvArgs));
+    tbmvArgs.pigFuncID = CLBLAS_TBMV;
+    tbmvArgs.dtype = TYPE_COMPLEX_FLOAT;
+
+    status = doTbmv(&tbmvArgs, order, uplo, trans, diag, N, K,
+                    A, offa, lda,
+                    X, offx, incx,
+                    scratchBuff,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * Double-precision complex TBMV entry point.
+ *
+ * Tags a zeroed CLBlasKargs with TYPE_COMPLEX_DOUBLE / CLBLAS_TBMV and hands
+ * all arguments, including the caller-supplied scratch buffer, to doTbmv().
+ */
+clblasStatus
+clblasZtbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs tbmvArgs;
+    clblasStatus status;
+
+#ifdef DEBUG_TBMV
+    printf("ZTBMV called\n");
+#endif
+
+    memset(&tbmvArgs, 0, sizeof(tbmvArgs));
+    tbmvArgs.pigFuncID = CLBLAS_TBMV;
+    tbmvArgs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    status = doTbmv(&tbmvArgs, order, uplo, trans, diag, N, K,
+                    A, offa, lda,
+                    X, offx, incx,
+                    scratchBuff,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
diff --git a/src/library/blas/xtbsv.c b/src/library/blas/xtbsv.c
new file mode 100644
index 0000000..a24d74c
--- /dev/null
+++ b/src/library/blas/xtbsv.c
@@ -0,0 +1,824 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+//#define DEBUG_TBSV
+/*
+ * Returns the triangle selector to use for the stored layout: kargs->uplo
+ * unchanged for row-major order, and the opposite triangle for any other
+ * order.
+ */
+static clblasUplo
+getUpLo(CLBlasKargs *kargs)
+{
+    clblasUplo effective = kargs->uplo;
+
+    if (kargs->order != clblasRowMajor)
+    {
+        /* Not row-major: flip upper <-> lower. */
+        effective = (kargs->uplo == clblasUpper) ? clblasLower : clblasUpper;
+    }
+    return effective;
+}
+/*
+ * Runs the blocked TBSV solve for the layouts that orchestrateTBSV() routes
+ * here: row-major without transpose, or column-major with transpose.
+ *
+ * The first step of trtriSeq solves one diagonal block of at most TARGET_ROWS
+ * rows.  The first step of gbmvSeq is then re-parameterised (M, N, KL, KU,
+ * offsets, row range) and executed before the next diagonal block is solved;
+ * presumably this folds the freshly solved block into the rows that still
+ * have to be solved.  For an effectively-upper matrix (see getUpLo()) the
+ * diagonal blocks are walked from the last rows towards row 0, otherwise from
+ * row 0 downwards.
+ *
+ * Ordering is enforced through two scratch event arrays: every GBMV run
+ * waits on the previous triangle event, every triangle run waits on the GBMV
+ * event of the same iteration (or on the previous triangle event when GBMV
+ * was skipped).  The very first triangle run waits on the caller's wait
+ * list and the last one reports into the caller's "events".
+ *
+ * For the triangle step, args.endRow is written as an inclusive row index in
+ * this routine (N-1 denotes the last row).
+ *
+ * Returns clblasOutOfHostMemory if the event arrays cannot be allocated;
+ * otherwise the status of the first executeSolutionSeq() call that failed,
+ * or of the last call when all succeeded.
+ *
+ * NOTE(review): the cl_event handles written into triangleEventArray and
+ * rectangleEventArray are never passed to clReleaseEvent() before the arrays
+ * are freed; whether that leaks depends on how executeSolutionSeq() hands
+ * events back -- confirm.
+ */
+static clblasStatus
+orchestrateNonTransposeTBSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gbmvSeq, cl_uint numEventsInWaitList,
+                const cl_event *eventWaitList, cl_event *events)
+{
+    clblasStatus err;
+    SolutionStep *trtri, *gbmv;
+    size_t nLoops, i;
+    cl_event *triangleEventArray, *rectangleEventArray;
+    size_t TARGET_ROWS;
+    bool gbmvExecute;
+    size_t temp;
+
+    ListNode *f = listNodeFirst(trtriSeq);
+    trtri = container_of(f, node, SolutionStep);
+    f = listNodeFirst(gbmvSeq);
+    gbmv = container_of(f, node, SolutionStep);
+
+    TARGET_ROWS = trtri->subdims->y;
+    TARGET_ROWS = (TARGET_ROWS > kargs->K) ? kargs->K : TARGET_ROWS; /* cap the block height at the band width K */
+    TARGET_ROWS = (TARGET_ROWS == 0) ? 1 : TARGET_ROWS; /* ...but never 0 rows (K may be 0) */
+
+    trtri->numEventsInWaitList = numEventsInWaitList;
+    trtri->eventWaitList = eventWaitList;
+
+    if (kargs->N <= TARGET_ROWS) /* the whole system fits into a single diagonal block */
+    {
+        trtri->event = events;
+        trtri->args.startRow = 0;
+        trtri->args.endRow = (cl_int)((kargs->N)-1);
+        err = executeSolutionSeq(trtriSeq);
+        return err;
+    }
+
+    //
+    // Number of diagonal blocks to solve: ceil(N / TARGET_ROWS)
+    //
+    nLoops = ((kargs->N) / TARGET_ROWS);
+    if ((kargs->N % TARGET_ROWS))
+    {
+        nLoops++;
+    }
+    //
+    // Allocate Event Arrays to order the orchestration
+    //
+    triangleEventArray = malloc(nLoops*sizeof(cl_event));
+    rectangleEventArray = malloc(nLoops*sizeof(cl_event));
+    if ((triangleEventArray == NULL) || (rectangleEventArray == NULL))
+    {
+        if (triangleEventArray)
+        {
+            free (triangleEventArray);
+        }
+        if (rectangleEventArray)
+        {
+            free (rectangleEventArray);
+        }
+        return clblasOutOfHostMemory;
+    }
+    //
+    //  Solve 1 Triangle using Triangle Kernel Followed by Rectangle Kernels
+    //
+    trtri->event = &triangleEventArray[0];
+    if (getUpLo(kargs) == clblasUpper) /* upper: begin with the last TARGET_ROWS rows */
+    {
+        trtri->args.startRow = (cl_int)((kargs->N) - TARGET_ROWS);
+        trtri->args.endRow = (cl_int)((kargs->N)-1);
+    } else {
+        trtri->args.startRow = 0;
+        trtri->args.endRow = (cl_int)(TARGET_ROWS-1);
+    }
+    err = executeSolutionSeq(trtriSeq);
+
+/*#define GET_OFFA(offa, lda, r, c, k)\
+if(r < k) \
+offa = r * lda + col + k - r;\
+else if (r == k) \
+offa = r * lda + col;\
+else\
+offa = r * lda + col - (r - k);
+*/
+#define GET_OFFA_LOWER(offa, lda, row, col, kl) (offa) = ((row) * (lda)) + (col) + (kl) - (row);
+#define GET_OFFA_UPPER(offa, lda, row, col) (offa) = ((row) * (lda)) + (col) - (row);
+
+if (err == CL_SUCCESS)
+    {
+        //
+        // Solve the Rectangles one by one
+        //
+        //nLoops = 1;
+        for(i=1; i<nLoops; i++)
+        {
+            #ifdef DEBUG_TBSV
+                printf("Calling gbmv-");
+            #endif
+            gbmv->numEventsInWaitList = 1;
+            gbmv->eventWaitList = &triangleEventArray[i-1];
+            gbmv->event = &rectangleEventArray[i-1];
+
+            if (getUpLo(kargs) == clblasUpper)
+            {
+                gbmv->args.N = TARGET_ROWS;
+                gbmv->args.M = ((trtri->args.startRow) >= (int)(kargs->K)) ? kargs->K : (size_t)trtri->args.startRow;
+                gbmv->args.startRow = (trtri->args.startRow - gbmv->args.M);
+                gbmv->args.endRow = (trtri->args.startRow - 1);
+                gbmv->args.KU = (trtri->args.startRow >= (int)(kargs->K)) ? 0 : (kargs->K - trtri->args.startRow);
+                gbmv->args.KL = gbmv->args.M - 1;
+
+                GET_OFFA_UPPER(gbmv->args.offA, kargs->lda.matrix, gbmv->args.startRow, trtri->args.startRow);
+                gbmv->args.offA -= gbmv->args.KL;
+                gbmv->args.offA += kargs->offA;
+                gbmv->args.offa = gbmv->args.offA;
+
+                if(kargs->ldb.vector < 0)
+                {
+                    gbmv->args.offBX = kargs->offBX + ((i-1) * TARGET_ROWS) * abs(kargs->ldb.vector);
+                    gbmv->args.offCY = kargs->offBX + ((i * TARGET_ROWS) ) * abs(kargs->ldb.vector);
+                }
+                else
+                {
+                    gbmv->args.offBX = kargs->offBX + (trtri->args.startRow) * kargs->ldb.vector;
+                    gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector;
+                }
+
+            } else {
+                gbmv->args.startRow = (cl_int)((i)*TARGET_ROWS);
+                gbmv->args.endRow   = (cl_int)((((TARGET_ROWS*i) + kargs->K) > kargs->N) ? kargs->N : (TARGET_ROWS*i + kargs->K));
+                gbmv->args.N = TARGET_ROWS;
+                gbmv->args.M = (gbmv->args.endRow - gbmv->args.startRow);
+                gbmv->args.KU = TARGET_ROWS - 1;
+                gbmv->args.KL = ((trtri->args.startRow + kargs->K) < kargs->N) ? (kargs->K - TARGET_ROWS) : (kargs->N - trtri->args.startRow - 1 - TARGET_ROWS);
+
+                GET_OFFA_LOWER(gbmv->args.offA, kargs->lda.matrix, gbmv->args.startRow, trtri->args.startRow, kargs->K);
+                gbmv->args.offA -= gbmv->args.KL;
+                gbmv->args.offA += kargs->offA;
+                gbmv->args.offa = gbmv->args.offA;
+                if(kargs->ldb.vector < 0)
+                {
+                    gbmv->args.offBX = kargs->offBX + (kargs->N - gbmv->args.startRow) * abs(kargs->ldb.vector);
+                    gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.startRow + gbmv->args.M) ) * abs(kargs->ldb.vector);
+                }
+                else
+                {
+                    gbmv->args.offBX = kargs->offBX + (trtri->args.startRow) * kargs->ldb.vector;
+                    gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector;
+                }
+
+            }
+
+            #ifdef DEBUG_TBSV
+            printf("GBMV ITER %d, startRow %d, endRow %d, N %d, M %d , KU %d, KL %d, offBX %d, offA %d, offCY %d\n", i-1, gbmv->args.startRow, gbmv->args.endRow, \
+                                            gbmv->args.N, gbmv->args.M, gbmv->args.KU, gbmv->args.KL, gbmv->args.offBX, gbmv->args.offA, gbmv->args.offCY);
+            #endif
+            // This is required when KL or KU is 0 for TBSV.
+            gbmvExecute = (gbmv->args.M != 0); /* an empty rectangle means there is nothing to run */
+            if(gbmvExecute)
+            {
+                if(kargs->order == clblasColumnMajor) //GBMV Swaps it back while assigning
+                {
+                    temp = gbmv->args.N;
+                    gbmv->args.N = gbmv->args.M;
+                    gbmv->args.M = temp;
+                    temp = gbmv->args.KU;
+                    gbmv->args.KU = gbmv->args.KL;
+                    gbmv->args.KL = temp;
+                }
+                err = executeSolutionSeq(gbmvSeq);
+            }
+
+            if (err != CL_SUCCESS)
+            {
+                printf("TBSV: WARNING: GBMV LOOP: Breaking after %d iterations  !!!\n", (int)i);
+                break;
+            }
+
+            #ifdef DEBUG_TBSV
+                printf("Calling TBSV\n");
+            #endif
+            if (getUpLo(kargs) == clblasUpper) /* move the diagonal block up by TARGET_ROWS, clamped at row 0 */
+            {
+                trtri->args.startRow = (cl_int)(((int)trtri->args.startRow - (int)TARGET_ROWS) >= 0) ? (trtri->args.startRow - TARGET_ROWS) : 0;
+                trtri->args.endRow = (cl_int)(gbmv->args.endRow);
+            } else {
+                trtri->args.startRow = gbmv->args.startRow;
+                trtri->args.endRow = (cl_int)(((gbmv->args.startRow + TARGET_ROWS-1) < kargs->N) ? (gbmv->args.startRow + TARGET_ROWS-1) : kargs->N-1);
+            }
+            #ifdef DEBUG_TBSV
+            printf("TRSV ITER %d, startRow %d , endRow %d\n", i, trtri->args.startRow, trtri->args.endRow);
+            #endif
+            trtri->event = &triangleEventArray[i];
+            if (i == (nLoops-1))
+            {
+                //
+                // TRTRI's last iteration must be tied to the "event" that the API
+                // user will choose to wait on.
+                //
+                trtri->event = events;
+            }
+            //
+            // Inside the loop the triangle step always waits on exactly one event:
+            // the rectangle (GBMV) event of this iteration, or the previous
+            // triangle event when GBMV was skipped.
+            //
+
+            trtri->numEventsInWaitList =1;
+            if(gbmvExecute)
+            {
+                trtri->eventWaitList = &rectangleEventArray[i-1];
+            }
+            else //GBMV is not executed when KL or KU of the band in TBSV is 0.
+            {
+                trtri->eventWaitList = &triangleEventArray[i-1];
+            }
+
+            err = executeSolutionSeq(trtriSeq);
+            if (err != CL_SUCCESS)
+            {
+                printf("TBSV: WARNING: TRSV LOOP: Breaking after %d iterations  !!!\n", (int)i);
+                break;
+            }
+
+        }
+    }
+
+    free(triangleEventArray);
+    free(rectangleEventArray);
+    return err;
+}
+/*
+ * Runs the blocked TBSV solve for the layouts that orchestrateTBSV() does
+ * not send to orchestrateNonTransposeTBSV(): row-major with transpose, or
+ * column-major without transpose.
+ *
+ * The structure is: solve one diagonal block with the first step of
+ * trtriSeq, re-parameterise and run the first step of gbmvSeq, then solve
+ * the next diagonal block, with every run chained to its predecessor through
+ * the two scratch event arrays.  The first triangle run waits on the
+ * caller's wait list; the last one reports into the caller's "events".
+ *
+ * In this routine args.startRow/args.endRow describe a half-open row range
+ * (endRow is one past the last row, e.g. N).  For an effectively-upper
+ * matrix (see getUpLo()) the diagonal blocks are walked from row 0 upwards,
+ * otherwise from the last rows towards row 0.  GBMV is skipped for an
+ * iteration whose computed gbmv->args.N is 0.
+ *
+ * The helper macros max, min and SUBDIAGS are defined inside the function
+ * body.  NOTE(review): SUBDIAGS expands to an unparenthesised conditional
+ * followed by a semicolon, so it is only safe as the complete right-hand
+ * side of an assignment, which is how it is used here.
+ *
+ * Returns clblasOutOfHostMemory if the event arrays cannot be allocated;
+ * otherwise the status of the first executeSolutionSeq() call that failed,
+ * or of the last call when all succeeded.
+ *
+ * NOTE(review): the cl_event handles written into triangleEventArray and
+ * rectangleEventArray are never passed to clReleaseEvent() before the arrays
+ * are freed; whether that leaks depends on how executeSolutionSeq() hands
+ * events back -- confirm.
+ */
+static clblasStatus
+orchestrateTransposeTBSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gbmvSeq, cl_uint numEventsInWaitList,
+                const cl_event *eventWaitList, cl_event *events)
+{
+    clblasStatus err;
+    SolutionStep *trtri, *gbmv;
+    size_t nLoops, i;
+    cl_event *triangleEventArray, *rectangleEventArray;
+    size_t TARGET_ROWS;
+    bool gbmvExecute;
+    size_t temp;
+    int TR_ER, N_SR, SD_ER;
+
+    ListNode *f = listNodeFirst(trtriSeq);
+    trtri = container_of(f, node, SolutionStep);
+    f = listNodeFirst(gbmvSeq);
+    gbmv = container_of(f, node, SolutionStep);
+
+    TARGET_ROWS = trtri->subdims->y;
+    TARGET_ROWS = (TARGET_ROWS > kargs->K) ? kargs->K : TARGET_ROWS; /* cap the block height at the band width K */
+    TARGET_ROWS = (TARGET_ROWS == 0) ? 1 : TARGET_ROWS; /* ...but never 0 rows (K may be 0) */
+
+    trtri->numEventsInWaitList = numEventsInWaitList;
+    trtri->eventWaitList = eventWaitList;
+
+    if (kargs->N <= TARGET_ROWS) /* the whole system fits into a single diagonal block */
+    {
+        trtri->event = events;
+        trtri->args.startRow = 0;
+        trtri->args.endRow = (cl_int)((kargs->N)); /* one past the last row */
+        err = executeSolutionSeq(trtriSeq);
+        return err;
+    }
+
+    //
+    // Number of diagonal blocks to solve: ceil(N / TARGET_ROWS)
+    //
+    nLoops = ((kargs->N) / TARGET_ROWS);
+    if ((kargs->N % TARGET_ROWS))
+    {
+        nLoops++;
+    }
+    //
+    // Allocate Event Arrays to order the orchestration
+    //
+    triangleEventArray = malloc(nLoops*sizeof(cl_event));
+    rectangleEventArray = malloc(nLoops*sizeof(cl_event));
+    if ((triangleEventArray == NULL) || (rectangleEventArray == NULL))
+    {
+        if (triangleEventArray)
+        {
+            free (triangleEventArray);
+        }
+        if (rectangleEventArray)
+        {
+            free (rectangleEventArray);
+        }
+        return clblasOutOfHostMemory;
+    }
+    //
+    //  Solve 1 Triangle using Triangle Kernel Followed by Rectangle Kernels
+    //
+    trtri->event = &triangleEventArray[0];
+    if (getUpLo(kargs) == clblasUpper) /* upper: begin with the first TARGET_ROWS rows */
+    {
+        trtri->args.startRow = 0;
+        trtri->args.endRow = (cl_int)(TARGET_ROWS);
+    } else {
+        trtri->args.startRow = (cl_int)((kargs->N) - TARGET_ROWS);
+        trtri->args.endRow = (cl_int)((kargs->N));
+    }
+    err = executeSolutionSeq(trtriSeq);
+
+/*#define GET_OFFA(offa, lda, r, c, k)\
+if(r < k) \
+offa = r * lda + col + k - r;\
+else if (r == k) \
+offa = r * lda + col;\
+else\
+offa = r * lda + col - (r - k);
+*/
+#define GET_OFFA_LOWER(offa, lda, row, col, kl) (offa) = ((row) * (lda)) + (col) + (kl) - (row);
+#define GET_OFFA_UPPER(offa, lda, row, col) (offa) = ((row) * (lda)) + (col) - (row);
+
+    if (err == CL_SUCCESS)
+    {
+        //
+        // Solve the Rectangles one by one
+        //
+        //nLoops = 1;
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+
+
+        for(i=1; i<nLoops; i++)
+        {
+            #ifdef DEBUG_TBSV
+                printf("Calling gbmv-");
+            #endif
+            gbmv->numEventsInWaitList = 1;
+            gbmv->eventWaitList = &triangleEventArray[i-1];
+            gbmv->event = &rectangleEventArray[i-1];
+
+            if (getUpLo(kargs) == clblasUpper)
+            {
+                TR_ER = trtri->args.endRow - 1; /* last row covered by the triangle just solved */
+                gbmv->args.N = max(0, min(((int)kargs->K), ((int)kargs->N - 1 - TR_ER)));
+                gbmv->args.M = TARGET_ROWS;
+                gbmv->args.startRow = (trtri->args.startRow);
+                gbmv->args.endRow = trtri->args.endRow;
+                N_SR = max(0, min(((int)kargs->K), ((int)kargs->N - 1 - (int)trtri->args.startRow)));
+                gbmv->args.KU = N_SR - TARGET_ROWS;
+                gbmv->args.KL = gbmv->args.M - 1;
+
+                GET_OFFA_UPPER(gbmv->args.offA, kargs->lda.matrix, gbmv->args.startRow, (gbmv->args.endRow));
+                gbmv->args.offA -= gbmv->args.KL;
+                gbmv->args.offA += kargs->offA;
+                gbmv->args.offa = gbmv->args.offA;
+
+                if(kargs->ldb.vector < 0)
+                {
+                    gbmv->args.offBX = kargs->offBX + (kargs->N - (gbmv->args.endRow)) * abs(kargs->ldb.vector);
+                    gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.endRow + gbmv->args.N) ) * abs(kargs->ldb.vector);
+                }
+                else
+                {
+                    gbmv->args.offBX = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector;
+                    gbmv->args.offCY = kargs->offBX + (gbmv->args.endRow) * kargs->ldb.vector;
+                }
+
+
+            } else {
+
+#define SUBDIAGS(r, k) ((r) <= (k)) ? (r) : (k);
+                gbmv->args.startRow = trtri->args.startRow;
+                gbmv->args.endRow   = trtri->args.endRow;
+
+                gbmv->args.N = SUBDIAGS((int)trtri->args.startRow, (int)kargs->K);
+                gbmv->args.M = TARGET_ROWS;
+                gbmv->args.KU = gbmv->args.N - 1;
+                SD_ER = SUBDIAGS((int)(trtri->args.endRow - 1), (int)kargs->K);
+                gbmv->args.KL = SD_ER - gbmv->args.N;
+
+                GET_OFFA_LOWER(gbmv->args.offA, kargs->lda.matrix, gbmv->args.startRow, (gbmv->args.startRow - gbmv->args.N), kargs->K);
+                gbmv->args.offA -= gbmv->args.KL;
+                gbmv->args.offA += kargs->offA;
+                gbmv->args.offa = gbmv->args.offA;
+                if(kargs->ldb.vector < 0)
+                {
+                    gbmv->args.offBX = kargs->offBX + (kargs->N - gbmv->args.endRow) * abs(kargs->ldb.vector);
+                    gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.startRow) ) * abs(kargs->ldb.vector);
+                }
+                else
+                {
+                    gbmv->args.offBX = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector;
+                    gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow - gbmv->args.N) * kargs->ldb.vector;
+                }
+
+            }
+            #ifdef DEBUG_TBSV
+            printf("GBMV ITER %d, startRow %d, endRow %d, N %d, M %d , KU %d, KL %d, offBX %d, offA %d, offCY %d\n", i-1, gbmv->args.startRow, gbmv->args.endRow, \
+                                            gbmv->args.N, gbmv->args.M, gbmv->args.KU, gbmv->args.KL, gbmv->args.offBX, gbmv->args.offA, gbmv->args.offCY);
+            #endif
+            // This is required when KL or KU is 0 for TBSV.
+            gbmvExecute = (gbmv->args.N != 0); /* an empty rectangle means there is nothing to run */
+            if(gbmvExecute)
+            {
+                if(kargs->order == clblasColumnMajor) //GBMV Swaps it back while assigning
+                {
+                    temp = gbmv->args.N;
+                    gbmv->args.N = gbmv->args.M;
+                    gbmv->args.M = temp;
+                    temp = gbmv->args.KU;
+                    gbmv->args.KU = gbmv->args.KL;
+                    gbmv->args.KL = temp;
+                }
+                err = executeSolutionSeq(gbmvSeq);
+            }
+
+            if (err != CL_SUCCESS)
+            {
+                printf("TBSV: WARNING: GBMV LOOP: Breaking after %d iterations  !!!\n", (int)i);
+                break;
+            }
+
+            #ifdef DEBUG_TBSV
+                printf("Calling TBSV\n");
+            #endif
+            if (getUpLo(kargs) == clblasUpper) /* advance the half-open row range by TARGET_ROWS, clamped at N */
+            {
+                trtri->args.startRow = (cl_int)(trtri->args.endRow);
+                trtri->args.endRow = (cl_int)(((int)trtri->args.endRow + (int)TARGET_ROWS) <= (int)kargs->N) ? (trtri->args.endRow + TARGET_ROWS) : kargs->N;
+            } else {
+                trtri->args.endRow = trtri->args.startRow;
+                trtri->args.startRow = (cl_int)((((int)trtri->args.startRow - (int)TARGET_ROWS) > 0) ? (trtri->args.startRow - TARGET_ROWS) : 0);
+            }
+            #ifdef DEBUG_TBSV
+            printf("TRSV ITER %d, startRow %d , endRow %d\n", i, trtri->args.startRow, trtri->args.endRow);
+            #endif
+            trtri->event = &triangleEventArray[i];
+            if (i == (nLoops-1))
+            {
+                //
+                // TRTRI's last iteration must be tied to the "event" that the API
+                // user will choose to wait on.
+                //
+                trtri->event = events;
+            }
+            //
+            // Inside the loop the triangle step always waits on exactly one event:
+            // the rectangle (GBMV) event of this iteration, or the previous
+            // triangle event when GBMV was skipped.
+            //
+
+            trtri->numEventsInWaitList =1;
+            if(gbmvExecute)
+            {
+                trtri->eventWaitList = &rectangleEventArray[i-1];
+            }
+            else //GBMV is not executed when KL or KU of the band in TBSV is 0.
+            {
+                trtri->eventWaitList = &triangleEventArray[i-1];
+            }
+
+            err = executeSolutionSeq(trtriSeq);
+            if (err != CL_SUCCESS)
+            {
+                printf("TBSV: WARNING: TRSV LOOP: Breaking after %d iterations  !!!\n", (int)i);
+                break;
+            }
+
+        }
+    }
+
+    free(triangleEventArray);
+    free(rectangleEventArray);
+    return err;
+}
+
+/*
+ * Picks the TBSV orchestration routine that matches the storage layout.
+ *
+ * Row-major without transpose and column-major with transpose go to
+ * orchestrateNonTransposeTBSV(); every other order/transpose combination
+ * goes to orchestrateTransposeTBSV().  All arguments are passed through
+ * unchanged and the chosen routine's status is returned.
+ */
+static clblasStatus
+orchestrateTBSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gbmvSeq, cl_uint numEventsInWaitList,
+                const cl_event *eventWaitList, cl_event *events)
+{
+    clblasStatus err = clblasNotImplemented;
+
+    if  (   ((kargs->order == clblasRowMajor) && (kargs->transA == clblasNoTrans)) ||
+            ((kargs->order == clblasColumnMajor) && (kargs->transA != clblasNoTrans))
+        )
+    {
+        #ifdef DEBUG_TBSV
+        printf("Orchestrating the NO-Transpose case..\n");
+        #endif
+        err = orchestrateNonTransposeTBSV(kargs, trtriSeq, gbmvSeq, numEventsInWaitList, eventWaitList, events);
+    } else {
+        /* Guarded by DEBUG_TBSV like every other trace in this file; the
+         * previous DEBUG_TRSV guard meant this trace never showed up when
+         * TBSV debugging was switched on. */
+        #ifdef DEBUG_TBSV
+        printf("Orchestrating the Transpose case..\n");
+        #endif
+        err = orchestrateTransposeTBSV(kargs, trtriSeq, gbmvSeq, numEventsInWaitList, eventWaitList, events);
+    }
+
+    return err;
+}
+
+
+/*
+ * Shared driver behind the clblas{S,D,C,Z}tbsv entry points.
+ *
+ * After validating the arguments it fills kargs for an in-place solve on x
+ * (x is both the B and the C operand), clones the result into a second
+ * argument bundle tagged CLBLAS_GBMV, builds one CLBLAS_TRSV and one
+ * CLBLAS_GBMV solution sequence and hands both to orchestrateTBSV().
+ *
+ * kargs must already carry dtype, pigFuncID, alpha and beta.  Only one
+ * command queue is used regardless of numCommandQueues.
+ *
+ * Returns clblasNotInitialized, the status of a failed argument check,
+ * clblasInvalidValue / clblasInvalidEventWaitList /
+ * clblasInvalidCommandQueue for bad queue or wait-list arguments, or the
+ * status produced while building or running the solution sequences.
+ */
+clblasStatus
+doTbsv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem x,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int status = clblasNotImplemented;
+    clblasStatus argCheck = clblasSuccess;
+    ListHead trsvSeq;
+    ListHead gbmvSeq;
+    CLBlasKargs gbmvKargs;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Argument validation: memory objects first, then A, then x. */
+
+    argCheck = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET);
+    if (argCheck != clblasSuccess) {
+#ifdef DEBUG_TBSV
+        printf("Invalid mem object..\n");
+#endif
+        return argCheck;
+    }
+
+    argCheck = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET );
+    if (argCheck != clblasSuccess) {
+#ifdef DEBUG_TBSV
+        printf("Invalid Size for A\n");
+#endif
+        return argCheck;
+    }
+
+    argCheck = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET );
+    if (argCheck != clblasSuccess) {
+#ifdef DEBUG_TBSV
+        printf("Invalid Size for X\n");
+#endif
+        return argCheck;
+    }
+
+#ifdef DEBUG_TBSV
+    printf("DoTbsv being called...\n");
+#endif
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0)) {
+        return clblasInvalidValue;
+    }
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL)) {
+        return clblasInvalidEventWaitList;
+    }
+    if (commandQueues[0] == NULL) {
+        return clblasInvalidCommandQueue;
+    }
+
+    /* NOTE: the number of command queues is hard-coded to 1. */
+    numCommandQueues = 1;
+
+    /* Problem description. */
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->transA = trans;
+    kargs->diag = diag;
+    kargs->M = N; // store Original N
+    kargs->N = N;
+    kargs->K = K;
+
+    /* Banded matrix operand. */
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->offa = offa;
+    kargs->offA = offa;
+
+    /* x serves as both input (B) and output (C) vector. */
+    kargs->B = x;
+    kargs->offBX = offx;
+    kargs->ldb.vector = incx;
+    kargs->C = x;
+    kargs->offCY = offx;
+    kargs->ldc.vector = incx;
+
+    /*
+     * Initial row range.  endRow is N-1 when (row-major, no transpose) or
+     * (not row-major, transpose), and N for the two remaining combinations.
+     */
+    kargs->startRow = 0;
+    if ((trans == clblasNoTrans) == (order == clblasRowMajor)) {
+        kargs->endRow = N-1;
+    } else {
+        kargs->endRow = N;
+    }
+
+    /* The GBMV bundle is a copy of the finished TBSV bundle, re-tagged. */
+    memcpy(&gbmvKargs, kargs, sizeof(CLBlasKargs));
+    gbmvKargs.pigFuncID = CLBLAS_GBMV;
+
+    listInitHead(&trsvSeq);
+    listInitHead(&gbmvSeq);
+
+    status = makeSolutionSeq(CLBLAS_TRSV, kargs, numCommandQueues, commandQueues,
+                             numEventsInWaitList, eventWaitList, events, &trsvSeq);
+    if (status == CL_SUCCESS) {
+        /* The GBMV steps get their events assigned during orchestration. */
+        status = makeSolutionSeq(CLBLAS_GBMV, &gbmvKargs, numCommandQueues, commandQueues,
+                                 0, NULL, NULL, &gbmvSeq);
+    }
+    if (status == CL_SUCCESS) {
+        status = orchestrateTBSV(kargs, &trsvSeq, &gbmvSeq, numEventsInWaitList, eventWaitList, events);
+    }
+
+    freeSolutionSeq(&trsvSeq);
+    freeSolutionSeq(&gbmvSeq);
+    return (clblasStatus)status;
+}
+
+/*
+ * Single-precision real TBSV entry point.
+ *
+ * Prepares a zeroed CLBlasKargs tagged TYPE_FLOAT / CLBLAS_TBSV with
+ * alpha = -1 and beta = 1, then delegates to doTbsv().
+ */
+clblasStatus
+clblasStbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs tbsvArgs;
+    clblasStatus status;
+
+#ifdef DEBUG_TBSV
+    printf("STBSV Called\n");
+#endif
+
+    memset(&tbsvArgs, 0, sizeof(tbsvArgs));
+    tbsvArgs.pigFuncID = CLBLAS_TBSV;
+    tbsvArgs.dtype = TYPE_FLOAT;
+    tbsvArgs.beta.argFloat = 1.0;
+    tbsvArgs.alpha.argFloat = -1.0;
+
+    status = doTbsv(&tbsvArgs, order, uplo, trans, diag, N, K,
+                    A, offa, lda,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * Double-precision real TBSV entry point.
+ *
+ * Prepares a zeroed CLBlasKargs tagged TYPE_DOUBLE / CLBLAS_TBSV with
+ * alpha = -1 and beta = 1, then delegates to doTbsv().
+ */
+clblasStatus
+clblasDtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs tbsvArgs;
+    clblasStatus status;
+
+#ifdef DEBUG_TBSV
+    printf("DTBSV Called\n");
+#endif
+
+    memset(&tbsvArgs, 0, sizeof(tbsvArgs));
+    tbsvArgs.pigFuncID = CLBLAS_TBSV;
+    tbsvArgs.dtype = TYPE_DOUBLE;
+    tbsvArgs.beta.argDouble = 1.0;
+    tbsvArgs.alpha.argDouble = -1.0;
+
+    status = doTbsv(&tbsvArgs, order, uplo, trans, diag, N, K,
+                    A, offa, lda,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * Single-precision complex TBSV entry point.
+ *
+ * Prepares a zeroed CLBlasKargs tagged TYPE_COMPLEX_FLOAT / CLBLAS_TBSV with
+ * alpha = -1 + 0i and beta = 1 + 0i, then delegates to doTbsv().
+ */
+clblasStatus
+clblasCtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs tbsvArgs;
+    FloatComplex minusOne, plusOne;
+    clblasStatus status;
+
+#ifdef DEBUG_TBSV
+    printf("CTBSV Called\n");
+#endif
+
+    /* Build the two complex scalars component by component. */
+    CREAL(plusOne) = 1.0;
+    CIMAG(plusOne) = 0.0;
+    CREAL(minusOne) = -1.0;
+    CIMAG(minusOne) = 0.0;
+
+    memset(&tbsvArgs, 0, sizeof(tbsvArgs));
+    tbsvArgs.pigFuncID = CLBLAS_TBSV;
+    tbsvArgs.dtype = TYPE_COMPLEX_FLOAT;
+    tbsvArgs.alpha.argFloatComplex = minusOne;
+    tbsvArgs.beta.argFloatComplex = plusOne;
+
+    status = doTbsv(&tbsvArgs, order, uplo, trans, diag, N, K,
+                    A, offa, lda,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * Double-precision complex TBSV entry point.
+ *
+ * Prepares a zeroed CLBlasKargs tagged TYPE_COMPLEX_DOUBLE / CLBLAS_TBSV
+ * with alpha = -1 + 0i and beta = 1 + 0i, then delegates to doTbsv().
+ */
+clblasStatus
+clblasZtbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs tbsvArgs;
+    DoubleComplex minusOne, plusOne;
+    clblasStatus status;
+
+#ifdef DEBUG_TBSV
+    printf("ZTBSV Called\n");
+#endif
+
+    /* Build the two complex scalars component by component. */
+    CREAL(plusOne) = 1.0;
+    CIMAG(plusOne) = 0.0;
+    CREAL(minusOne) = -1.0;
+    CIMAG(minusOne) = 0.0;
+
+    memset(&tbsvArgs, 0, sizeof(tbsvArgs));
+    tbsvArgs.pigFuncID = CLBLAS_TBSV;
+    tbsvArgs.dtype = TYPE_COMPLEX_DOUBLE;
+    tbsvArgs.alpha.argDoubleComplex = minusOne;
+    tbsvArgs.beta.argDoubleComplex = plusOne;
+
+    status = doTbsv(&tbsvArgs, order, uplo, trans, diag, N, K,
+                    A, offa, lda,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
diff --git a/src/library/blas/xtrmm.c b/src/library/blas/xtrmm.c
new file mode 100644
index 0000000..b7611da
--- /dev/null
+++ b/src/library/blas/xtrmm.c
@@ -0,0 +1,245 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common implementation behind the typed clblas?trmm entry points.
+ *
+ * The caller passes a CLBlasKargs that already carries dtype and alpha. This
+ * routine validates the memory objects and matrix sizes, fills in the
+ * remaining problem description, then builds and executes the CLBLAS_TRMM
+ * solution sequence.
+ *
+ * Returns clblasNotInitialized when clblasInitialized is false, the status
+ * from checkMemObjects()/checkMatrixSizes() when validation fails, and
+ * otherwise the result of makeSolutionSeq()/executeSolutionSeq() cast to
+ * clblasStatus.
+ */
+static clblasStatus
+doTrmm(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead steps;
+    cl_int status;
+    clblasStatus check;
+    size_t triOrder;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments */
+
+    check = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET );
+    if (check) {
+        return check;
+    }
+
+    /* A is checked as a square matrix whose order follows the side. */
+    triOrder = (side == clblasLeft) ? M : N;
+    check = checkMatrixSizes(kargs->dtype, order, transA, triOrder, triOrder, A,
+                             offA, lda, A_MAT_ERRSET );
+    if (check) {
+        return check;
+    }
+
+    check = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B,
+                             offB, ldb, B_MAT_ERRSET );
+    if (check) {
+        return check;
+    }
+
+    /* Operation flags */
+    kargs->order = order;
+    kargs->side = side;
+    kargs->uplo = uplo;
+    kargs->transA = transA;
+    kargs->diag = diag;
+
+    /* Problem dimensions */
+    kargs->M = M;
+    kargs->N = N;
+    // Store original problem size in K, this is used to know it while
+    // calculating result by parts using M or N as part size
+    kargs->K = triOrder;
+
+    /* Operands */
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = B;
+    kargs->offBX = offB;
+    kargs->ldb.matrix = ldb;
+
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+    kargs->scimage[0] = 0;
+
+#ifndef TRXM_MULTIPLE_QUEUES
+    /* Without multi-queue support any non-empty queue list is cut to one. */
+    if (numCommandQueues != 0) {
+        numCommandQueues = 1;
+    }
+#endif
+
+    listInitHead(&steps);
+    status = makeSolutionSeq(CLBLAS_TRMM, kargs, numCommandQueues, commandQueues,
+                             numEventsInWaitList, eventWaitList, events, &steps);
+    if (status == CL_SUCCESS) {
+        status = executeSolutionSeq(&steps);
+    }
+
+    freeSolutionSeq(&steps);
+
+    return (clblasStatus)status;
+}
+
+/*
+ * Single-precision real TRMM entry point.
+ *
+ * Zeroes a CLBlasKargs, records TYPE_FLOAT and the caller's alpha in it, and
+ * forwards all remaining arguments to doTrmm(), whose status is returned.
+ */
+clblasStatus
+clblasStrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argFloat = alpha;
+    args.dtype = TYPE_FLOAT;
+
+    return doTrmm(&args, order, side, uplo, transA, diag, M, N,
+                  A, offA, lda,
+                  B, offB, ldb,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision real TRMM entry point.
+ *
+ * Zeroes a CLBlasKargs, records TYPE_DOUBLE and the caller's alpha in it, and
+ * forwards all remaining arguments to doTrmm(), whose status is returned.
+ */
+clblasStatus
+clblasDtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argDouble = alpha;
+    args.dtype = TYPE_DOUBLE;
+
+    return doTrmm(&args, order, side, uplo, transA, diag, M, N,
+                  A, offA, lda,
+                  B, offB, ldb,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Single-precision complex TRMM entry point.
+ *
+ * Zeroes a CLBlasKargs, records TYPE_COMPLEX_FLOAT and the caller's alpha in
+ * it, and forwards all remaining arguments to doTrmm(), whose status is
+ * returned.
+ */
+clblasStatus
+clblasCtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argFloatComplex = alpha;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doTrmm(&args, order, side, uplo, transA, diag, M, N,
+                  A, offA, lda,
+                  B, offB, ldb,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision complex TRMM entry point.
+ *
+ * Zeroes a CLBlasKargs, records TYPE_COMPLEX_DOUBLE and the caller's alpha in
+ * it, and forwards all remaining arguments to doTrmm(), whose status is
+ * returned.
+ */
+clblasStatus
+clblasZtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doTrmm(&args, order, side, uplo, transA, diag, M, N,
+                  A, offA, lda,
+                  B, offB, ldb,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xtrmv.c b/src/library/blas/xtrmv.c
new file mode 100644
index 0000000..2f4e216
--- /dev/null
+++ b/src/library/blas/xtrmv.c
@@ -0,0 +1,417 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Shared implementation behind the clblas?trmv and clblas?tpmv entry points.
+ *
+ * The caller supplies a CLBlasKargs with dtype and pigFuncID already set. The
+ * routine:
+ *   1. validates the memory objects and the sizes of A, x and the scratch
+ *      vector y,
+ *   2. copies x into y on commandQueues[0] with clEnqueueCopyBuffer(),
+ *   3. builds and runs the CLBLAS_TRMV solution sequence, which waits on the
+ *      caller's wait list plus the copy event.
+ *
+ * Only the first command queue is used, whatever numCommandQueues says.
+ *
+ * Returns clblasNotInitialized, a validation status from the check*()
+ * helpers, clblasInvalidValue (no command queue), clblasInvalidEventWaitList
+ * (non-zero count with a NULL list), clblasOutOfHostMemory, the
+ * clEnqueueCopyBuffer() error, or the makeSolutionSeq()/executeSolutionSeq()
+ * result cast to clblasStatus.
+ */
+clblasStatus
+doTrmv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem x,
+    size_t offx,
+    int incx,
+    cl_mem y, // Scratch Buffer
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err;
+    ListHead seq;
+    size_t sizeOfVector;
+    cl_event *newEventWaitList;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments */
+
+    /*
+     * Diagnostics are compiled in only with DEBUG_TRMV so that a release
+     * build of the library does not write to the application's stdout.
+     */
+    retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_TRMV
+        printf("Invalid mem object..\n");
+        #endif
+        return retCode;
+    }
+
+    retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET );
+    if (retCode) {
+        #ifdef DEBUG_TRMV
+        printf("Invalid Size for A\n");
+        #endif
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET );
+    if (retCode) {
+        #ifdef DEBUG_TRMV
+        printf("Invalid Size for X\n");
+        #endif
+        return retCode;
+    }
+    retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET );
+    if (retCode) {
+        #ifdef DEBUG_TRMV
+        printf("Invalid Size for scratch vector\n");
+        #endif
+        return retCode;
+    }
+
+    #ifdef DEBUG_TRMV
+    printf("DoTrmv being called...\n");
+    #endif
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0))
+    {
+        return clblasInvalidValue;
+    }
+    numCommandQueues = 1;
+
+    if ((numEventsInWaitList !=0) && (eventWaitList == NULL))
+    {
+        return clblasInvalidEventWaitList;
+    }
+
+    /*
+     * One extra slot holds the event of the x -> y copy. The element count
+     * is computed in size_t so that "+ 1" cannot wrap in cl_uint.
+     */
+    newEventWaitList = malloc(((size_t)numEventsInWaitList + 1) * sizeof(cl_event));
+    if (newEventWaitList == NULL)
+    {
+        return clblasOutOfHostMemory;
+    }
+    if (numEventsInWaitList != 0 )
+    {
+        memcpy(newEventWaitList, eventWaitList, numEventsInWaitList*sizeof(cl_event));
+    }
+
+    /*
+     * ASSUMPTION:
+     * doTRMV assumes "commandQueue" of 0. The same is reflected in
+     * "makeSolutionSeq" as well. If either of them changes in future,
+     * this code needs to be revisited.
+     */
+    sizeOfVector = (1 + (N-1)*abs(incx)) * dtypeSize(kargs->dtype);
+    err = clEnqueueCopyBuffer(commandQueues[0], x, y, offx*dtypeSize(kargs->dtype), 0, sizeOfVector,
+                              numEventsInWaitList, eventWaitList, &newEventWaitList[numEventsInWaitList]);
+    if (err != CL_SUCCESS)
+    {
+        free(newEventWaitList);
+        return (clblasStatus)err;
+    }
+
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->transA = trans;
+    kargs->diag = diag;
+    kargs->N = N;
+    kargs->K = N; //store original N
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->B = x;
+    kargs->ldb.vector = incx;
+    kargs->C = y;
+    kargs->ldc.vector = incx;
+    kargs->offBX = offx;
+    kargs->offCY = 0; // Not used by assignKargs(); Just for clarity
+    kargs->offa = offa;
+    kargs->offA = offa;
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+    kargs->scimage[0] = 0;
+    kargs->scimage[1] = 0;
+
+    #ifdef DEBUG_TRMV
+    printf("Calling makeSolutionSeq : TRMV\n");
+    #endif
+
+    listInitHead(&seq);
+    err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList+1, newEventWaitList, events, &seq);
+    if (err == CL_SUCCESS) {
+        err = executeSolutionSeq(&seq);
+    }
+
+    freeSolutionSeq(&seq);
+
+    /*
+     * The copy event was created by clEnqueueCopyBuffer() above and is owned
+     * by this function; drop that reference so the event is not leaked. The
+     * OpenCL runtime keeps the event alive for any enqueued command that
+     * still waits on it.
+     */
+    clReleaseEvent(newEventWaitList[numEventsInWaitList]);
+    free(newEventWaitList);
+    return (clblasStatus)err;
+}
+
+/*
+ * Single-precision real TRMV entry point.
+ *
+ * Zeroes a CLBlasKargs, tags it with TYPE_FLOAT and CLBLAS_TRMV, and forwards
+ * every argument (including the scratch buffer) to doTrmv(), whose status is
+ * returned.
+ */
+clblasStatus
+clblasStrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    #ifdef DEBUG_TRMV
+    printf("STRMV Called\n");
+    #endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_TRMV;
+    args.dtype = TYPE_FLOAT;
+
+    return doTrmv(&args, order, uplo, trans, diag, N,
+                  A, offa, lda,
+                  X, offx, incx,
+                  scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision real TRMV entry point.
+ *
+ * Zeroes a CLBlasKargs, tags it with TYPE_DOUBLE and CLBLAS_TRMV, and
+ * forwards every argument (including the scratch buffer) to doTrmv(), whose
+ * status is returned.
+ */
+clblasStatus
+clblasDtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    #ifdef DEBUG_TRMV
+    printf("DTRMV called\n");
+    #endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_TRMV;
+    args.dtype = TYPE_DOUBLE;
+
+    return doTrmv(&args, order, uplo, trans, diag, N,
+                  A, offa, lda,
+                  X, offx, incx,
+                  scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+
+/*
+ * Single-precision complex TRMV entry point.
+ *
+ * Zeroes a CLBlasKargs, tags it with TYPE_COMPLEX_FLOAT and CLBLAS_TRMV, and
+ * forwards every argument (including the scratch buffer) to doTrmv(), whose
+ * status is returned.
+ */
+clblasStatus
+clblasCtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    #ifdef DEBUG_TRMV
+    printf("CTRMV called\n");
+    #endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_TRMV;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doTrmv(&args, order, uplo, trans, diag, N,
+                  A, offa, lda,
+                  X, offx, incx,
+                  scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision complex TRMV entry point.
+ *
+ * Zeroes a CLBlasKargs, tags it with TYPE_COMPLEX_DOUBLE and CLBLAS_TRMV, and
+ * forwards every argument (including the scratch buffer) to doTrmv(), whose
+ * status is returned.
+ */
+clblasStatus
+clblasZtrmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    #ifdef DEBUG_TRMV
+    printf("ZTRMV called\n");
+    #endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_TRMV;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doTrmv(&args, order, uplo, trans, diag, N,
+                  A, offa, lda,
+                  X, offx, incx,
+                  scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+
+/*
+ * Single-precision real TPMV (packed triangular matrix-vector) entry point.
+ *
+ * Zeroes a CLBlasKargs, tags it with TYPE_FLOAT and CLBLAS_TPMV, and reuses
+ * doTrmv() with a leading dimension of 0, which is how the packed case is
+ * passed on. The status from doTrmv() is returned.
+ */
+clblasStatus
+clblasStpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    #ifdef DEBUG_TPMV
+    printf("STPMV Called\n");
+    #endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_TPMV;
+    args.dtype = TYPE_FLOAT;
+
+    return doTrmv(&args, order, uplo, trans, diag, N,
+                  AP, offa, 0 /* lda as zero for packed */,
+                  X, offx, incx,
+                  scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision real TPMV (packed triangular matrix-vector) entry point.
+ *
+ * Zeroes a CLBlasKargs, tags it with TYPE_DOUBLE and CLBLAS_TPMV, and reuses
+ * doTrmv() with a leading dimension of 0 for the packed matrix. The status
+ * from doTrmv() is returned.
+ */
+clblasStatus
+clblasDtpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    #ifdef DEBUG_TPMV
+    printf("DTPMV called\n");
+    #endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_TPMV;
+    args.dtype = TYPE_DOUBLE;
+
+    return doTrmv(&args, order, uplo, trans, diag, N,
+                  AP, offa, 0,
+                  X, offx, incx,
+                  scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+
+/*
+ * Single-precision complex TPMV (packed triangular matrix-vector) entry
+ * point.
+ *
+ * Zeroes a CLBlasKargs, tags it with TYPE_COMPLEX_FLOAT and CLBLAS_TPMV, and
+ * reuses doTrmv() with a leading dimension of 0 for the packed matrix. The
+ * status from doTrmv() is returned.
+ */
+clblasStatus
+clblasCtpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    #ifdef DEBUG_TPMV
+    printf("CTPMV called\n");
+    #endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_TPMV;
+    args.dtype = TYPE_COMPLEX_FLOAT;
+
+    return doTrmv(&args, order, uplo, trans, diag, N,
+                  AP, offa, 0,
+                  X, offx, incx,
+                  scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision complex TPMV (packed triangular matrix-vector) entry
+ * point.
+ *
+ * Zeroes a CLBlasKargs, tags it with TYPE_COMPLEX_DOUBLE and CLBLAS_TPMV, and
+ * reuses doTrmv() with a leading dimension of 0 for the packed matrix. The
+ * status from doTrmv() is returned.
+ */
+clblasStatus
+clblasZtpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+    #ifdef DEBUG_TPMV
+    printf("ZTPMV called\n");
+    #endif
+
+    memset(&args, 0, sizeof(args));
+    args.pigFuncID = CLBLAS_TPMV;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doTrmv(&args, order, uplo, trans, diag, N,
+                  AP, offa, 0,
+                  X, offx, incx,
+                  scratchBuff,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
diff --git a/src/library/blas/xtrsm.c b/src/library/blas/xtrsm.c
new file mode 100644
index 0000000..9fb5b4a
--- /dev/null
+++ b/src/library/blas/xtrsm.c
@@ -0,0 +1,249 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+/*
+ * Common implementation behind the typed clblas?trsm entry points.
+ *
+ * The caller passes a CLBlasKargs that already carries dtype and alpha. This
+ * routine validates the memory objects and matrix sizes, fills in the
+ * remaining problem description, then builds and executes the CLBLAS_TRSM
+ * solution sequence.
+ *
+ * Returns clblasNotInitialized when clblasInitialized is false, the status
+ * from checkMemObjects()/checkMatrixSizes() when validation fails, and
+ * otherwise the result of makeSolutionSeq()/executeSolutionSeq() cast to
+ * clblasStatus.
+ */
+static clblasStatus
+doTrsm(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    ListHead steps;
+    cl_int status;
+    clblasStatus check;
+    size_t triOrder;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments */
+
+    check = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET );
+    if (check) {
+        return check;
+    }
+
+    /* A is checked as a square matrix whose order follows the side. */
+    triOrder = (side == clblasLeft) ? M : N;
+    check = checkMatrixSizes(kargs->dtype, order, transA, triOrder, triOrder, A,
+                             offA, lda, A_MAT_ERRSET );
+    if (check) {
+        return check;
+    }
+
+    check = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B,
+                             offB, ldb, B_MAT_ERRSET );
+    if (check) {
+        return check;
+    }
+
+    /* Operation flags */
+    kargs->order = order;
+    kargs->side = side;
+    kargs->uplo = uplo;
+    kargs->transA = transA;
+    kargs->diag = diag;
+
+    /* Problem dimensions */
+    kargs->M = M;
+    kargs->N = N;
+    // Store original problem size in K, this is used to know it while
+    // calculating result by parts using M or N as part size
+    kargs->K = triOrder;
+
+    /* Operands */
+    kargs->A = A;
+    kargs->offA = offA;
+    kargs->lda.matrix = lda;
+    kargs->B = B;
+    kargs->offBX = offB;
+    kargs->ldb.matrix = ldb;
+
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+    kargs->scimage[0] = 0;
+
+#ifndef TRXM_MULTIPLE_QUEUES
+    /* Without multi-queue support any non-empty queue list is cut to one. */
+    if (numCommandQueues != 0) {
+        numCommandQueues = 1;
+    }
+#endif
+
+    listInitHead(&steps);
+    status = makeSolutionSeq(CLBLAS_TRSM, kargs, numCommandQueues, commandQueues,
+                             numEventsInWaitList, eventWaitList, events, &steps);
+    if (status == CL_SUCCESS) {
+        status = executeSolutionSeq(&steps);
+    }
+
+    freeSolutionSeq(&steps);
+
+    return (clblasStatus)status;
+}
+
+/*
+ * Single-precision real TRSM entry point.
+ *
+ * Zeroes a CLBlasKargs, records TYPE_FLOAT and the caller's alpha in it, and
+ * forwards all remaining arguments to doTrsm(), whose status is returned.
+ */
+clblasStatus
+clblasStrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    cl_float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argFloat = alpha;
+    args.dtype = TYPE_FLOAT;
+
+    return doTrsm(&args, order, side, uplo, transA, diag, M, N,
+                  A, offA, lda,
+                  B, offB, ldb,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision real TRSM entry point.
+ *
+ * Zeroes a CLBlasKargs, records TYPE_DOUBLE and the caller's alpha in it, and
+ * forwards all remaining arguments to doTrsm(), whose status is returned.
+ */
+clblasStatus
+clblasDtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    cl_double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argDouble = alpha;
+    args.dtype = TYPE_DOUBLE;
+
+    return doTrsm(&args, order, side, uplo, transA, diag, M, N,
+                  A, offA, lda,
+                  B, offB, ldb,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Single-precision complex TRSM entry point.
+ *
+ * Zeroes a CLBlasKargs, records TYPE_COMPLEX_FLOAT and the caller's alpha in
+ * it, and forwards all remaining arguments to doTrsm(), whose status is
+ * returned.
+ *
+ * The buffer offsets are not stored here: doTrsm() writes kargs->offA and
+ * kargs->offBX from its own offA/offB parameters, exactly as it does for the
+ * S/D/Z variants.
+ */
+clblasStatus
+clblasCtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+    kargs.alpha.argFloatComplex = alpha;
+
+    return doTrsm(&kargs, order, side, uplo, transA, diag, M, N, A, offA, lda,
+                  B, offB, ldb, numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Double-precision complex TRSM entry point.
+ *
+ * Zeroes a CLBlasKargs, records TYPE_COMPLEX_DOUBLE and the caller's alpha in
+ * it, and forwards all remaining arguments to doTrsm(), whose status is
+ * returned.
+ */
+clblasStatus
+clblasZtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    cl_mem B,
+    size_t offB,
+    size_t ldb,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs args;
+
+    memset(&args, 0, sizeof(args));
+    args.alpha.argDoubleComplex = alpha;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+
+    return doTrsm(&args, order, side, uplo, transA, diag, M, N,
+                  A, offA, lda,
+                  B, offB, ldb,
+                  numCommandQueues, commandQueues,
+                  numEventsInWaitList, eventWaitList, events);
+}
+
diff --git a/src/library/blas/xtrsv.c b/src/library/blas/xtrsv.c
new file mode 100644
index 0000000..c334228
--- /dev/null
+++ b/src/library/blas/xtrsv.c
@@ -0,0 +1,719 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <devinfo.h>
+#include "clblas-internal.h"
+#include "solution_seq.h"
+
+//#define DEBUG_TRSV
+
+/*
+ * Returns the triangle selector to use for kargs: kargs->uplo unchanged for
+ * column-major order, and the opposite triangle (upper <-> lower) for any
+ * other order.
+ */
+static clblasUplo
+getUpLo(CLBlasKargs *kargs)
+{
+    if (kargs->order != clblasColumnMajor)
+    {
+        /* Not column-major: swap the triangle. */
+        return (kargs->uplo == clblasUpper) ? clblasLower : clblasUpper;
+    }
+
+    return kargs->uplo;
+}
+
+
+/*
+ * Drives TRSV for the cases orchestrateTRSV() routes here (column-major
+ * without transpose, or row-major with transpose).
+ *
+ * Only the first step of trtriSeq (the "triangle" kernel) and the first step
+ * of gemvSeq (the "rectangle" kernel) are used. Both must report the same
+ * subdims->y, which becomes the block height TARGET_ROWS.
+ *
+ *  - If N <= TARGET_ROWS, the triangle step runs once over rows 0..N-1 and
+ *    signals the caller's `events`.
+ *  - Otherwise the triangle step runs once, and the rectangle step is then
+ *    launched nLoops-1 times, each launch waiting on the event of the
+ *    previous one. The last launch signals the caller's `events`.
+ *
+ * Returns clblasNotImplemented when the two steps disagree on subdims->y,
+ * clblasOutOfHostMemory when the event array cannot be allocated, and
+ * otherwise the status of the last executeSolutionSeq() call.
+ *
+ * NOTE(review): the cl_event handles written into eventArray are never
+ * released here -- confirm whether executeSolutionSeq() owns them or whether
+ * they leak.
+ */
+static clblasStatus
+orchestrateNonTransposeTRSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gemvSeq, cl_uint numEventsInWaitList,
+				const cl_event *eventWaitList, cl_event *events)
+{
+    clblasStatus err;
+    SolutionStep *trtri, *gemv;
+    size_t nLoops, i;
+    cl_event *eventArray;
+    size_t TARGET_ROWS;
+
+    ListNode *f = listNodeFirst(trtriSeq);
+    trtri = container_of(f, node, SolutionStep);
+    f = listNodeFirst(gemvSeq);
+    gemv = container_of(f, node, SolutionStep);
+    TARGET_ROWS = trtri->subdims->y;
+
+    if ((trtri->subdims->y) != (gemv->subdims->y))
+    {
+        /* Arguments are cast so they match the %lu conversions on every ABI. */
+        printf("TRSV: WARNING:	TRTRI and GEMV dont have identical sub-divisions!!! %lu and %lu\n",
+               (unsigned long)trtri->subdims->y, (unsigned long)gemv->subdims->y);
+        return clblasNotImplemented;
+    } else {
+        #ifdef DEBUG_TRSV
+        printf("TRSV: MESSAGE:	TRTRI and GEMV have identical sub-divisions! = %lu\n", (unsigned long)TARGET_ROWS);
+        #endif
+    }
+
+    /* The triangle step always waits on what the API user specified. */
+    trtri->numEventsInWaitList = numEventsInWaitList;
+    trtri->eventWaitList = eventWaitList;
+
+    if (kargs->N <= TARGET_ROWS)
+    {
+        /* Whole problem fits in one triangle block. */
+        trtri->event = events;
+        trtri->args.startRow = 0;
+        trtri->args.endRow = (cl_int)((kargs->N)-1);
+        err = executeSolutionSeq(trtriSeq);
+        return err;
+    }
+
+    //
+    // Allocate Event Chain
+    //
+    nLoops = ((kargs->N) / TARGET_ROWS);
+    if ((kargs->N % TARGET_ROWS))
+    {
+        nLoops++;
+    }
+    #ifdef DEBUG_TRSV
+    printf("TRSV: Orchestrate No Transpose Case: nLoops = %d\n", (int)nLoops);
+    #endif
+    eventArray = malloc(nLoops*sizeof(cl_event));
+    if (eventArray == NULL)
+    {
+        return clblasOutOfHostMemory;
+    }
+
+    //
+    //	Solve 1 Triangle using Triangle Kernel Followed by Rectangle Kernels
+    //
+    trtri->event = &eventArray[0];
+    if (getUpLo(kargs) == clblasUpper)
+    {
+        trtri->args.startRow = (cl_int)((kargs->N) - TARGET_ROWS);
+        trtri->args.endRow = (cl_int)((kargs->N)-1);
+    } else {
+        trtri->args.startRow = 0;
+        trtri->args.endRow = (cl_int)(TARGET_ROWS-1);
+    }
+    err = executeSolutionSeq(trtriSeq);
+    if (err == CL_SUCCESS)
+    {
+        //
+        // Solve the Rectangles one by one
+        //
+        for(i=1; i<nLoops; i++)
+        {
+            /* Each rectangle waits on the event of the preceding launch. */
+            gemv->numEventsInWaitList = 1;
+            gemv->eventWaitList = &eventArray[i-1];
+            if (i < (nLoops-1))
+            {
+                gemv->event = &eventArray[i];
+            } else {
+                /* Last launch signals the caller-visible event. */
+                gemv->event = events;
+            }
+
+            if (getUpLo(kargs) == clblasUpper)
+            {
+                gemv->args.startRow = (cl_int)((kargs->N-1) - (i-1)*TARGET_ROWS);
+                gemv->args.endRow   = (cl_int)((kargs->N) - (i)*TARGET_ROWS);
+            } else {
+                gemv->args.startRow = (cl_int)((i-1)*TARGET_ROWS);
+                gemv->args.endRow   = (cl_int)((kargs->N) - (TARGET_ROWS*i));
+            }
+            err = executeSolutionSeq(gemvSeq);
+            if (err != CL_SUCCESS)
+            {
+                printf("TRSV: WARNING: GEMV LOOP: Breaking after %d iterations	!!!\n", (int)i);
+                break;
+            }
+        }
+    }
+
+    free(eventArray);
+    return err;
+}
+
+/*
+ * Drives TRSV for the cases orchestrateTRSV() does not route to
+ * orchestrateNonTransposeTRSV().
+ *
+ * Only the first step of trtriSeq (the "triangle" kernel) and the first step
+ * of gemvSeq (the "rectangle" kernel) are used. Both must report the same
+ * subdims->y, which becomes TRIANGLE_HEIGHT.
+ *
+ *  - If N <= TRIANGLE_HEIGHT, the triangle step runs once with startRow 0
+ *    and endRow N, and signals the caller's `events`.
+ *  - Otherwise the work is issued as a chain triangle, rectangle, triangle,
+ *    ... ending on a triangle with no rectangle after it. Each rectangle
+ *    waits on its triangle, each later triangle waits on the previous
+ *    rectangle, and the final triangle signals the caller's `events`.
+ *
+ * Returns clblasNotImplemented when the two steps disagree on subdims->y,
+ * clblasOutOfHostMemory when either event array cannot be allocated, and
+ * otherwise the status of the last executeSolutionSeq() call.
+ *
+ * NOTE(review): the cl_event handles written into the two event arrays are
+ * never released here -- confirm whether executeSolutionSeq() owns them or
+ * whether they leak.
+ */
+static clblasStatus
+orchestrateTransposeTRSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gemvSeq, cl_uint numEventsInWaitList,
+				const cl_event *eventWaitList, cl_event *events)
+{
+    /* Initialised so the final return is defined even if the loop were empty. */
+    clblasStatus err = clblasSuccess;
+    SolutionStep *trtri, *gemv;
+    size_t nLoops, i;
+    cl_event *triangleEventArray;
+    cl_event *rectangleEventArray;
+    size_t TRIANGLE_HEIGHT;
+
+    ListNode *f = listNodeFirst(trtriSeq);
+    trtri = container_of(f, node, SolutionStep);
+    f = listNodeFirst(gemvSeq);
+    gemv = container_of(f, node, SolutionStep);
+    TRIANGLE_HEIGHT = trtri->subdims->y;
+
+    if ((trtri->subdims->y) != (gemv->subdims->y))
+    {
+        /* Arguments are cast so they match the %lu conversions on every ABI. */
+        printf("TRSV: Transpose: WARNING: TRTRI and GEMV dont have identical sub-divisions!!! %lu and %lu\n",
+               (unsigned long)trtri->subdims->y, (unsigned long)gemv->subdims->y);
+        return clblasNotImplemented;
+    } else {
+        #ifdef DEBUG_TRSV
+        printf("TRSV: Transpose: MESSAGE:	TRTRI and GEMV have identical sub-divisions! = %lu\n", (unsigned long)TRIANGLE_HEIGHT);
+        #endif
+    }
+
+    trtri->numEventsInWaitList = numEventsInWaitList;
+    trtri->eventWaitList = eventWaitList;
+    if (kargs->N <= TRIANGLE_HEIGHT)
+    {
+        /* Whole problem fits in one triangle block. */
+        trtri->event = events;
+        trtri->args.startRow = 0;
+        trtri->args.endRow = (cl_int)(kargs->N);
+        err = executeSolutionSeq(trtriSeq);
+        return err;
+    }
+
+    //
+    // Allocate Event Chain
+    //
+    nLoops = ((kargs->N) / TRIANGLE_HEIGHT);
+    if ((kargs->N % TRIANGLE_HEIGHT))
+    {
+        nLoops++;
+    }
+    #ifdef DEBUG_TRSV
+    printf("nLoops: %d\n", (int)nLoops);
+    #endif
+    //
+    // Allocate Event Arrays to order the orchestration
+    //
+    triangleEventArray = malloc(nLoops*sizeof(cl_event));
+    rectangleEventArray = malloc(nLoops*sizeof(cl_event));
+    if ((triangleEventArray == NULL) || (rectangleEventArray == NULL))
+    {
+        /* free(NULL) is a no-op, so no per-pointer guard is needed. */
+        free(triangleEventArray);
+        free(rectangleEventArray);
+        return clblasOutOfHostMemory;
+    }
+
+    //
+    //	Solve as chain of TRIANGLE, RECTANGLE kernels ending on a pair-less TRIANGLE
+    //
+    for(i=0; i<nLoops; i++)
+    {
+        //
+        // TRIANGLE EXECUTION
+        //
+        #ifdef DEBUG_TRSV
+        printf("Calling TRTRI-");
+        #endif
+        trtri->event = &triangleEventArray[i];
+        if (i == (nLoops-1))
+        {
+            //
+            // TRTRI's last iteration must be tied to the "event" that the API
+            // user will choose to wait on.
+            //
+            trtri->event = events;
+        }
+
+        if (i != 0)
+        {
+            //
+            // For first iteration, TRTRI waits on what the API user has specified.
+            // Subsequent iterations will wait on the previous iteration's rectangle
+            // counterpart
+            //
+            trtri->numEventsInWaitList =1;
+            trtri->eventWaitList = &rectangleEventArray[i-1];
+        }
+
+        if (getUpLo(kargs) == clblasUpper)
+        {
+            /* Walk forward from row 0, clamping the last block to N. */
+            trtri->args.startRow 	= (cl_int)(TRIANGLE_HEIGHT*i);
+            trtri->args.endRow 		= (cl_int)(TRIANGLE_HEIGHT*(i+1));
+            if (trtri->args.endRow >= (cl_int)kargs->N)
+            {
+                trtri->args.endRow = (cl_int)kargs->N;
+            }
+        } else {
+            /* Walk backward from row N, clamping the last block to row 0. */
+            if (kargs->N < TRIANGLE_HEIGHT*(i+1))
+            {
+                trtri->args.startRow 	= 0;
+            } else {
+                trtri->args.startRow 	= (cl_int)((kargs->N) - TRIANGLE_HEIGHT*(i+1));
+            }
+            trtri->args.endRow 		= (cl_int)((kargs->N) - TRIANGLE_HEIGHT*(i));
+        }
+        err = executeSolutionSeq(trtriSeq);
+        if (err != CL_SUCCESS)
+        {
+            printf("TRSV: Transpose: Breaking in the middle of loop due to error status, i=%d\n", (int)i);
+            break;
+        }
+        if (i == (nLoops-1))
+        {
+            /* The final triangle has no rectangle to follow it. */
+            break;
+        }
+        #ifdef DEBUG_TRSV
+        printf("Calling gemv-");
+        #endif
+        /* The rectangle covers the same rows and waits on its triangle. */
+        gemv->numEventsInWaitList = 1;
+        gemv->eventWaitList = &triangleEventArray[i];
+        gemv->event = &rectangleEventArray[i];
+        gemv->args.startRow = trtri->args.startRow;
+        gemv->args.endRow = trtri->args.endRow;
+        err = executeSolutionSeq(gemvSeq);
+        if (err != CL_SUCCESS)
+        {
+            printf("TRSV: Transpose: WARNING: GEMV LOOP: Breaking after %d iterations	!!!\n", (int)i);
+            break;
+        }
+    }
+
+    free(triangleEventArray);
+    free(rectangleEventArray);
+    return err;
+}
+
+/*
+ * Pick the orchestration routine for a TRSV call.
+ *
+ * Column-major with clblasNoTrans, and row-major with anything other than
+ * clblasNoTrans, are both dispatched to orchestrateNonTransposeTRSV(); every
+ * other order/transA combination goes to orchestrateTransposeTRSV().
+ * All arguments are forwarded untouched and the callee's status is returned.
+ */
+static clblasStatus
+orchestrateTRSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gemvSeq, cl_uint numEventsInWaitList,
+				const cl_event *eventWaitList, cl_event *events)
+{
+	const int colMajorNoTrans =
+		(kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans);
+	const int rowMajorTrans =
+		(kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans);
+
+	if (!(colMajorNoTrans || rowMajorTrans))
+	{
+		#ifdef DEBUG_TRSV
+		printf("Orchestrating the Transpose case..\n");
+		#endif
+		return orchestrateTransposeTRSV(kargs, trtriSeq, gemvSeq, numEventsInWaitList, eventWaitList, events);
+	}
+
+	#ifdef DEBUG_TRSV
+	printf("Orchestrating the NO-Transpose case..\n");
+	#endif
+	return orchestrateNonTransposeTRSV(kargs, trtriSeq, gemvSeq, numEventsInWaitList, eventWaitList, events);
+}
+
+/*
+ * Common back end for the TRSV and TPSV entry points.
+ *
+ * Validates the memory objects, the matrix and vector sizes, the command
+ * queue array and the event wait list, fills "kargs" from the call
+ * arguments, builds one solution sequence for CLBLAS_TRSV and one for
+ * CLBLAS_TRSV_GEMV (the latter from a copy of "kargs"), and hands both to
+ * orchestrateTRSV().
+ *
+ * kargs->dtype must already be set by the caller: it is read by the size
+ * checks.  numCommandQueues is forced to 1 before the solution sequences
+ * are built.
+ *
+ * Returns clblasNotInitialized if the library is not initialized, the first
+ * validation failure if any, and otherwise the status produced by
+ * makeSolutionSeq() / orchestrateTRSV().
+ */
+clblasStatus
+doTrsv(
+    CLBlasKargs *kargs,
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem x,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    cl_int err = clblasNotImplemented;
+    ListHead seq;
+    ListHead gemvSeq;
+    CLBlasKargs gemvKargs;
+    clblasStatus retCode = clblasSuccess;
+
+    if (!clblasInitialized) {
+        return clblasNotInitialized;
+    }
+
+    /* Validate arguments */
+
+    retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_TRSV
+        printf("Invalid mem object..\n");
+        #endif
+        return retCode;
+    }
+
+    /*
+     * PENDING (kept from the original author):
+     * checkMatrixSizes() does not account for "offa" argument.
+     * Need to pass "offa" when "checkMatrixSizes()" is changed.
+     * NOTE(review): "offa" is already passed below; whether the callee
+     * honours it is not visible from here.
+     */
+    retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET);
+    if (retCode) {
+        #ifdef DEBUG_TRSV
+        printf("Invalid Size for A\n");
+        #endif
+        return retCode;
+    }
+
+    retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET );
+    if (retCode) {
+        #ifdef DEBUG_TRSV
+        printf("Invalid Size for X\n");
+        #endif
+        return retCode;
+    }
+
+    #ifdef DEBUG_TRSV
+    printf("DoTrsv being called...\n");
+    #endif
+
+    if ((commandQueues == NULL) || (numCommandQueues == 0))
+    {
+        return clblasInvalidValue;
+    }
+
+    if ((numEventsInWaitList != 0) && (eventWaitList == NULL))
+    {
+        return clblasInvalidEventWaitList;
+    }
+
+    if (commandQueues[0] == NULL)
+    {
+        return clblasInvalidCommandQueue;
+    }
+
+    numCommandQueues = 1; // NOTE: Hard-coding the number of command queues to 1
+    kargs->order = order;
+    kargs->uplo = uplo;
+    kargs->transA = trans;
+    kargs->diag = diag;
+    kargs->M = N; // store Original N
+    kargs->N = N; // The field "kargs->N" is the one used by the generator.
+    kargs->K = N; // store original N
+    kargs->A = A;
+    kargs->lda.matrix = lda;
+    kargs->B = x;
+    kargs->ldb.vector = incx;
+    kargs->offBX = offx;
+    kargs->offa = offa;
+    kargs->offA = offa;
+    kargs->offsetM = 0;
+    kargs->offsetN = 0;
+    kargs->scimage[0] = 0;
+    kargs->scimage[1] = 0;
+    /* The GEMV sequence gets its own copy of the fully populated arguments. */
+    memcpy(&gemvKargs, kargs, sizeof(CLBlasKargs));
+
+    #ifdef DEBUG_TRSV
+    printf("Calling makeSolutionSeq : TRSV\n");
+    #endif
+
+    listInitHead(&seq);
+    listInitHead(&gemvSeq);
+
+    /*
+     * Historical note: a problem of the context getting released on entry
+     * used to be worked around here by retaining the queue's context
+     * (getQueueContext() + clRetainContext()).  It seems to be gone on the
+     * new driver; reinstate the retain if the problem recurs.
+     */
+
+    /* The TRSV sequence carries the caller's wait list and output event;
+     * the GEMV sequence is created without any. */
+    err = makeSolutionSeq(CLBLAS_TRSV, kargs, numCommandQueues, commandQueues,
+                          numEventsInWaitList, eventWaitList, events, &seq);
+    if (err == CL_SUCCESS)
+    {
+        err = makeSolutionSeq(CLBLAS_TRSV_GEMV, &gemvKargs, numCommandQueues, commandQueues,
+                              0, NULL, NULL, &gemvSeq);
+        if (err == CL_SUCCESS)
+        {
+            #ifdef DEBUG_TRSV
+            printf("Orchestrating TRSV\n");
+            #endif
+            err = orchestrateTRSV(kargs, &seq, &gemvSeq, numEventsInWaitList, eventWaitList, events);
+        }
+    }
+
+    freeSolutionSeq(&seq);
+    freeSolutionSeq(&gemvSeq);
+
+    #ifdef DEBUG_TRSV
+    {
+        /* Self-contained so that a DEBUG_TRSV build compiles: the context is
+         * looked up from the queue instead of relying on outer variables. */
+        cl_context c = NULL;
+        cl_uint refcnt = 0;
+
+        if ((clGetCommandQueueInfo(commandQueues[0], CL_QUEUE_CONTEXT, sizeof(c), &c, NULL) != CL_SUCCESS) ||
+            (clGetContextInfo(c, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &refcnt, NULL) != CL_SUCCESS))
+        {
+            printf("doTrsv(): clGetContextInfo failed..\n");
+        } else {
+            printf("doTrsv(): REFCNT EXIT = %u\n", refcnt);
+        }
+    }
+    #endif
+    return  err;
+}
+
+/*
+ * STRSV entry point: prepares a zeroed CLBlasKargs tagged with
+ * dtype = TYPE_FLOAT and pigFuncID = CLBLAS_TRSV, then forwards every
+ * argument unchanged to doTrsv() and returns its status.
+ */
+clblasStatus
+clblasStrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    clblasStatus status;
+
+    #ifdef DEBUG_TRSV
+    printf("STRSV Called\n");
+    #endif
+
+    /* Everything not set here stays zero; doTrsv() fills in the rest. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_TRSV;
+    kargs.dtype = TYPE_FLOAT;
+
+    status = doTrsv(&kargs, order, uplo, trans, diag, N,
+                    A, offa, lda,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * DTRSV entry point: prepares a zeroed CLBlasKargs tagged with
+ * dtype = TYPE_DOUBLE and pigFuncID = CLBLAS_TRSV, then forwards every
+ * argument unchanged to doTrsv() and returns its status.
+ */
+clblasStatus
+clblasDtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    clblasStatus status;
+
+    #ifdef DEBUG_TRSV
+    printf("DTRSV Called\n");
+    #endif
+
+    /* Everything not set here stays zero; doTrsv() fills in the rest. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_TRSV;
+    kargs.dtype = TYPE_DOUBLE;
+
+    status = doTrsv(&kargs, order, uplo, trans, diag, N,
+                    A, offa, lda,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * CTRSV entry point: prepares a zeroed CLBlasKargs tagged with
+ * dtype = TYPE_COMPLEX_FLOAT and pigFuncID = CLBLAS_TRSV, then forwards
+ * every argument unchanged to doTrsv() and returns its status.
+ */
+clblasStatus
+clblasCtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    clblasStatus status;
+
+    #ifdef DEBUG_TRSV
+    printf("CTRSV Called\n");
+    #endif
+
+    /* Everything not set here stays zero; doTrsv() fills in the rest. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_TRSV;
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+
+    status = doTrsv(&kargs, order, uplo, trans, diag, N,
+                    A, offa, lda,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * ZTRSV entry point: prepares a zeroed CLBlasKargs tagged with
+ * dtype = TYPE_COMPLEX_DOUBLE and pigFuncID = CLBLAS_TRSV, then forwards
+ * every argument unchanged to doTrsv() and returns its status.
+ */
+clblasStatus
+clblasZtrsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    clblasStatus status;
+
+    #ifdef DEBUG_TRSV
+    printf("ZTRSV Called\n");
+    #endif
+
+    /* Everything not set here stays zero; doTrsv() fills in the rest. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_TRSV;
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    status = doTrsv(&kargs, order, uplo, trans, diag, N,
+                    A, offa, lda,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * STPSV entry point: prepares a zeroed CLBlasKargs tagged with
+ * dtype = TYPE_FLOAT and pigFuncID = CLBLAS_TPSV and delegates to doTrsv().
+ * This interface has no lda parameter, so 0 is passed in its place.
+ */
+clblasStatus
+clblasStpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    clblasStatus status;
+
+    #ifdef DEBUG_TRSV
+    printf("STPSV Called\n");
+    #endif
+
+    /* Everything not set here stays zero; doTrsv() fills in the rest. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_TPSV;
+    kargs.dtype = TYPE_FLOAT;
+
+    status = doTrsv(&kargs, order, uplo, trans, diag, N,
+                    A, offa, 0,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * DTPSV entry point: prepares a zeroed CLBlasKargs tagged with
+ * dtype = TYPE_DOUBLE and pigFuncID = CLBLAS_TPSV and delegates to doTrsv().
+ * This interface has no lda parameter, so 0 is passed in its place.
+ */
+clblasStatus
+clblasDtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    clblasStatus status;
+
+    #ifdef DEBUG_TRSV
+    printf("DTPSV Called\n");
+    #endif
+
+    /* Everything not set here stays zero; doTrsv() fills in the rest. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_TPSV;
+    kargs.dtype = TYPE_DOUBLE;
+
+    status = doTrsv(&kargs, order, uplo, trans, diag, N,
+                    A, offa, 0,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * CTPSV entry point: prepares a zeroed CLBlasKargs tagged with
+ * dtype = TYPE_COMPLEX_FLOAT and pigFuncID = CLBLAS_TPSV and delegates to
+ * doTrsv().  This interface has no lda parameter, so 0 is passed in its place.
+ */
+clblasStatus
+clblasCtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    clblasStatus status;
+
+    #ifdef DEBUG_TRSV
+    printf("CTPSV Called\n");
+    #endif
+
+    /* Everything not set here stays zero; doTrsv() fills in the rest. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_TPSV;
+    kargs.dtype = TYPE_COMPLEX_FLOAT;
+
+    status = doTrsv(&kargs, order, uplo, trans, diag, N,
+                    A, offa, 0,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+/*
+ * ZTPSV entry point: prepares a zeroed CLBlasKargs tagged with
+ * dtype = TYPE_COMPLEX_DOUBLE and pigFuncID = CLBLAS_TPSV and delegates to
+ * doTrsv().  This interface has no lda parameter, so 0 is passed in its place.
+ */
+clblasStatus
+clblasZtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    CLBlasKargs kargs;
+    clblasStatus status;
+
+    #ifdef DEBUG_TRSV
+    printf("ZTPSV Called\n");
+    #endif
+
+    /* Everything not set here stays zero; doTrsv() fills in the rest. */
+    memset(&kargs, 0, sizeof(kargs));
+    kargs.pigFuncID = CLBLAS_TPSV;
+    kargs.dtype = TYPE_COMPLEX_DOUBLE;
+
+    status = doTrsv(&kargs, order, uplo, trans, diag, N,
+                    A, offa, 0,
+                    X, offx, incx,
+                    numCommandQueues, commandQueues,
+                    numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
diff --git a/src/library/common/clkern.c b/src/library/common/clkern.c
new file mode 100644
index 0000000..5197d2f
--- /dev/null
+++ b/src/library/common/clkern.c
@@ -0,0 +1,258 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <clkern.h>
+#include <stdlib.h>
+#include <trace_malloc.h>
+
+/* File-local size limits. */
+enum {
+    /* 1 MiB (1048576 bytes); not referenced in the visible part of this file. */
+    MAX_SOURCE_SIZE = 1048576,
+    /* Capacity of the fixed per-device arrays used when querying a
+     * program's binary sizes and binaries. */
+    MAX_OPENCL_DEVICES = 64
+};
+
+/*
+ * Find the first device of "program" that has a non-empty binary.
+ *
+ * Returns that binary's size in bytes, or 0 when every reported size is
+ * zero or when the CL_PROGRAM_BINARY_SIZES query fails.  If "idx" is
+ * non-NULL and a non-empty binary was found, *idx receives its position in
+ * the program's device list; otherwise *idx is left untouched.
+ */
+static size_t
+getBinSizeAndIdx(cl_program program, int *idx)
+{
+    size_t allSizes[MAX_OPENCL_DEVICES];
+    size_t size = 0;
+    size_t retSize = 0;
+    size_t i, count;
+    cl_int err;
+
+    err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
+                           sizeof(allSizes), allSizes, &retSize);
+    if (err != CL_SUCCESS) {
+        /* On failure neither allSizes nor retSize hold meaningful data. */
+        return 0;
+    }
+
+    /* retSize is in bytes; convert it to a number of entries. */
+    count = retSize / sizeof(allSizes[0]);
+    for (i = 0; i < count; i++) {
+        size = allSizes[i];
+        if (size) {
+            break;
+        }
+    }
+
+    if (idx && (i < count)) {
+        *idx = (int)i;
+    }
+
+    return size;
+}
+
+/*
+ * Run the kernel described by "kernDesc" on "queue".
+ *
+ * Steps, each performed only while the status is still CL_SUCCESS:
+ *   1. set every kernel argument, and synchronously upload the host buffer
+ *      of each argument flagged MEMOBJ_WRITE;
+ *   2. enqueue the kernel; unless kernDesc->nowait is set, wait for it;
+ *   3. if kernDesc->needExecTime is set and kernDesc->event is non-NULL,
+ *      store (END - START) profiling time in kernDesc->execTime;
+ *   4. synchronously read back the host buffer of each argument flagged
+ *      MEMOBJ_READ.
+ *
+ * errInfo may be NULL.  When non-NULL it is reset on entry (phase -1,
+ * wrongArg (unsigned int)-1) and, on failure, receives the failing phase
+ * and, except for PHASE_ENQUEUE_KERNEL, the index of the offending argument.
+ *
+ * Returns the first OpenCL error encountered, or CL_SUCCESS.
+ */
+cl_int
+launchClKernel(
+    KernelDesc *kernDesc,
+    cl_command_queue queue,
+    KernelErrorInfo *errInfo)
+{
+    cl_int status;
+    unsigned int i;
+    KernelArg *karg;
+    KernelErrorInfo ei;
+    cl_ulong startTime;     /* profiling counters are cl_ulong on every platform */
+    unsigned int nrArgs;
+    cl_event localEvent = NULL;
+    cl_event *launchEvent;
+
+    /* The caller is allowed to pass no error-info block at all. */
+    if (errInfo != NULL) {
+        errInfo->phase = -1;
+        errInfo->wrongArg = (unsigned int)-1;
+    }
+    ei.phase = -1;
+    ei.wrongArg = (unsigned int)-1;
+
+    status = clGetKernelInfo(kernDesc->kernel, CL_KERNEL_NUM_ARGS,
+                             sizeof(nrArgs), &nrArgs, NULL);
+    if (status != CL_SUCCESS) {
+        return status;
+    }
+
+    karg = kernDesc->args;
+    for (i = 0; (i < nrArgs) && (status == CL_SUCCESS); i++, karg++) {
+        status = clSetKernelArg(kernDesc->kernel, i, karg->typeSize,
+                                karg->arg.data);
+        if (status != CL_SUCCESS) {
+            ei.wrongArg = i;
+            ei.phase = PHASE_SET_ARGS;
+        }
+        else if (karg->hostBuf && (karg->dir & MEMOBJ_WRITE)) {
+            status = clEnqueueWriteBuffer(queue, karg->arg.mem,
+                                          CL_TRUE, 0, karg->hostBufLen,
+                                          karg->hostBuf, 0, NULL, NULL);
+            if (status != CL_SUCCESS) {
+                ei.wrongArg = i;
+                ei.phase = PHASE_ENQUEUE_WRITE;
+            }
+        }
+    }
+
+    if (status == CL_SUCCESS) {
+        /* A blocking launch needs an event to wait on even when the caller
+         * did not supply one; use a private one and release it afterwards. */
+        launchEvent = kernDesc->event;
+        if ((launchEvent == NULL) && !kernDesc->nowait) {
+            launchEvent = &localEvent;
+        }
+
+        status = clEnqueueNDRangeKernel(queue,
+                                        kernDesc->kernel,
+                                        (cl_uint)kernDesc->workDim,
+                                        NULL,
+                                        (const size_t*)kernDesc->globalThreads,
+                                        (const size_t*)kernDesc->localThreads,
+                                        (cl_uint)kernDesc->waitListSize,
+                                        kernDesc->eventWaitList,
+                                        launchEvent);
+        if ((status == CL_SUCCESS) && !kernDesc->nowait) {
+            status = clWaitForEvents(1, launchEvent);
+        }
+        if (localEvent != NULL) {
+            clReleaseEvent(localEvent);
+        }
+
+        if (status != CL_SUCCESS) {
+            ei.phase = PHASE_ENQUEUE_KERNEL;
+        }
+
+        if ((status == CL_SUCCESS) && kernDesc->needExecTime &&
+            kernDesc->event) {
+
+            /* Profiling data is read after the kernel has completed; a
+             * non-blocking launch has not been waited for yet. */
+            if (kernDesc->nowait) {
+                status = clWaitForEvents(1, kernDesc->event);
+                if (status != CL_SUCCESS) {
+                    ei.phase = PHASE_PROFILING;
+                }
+            }
+
+            if (status == CL_SUCCESS) {
+                status = clGetEventProfilingInfo(*kernDesc->event,
+                                                 CL_PROFILING_COMMAND_START,
+                                                 sizeof(startTime), &startTime,
+                                                 NULL);
+                if (status == CL_SUCCESS) {
+                    status = clGetEventProfilingInfo(*kernDesc->event,
+                                                     CL_PROFILING_COMMAND_END,
+                                                     sizeof(kernDesc->execTime),
+                                                     &kernDesc->execTime, NULL);
+                    /* Only turn END into a duration if END was obtained. */
+                    if (status == CL_SUCCESS) {
+                        kernDesc->execTime -= startTime;
+                    }
+                }
+                if (status != CL_SUCCESS) {
+                    ei.phase = PHASE_PROFILING;
+                }
+            }
+        }
+    }
+
+    karg = kernDesc->args;
+    for (i = 0; (i < nrArgs) && (status == CL_SUCCESS); i++, karg++) {
+        if (karg->hostBuf && (karg->dir & MEMOBJ_READ)) {
+            status = clEnqueueReadBuffer(queue, karg->arg.mem,
+                                         CL_TRUE, 0, karg->hostBufLen,
+                                         karg->hostBuf, 0, NULL, NULL);
+            if (status != CL_SUCCESS) {
+                ei.wrongArg = i;
+                ei.phase = PHASE_ENQUEUE_READ;
+            }
+        }
+    }
+
+    if ((status != CL_SUCCESS) && (errInfo != NULL)) {
+        errInfo->phase = ei.phase;
+        if (ei.phase != PHASE_ENQUEUE_KERNEL) {
+            errInfo->wrongArg = ei.wrongArg;
+        }
+    }
+
+    return status;
+}
+
+/*
+ * Create a program from a single source string and build it for one device.
+ *
+ * On a build failure the program is released and NULL is returned; if
+ * "logBuf" is non-NULL it is first emptied and then filled with up to
+ * "logBufSize" bytes of the build log.  If "status" is non-NULL it receives
+ * the status of the last OpenCL call made (creation or build).
+ */
+cl_program
+buildClProgram(
+    const char *source,
+    const char *buildOpts,
+    cl_context ctx,
+    cl_device_id devID,
+    char *logBuf,
+    size_t logBufSize,
+    cl_int *status)
+{
+    cl_int rc = CL_SUCCESS;
+    cl_program prog;
+
+    prog = clCreateProgramWithSource(ctx, 1, (const char**)&source,
+                                     NULL, &rc);
+    if (prog != NULL) {
+        rc = clBuildProgram(prog, 1, (const cl_device_id*)&devID,
+                            buildOpts, NULL, NULL);
+    }
+
+    /* Build failed: hand back the log (if wanted) and drop the program. */
+    if ((prog != NULL) && (rc != CL_SUCCESS)) {
+        if (logBuf != NULL) {
+            logBuf[0] = '\0';
+            clGetProgramBuildInfo(prog, devID,
+                                  CL_PROGRAM_BUILD_LOG,
+                                  logBufSize, logBuf, NULL);
+        }
+        clReleaseProgram(prog);
+        prog = NULL;
+    }
+
+    if (status != NULL) {
+        *status = rc;
+    }
+
+    return prog;
+}
+
+/*
+ * Create a program for one device from a single binary image and build it.
+ *
+ * Returns the built program, or NULL if creation or the build failed (a
+ * program that fails to build is released).  If "status" is non-NULL it
+ * receives the status of the last OpenCL call made (creation or build).
+ */
+cl_program
+createClProgramWithBinary(
+    cl_context ctx,
+    cl_device_id devID,
+    unsigned char *binary,
+    size_t binSize,
+    cl_int *status)
+{
+    cl_int rc;
+    cl_program prog;
+
+    prog = clCreateProgramWithBinary(ctx, 1, &devID, &binSize,
+                                     (const unsigned char**)&binary,
+                                     NULL, &rc);
+    if (prog != NULL) {
+        rc = clBuildProgram(prog, 1, &devID, NULL, NULL, NULL);
+    }
+
+    /* A program that was created but does not build is of no use. */
+    if ((prog != NULL) && (rc != CL_SUCCESS)) {
+        clReleaseProgram(prog);
+        prog = NULL;
+    }
+
+    if (status) {
+        *status = rc;
+    }
+
+    return prog;
+}
+
+/*
+ * Size in bytes of the first non-empty device binary of "program", as
+ * reported by getBinSizeAndIdx(); the device index is not requested.
+ */
+size_t
+getProgramBinarySize(cl_program program)
+{
+    const size_t binSize = getBinSizeAndIdx(program, NULL);
+
+    return binSize;
+}
+
+/*
+ * Fetch the first non-empty device binary of "program".
+ *
+ * Returns a newly allocated buffer holding the binary, or NULL when the
+ * program has no non-empty binary, when allocation fails, or when the
+ * CL_PROGRAM_BINARIES query fails.  The buffer is allocated here and not
+ * released, so the caller is responsible for freeing it.
+ */
+unsigned char
+*getProgramBinary(cl_program program)
+{
+    /* Only the slot of the chosen device gets a buffer; the rest stay NULL. */
+    unsigned char *binaries[MAX_OPENCL_DEVICES] = { NULL };
+    unsigned char *bin;
+    size_t size;
+    int idx = 0;
+
+    size = getBinSizeAndIdx(program, &idx);
+    if (size == 0) {
+        /* Nothing to fetch; avoid an implementation-defined malloc(0). */
+        return NULL;
+    }
+
+    bin = malloc(size);
+    if (bin == NULL) {
+        return NULL;
+    }
+    binaries[idx] = bin;
+
+    if (clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binaries),
+                         binaries, NULL) != CL_SUCCESS) {
+        free(bin);
+        bin = NULL;
+    }
+
+    return bin;
+}
diff --git a/src/library/common/devinfo-cache.c b/src/library/common/devinfo-cache.c
new file mode 100644
index 0000000..290aa0c
--- /dev/null
+++ b/src/library/common/devinfo-cache.c
@@ -0,0 +1,907 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <math.h>
+#include <stdlib.h>
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <devinfo.h>
+
+static cl_ulong closestPowerOf2(cl_ulong x);
+
+/*
+ * OpenCL C source of the "l2Bench" kernel and the kernel's name.
+ *
+ * Each work-item reads its share of the input image (width * height /
+ * global size pixels) "rounds" times, accumulates the pixel values and
+ * writes the sum to "out".
+ *
+ * "rounds" is declared uint: OpenCL C does not allow size_t as a kernel
+ * argument type, and the host passes sizeof(unsigned int) bytes for it.
+ */
+static const char L2BENCH_NAME[] = "l2Bench";
+static const char *L2BENCH =
+    "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |   \n"
+    "                               CLK_ADDRESS_NONE            |   \n"
+    "                               CLK_FILTER_NEAREST;             \n"
+
+    "__kernel                                                       \n"
+    "void l2Bench(                                                  \n"
+    "    __read_only image2d_t in,                                  \n"
+    "    uint rounds,                                               \n"
+    "    __global float4 *out)                                      \n"
+    "{                                                              \n"
+    "    int width, height;                                         \n"
+    "    size_t gid, nrWorkItems;                                   \n"
+    "    size_t pixelsPerWorkItem;                                  \n"
+    "    size_t x, y, k, i;                                         \n"
+    "    float4 v, sum;                                             \n"
+
+    "    width = get_image_width(in);                               \n"
+    "    height = get_image_height(in);                             \n"
+
+    "    gid = get_global_id(0);                                    \n"
+    "    nrWorkItems = get_global_size(0);                          \n"
+
+    "    pixelsPerWorkItem = (width * height) / nrWorkItems;        \n"
+
+    "    sum = (float4)(0.0);                                       \n"
+
+    "    for (k = 0; k < rounds; k++) {                             \n"
+    "        x = (gid * pixelsPerWorkItem) % width;                 \n"
+    "        y = (gid * pixelsPerWorkItem) / width;                 \n"
+
+    "        for (i = 0; i < pixelsPerWorkItem; i++) {              \n"
+    "            v = read_imagef(in, sampler, (int2)(x, y));        \n"
+    "            sum += v;                                          \n"
+
+    "            x++;                                               \n"
+    "            y += x / width;                                    \n"
+    "            x %= width;                                        \n"
+
+    "        }                                                      \n"
+    "    }                                                          \n"
+    "    *out = sum;                                                \n"
+    "}                                                              \n";
+
+/*
+ * OpenCL C source of the "l1Bench" kernel and the kernel's name.
+ *
+ * Same structure as l2Bench, with one extra argument "l2Size": in each
+ * round a work-item reads pixelsPerWorkItem - l2Size / sizeof(float4)
+ * pixels instead of pixelsPerWorkItem, accumulates them and writes the
+ * sum to "out".
+ *
+ * NOTE(review): "l2Size" and "rounds" are declared size_t, which OpenCL C
+ * does not permit for kernel arguments; the host code that sets them is
+ * not visible here -- confirm the argument sizes before changing the types.
+ */
+static const char L1BENCH_NAME[] = "l1Bench";
+static const char *L1BENCH =
+    "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |   \n"
+    "                               CLK_ADDRESS_NONE            |   \n"
+    "                               CLK_FILTER_NEAREST;             \n"
+
+    "__kernel                                                       \n"
+    "void l1Bench(                                                  \n"
+    "    __read_only image2d_t in,                                  \n"
+    "    size_t l2Size,                                             \n"
+    "    size_t rounds,                                             \n"
+    "    __global float4 *out)                                      \n"
+    "{                                                              \n"
+    "    int width, height;                                         \n"
+    "    size_t gid, nrWorkItems;                                   \n"
+    "    size_t pixelsPerWorkItem;                                  \n"
+    "    size_t x, y, k, i;                                         \n"
+    "    float4 v, sum;                                             \n"
+
+    "    width = get_image_width(in);                               \n"
+    "    height = get_image_height(in);                             \n"
+
+    "    gid = get_global_id(0);                                    \n"
+    "    nrWorkItems = get_global_size(0);                          \n"
+
+    "    pixelsPerWorkItem = (width * height) / nrWorkItems;        \n"
+
+    "    sum = (float4)(0.0);                                       \n"
+
+    "    for (k = 0; k < rounds; k++) {                             \n"
+    "        x = (gid * pixelsPerWorkItem) % width;                 \n"
+    "        y = (gid * pixelsPerWorkItem) / width;                 \n"
+
+    "        for (i = 0; i < pixelsPerWorkItem - l2Size / sizeof(float4); i++) { \n"
+    "            v = read_imagef(in, sampler, (int2)(x, y));        \n"
+    "            sum += v;                                          \n"
+
+    "            x++;                                               \n"
+    "            y += x / width;                                    \n"
+    "            x %= width;                                        \n"
+
+    "        }                                                      \n"
+    "    }                                                          \n"
+    "    *out = sum;                                                \n"
+    "}                                                              \n";
+
+cl_ulong
+deviceL2CacheSize(
+    cl_device_id device,
+    cl_int *error)
+{
+    const size_t MAX_CACHE_SIZE = 1024 * 1024;
+    const size_t MIN_CACHE_SIZE =    1 * 1024;
+    const size_t STEP           =    4 * 1024;
+
+    /* Bigger number of rounds increases time measurement precision,
+     * but slows the test down.
+     */
+    const unsigned int ROUNDS = 32;
+
+    /* Repeat each kernel run several times for higher reliability. */
+    const unsigned int RELIABILITY_ROUNDS = 5;
+
+    cl_int err;
+
+    cl_uint maxComputeUnits;
+    cl_bool imageSupport;
+
+    cl_platform_id platform;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx;
+    cl_command_queue queue;
+    cl_program program;
+    cl_kernel kernel;
+    cl_event event;
+
+    cl_float *in;
+    size_t width, height;
+    const cl_image_format format = { CL_RGBA, CL_FLOAT };
+    cl_mem imgIn;
+    size_t origin[3], region[3];
+
+    cl_float4 out;
+    cl_mem bufOut;
+
+    size_t global_work_size, local_work_size;
+    cl_ulong start, end, avg;
+    cl_long *times;
+    cl_double d, max;
+
+    size_t steps;
+    size_t i, t;
+
+    /* Collect device properties. */
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+        sizeof(cl_uint), &maxComputeUnits, NULL);
+    if (err != CL_SUCCESS) {
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
+        sizeof(cl_bool), &imageSupport, NULL);
+    if (err != CL_SUCCESS) {
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+    if (imageSupport == CL_FALSE) {
+        if (error != NULL) {
+            *error = CL_INVALID_OPERATION;  /* like clCreateImage2D() does */
+        }
+        return 0;
+    }
+
+    steps = (MAX_CACHE_SIZE - MIN_CACHE_SIZE) / STEP;
+    times = calloc(steps, sizeof(cl_long));
+    if (times == NULL) {
+        if (error != NULL) {
+            *error = CL_OUT_OF_HOST_MEMORY;
+        }
+        return 0;
+    }
+
+    /* Create necessary OpenCL objects */
+    err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM,
+        sizeof(cl_platform_id), &platform, NULL);
+    if (err != CL_SUCCESS) {
+        free(times);
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        free(times);
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+    queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
+    if (err != CL_SUCCESS) {
+        clReleaseContext(ctx);
+        free(times);
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+
+    program = clCreateProgramWithSource(ctx, 1, &L2BENCH, NULL, &err);
+    if (err != CL_SUCCESS) {
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        free(times);
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        clReleaseProgram(program);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        free(times);
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+    kernel = clCreateKernel(program, L2BENCH_NAME, &err);
+    clReleaseProgram(program);
+    if (err != CL_SUCCESS) {
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        free(times);
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+
+    /* Main idea of this test is to run one work-item on each compute unit.
+     * This will make clear L2 cache hit/miss picture.
+     */
+    global_work_size = maxComputeUnits;
+    local_work_size = 1;
+
+    /* Prepare output buffer */
+    bufOut = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        sizeof(cl_float4), &out, &err);
+    if (err != CL_SUCCESS) {
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        free(times);
+        if (error != NULL) {
+            *error = err;
+        }
+        return 0;
+    }
+
+    for (t = 0; t < steps; t++) {
+        width = (size_t)sqrt((double)(MAX_CACHE_SIZE - t * STEP) / sizeof(cl_float4));
+        height = width;
+
+        /* Prepare image buffer */
+        in = calloc(width * height, sizeof(cl_float4));
+        if (in == NULL) {
+            clReleaseMemObject(bufOut);
+            clReleaseCommandQueue(queue);
+            clReleaseContext(ctx);
+            free(times);
+            if (error != NULL) {
+                *error = CL_OUT_OF_HOST_MEMORY;
+            }
+            return 0;
+        }
+        imgIn = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+            &format, width, height, 0, in, &err);
+        if (err != CL_SUCCESS) {
+            free(in);
+            clReleaseMemObject(bufOut);
+            clReleaseCommandQueue(queue);
+            clReleaseContext(ctx);
+            free(times);
+            if (error != NULL) {
+                *error = err;
+            }
+            return 0;
+        }
+        origin[0] = origin[1] = origin[2] = 0;
+        region[0] = width;
+        region[1] = height;
+        region[2] = 1;
+
+        avg = 0;
+        for (i = 0; i < RELIABILITY_ROUNDS; i++) {
+            err = clEnqueueWriteImage(queue, imgIn, CL_TRUE, origin, region,
+                0, 0, in, 0, NULL, NULL);
+            if (err != CL_SUCCESS) {
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+            err = clEnqueueWriteBuffer(queue, bufOut, CL_TRUE, 0,
+                sizeof(cl_float4), &out, 0, NULL, NULL);
+            if (err != CL_SUCCESS) {
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+
+            err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imgIn);
+            if (err != CL_SUCCESS) {
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+            err = clSetKernelArg(kernel, 1, sizeof(ROUNDS), &ROUNDS);
+            if (err != CL_SUCCESS) {
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+            err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufOut);
+            if (err != CL_SUCCESS) {
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+
+            err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
+                &global_work_size, &local_work_size, 0, NULL, &event);
+            if (err != CL_SUCCESS) {
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+            err = clWaitForEvents(1, &event);
+            if (err != CL_SUCCESS) {
+                clReleaseEvent(event);
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+
+            start = end = 0UL;
+            err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
+                sizeof(cl_ulong), &start, NULL);
+            if (err != CL_SUCCESS) {
+                clReleaseEvent(event);
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+            err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
+                sizeof(cl_ulong), &end, NULL);
+            if (err != CL_SUCCESS) {
+                clReleaseEvent(event);
+                clReleaseMemObject(imgIn);
+                free(in);
+                clReleaseMemObject(bufOut);
+                clReleaseCommandQueue(queue);
+                clReleaseContext(ctx);
+                free(times);
+                if (error != NULL) {
+                    *error = err;
+                }
+                return 0;
+            }
+
+            clReleaseEvent(event);
+
+            /* NOTE: Sometimes the difference between start and end times
+             * can be unexpectedly large - a tens of seconds.
+             * This is a wrong behavior.
+             */
+            //assert(end - start < 10000000000UL);
+
+            avg += end - start;
+        }
+
+        times[t] = avg / (width * height);
+
+        clReleaseMemObject(imgIn);
+        free(in);
+    }
+    clReleaseMemObject(bufOut);
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    max = 0;
+    i = MAX_CACHE_SIZE + 1;
+    for (t = 1; t < steps; t++) {
+        d = (cl_double)times[t - 1];
+        d /= times[t];
+        if (d > max) {
+            max = d;
+            i = MAX_CACHE_SIZE - t * STEP;
+        }
+    }
+    free(times);
+
+    if (i == MAX_CACHE_SIZE + 1)
+        return 0;
+    return closestPowerOf2(i);
+}
+
+/*
+ * Estimate the size of the L1 cache of a device with a timing benchmark.
+ *
+ * The L1BENCH kernel is run with one work-item per compute unit for a series
+ * of candidate sizes going down from l2CacheSize towards MIN_CACHE_SIZE in
+ * decrements of STEP. For every candidate the kernel time reported by event
+ * profiling is summed over RELIABILITY_ROUNDS launches and divided by the
+ * amount of data covered. The candidate with the largest ratio between
+ * the previous candidate's figure and its own is picked and rounded to
+ * the closest power of 2.
+ *
+ * device       device to examine; it must support images
+ * l2CacheSize  size of the L2 cache; must not be less than MIN_CACHE_SIZE
+ * error        if not NULL, receives CL_SUCCESS or the code of the failure
+ *
+ * Returns the estimated size, or 0 if the test failed or gave no estimate.
+ */
+cl_ulong
+deviceL1CacheSize(
+    cl_device_id device,
+    cl_ulong l2CacheSize,
+    cl_int *error)
+{
+    const size_t MIN_CACHE_SIZE = 1024;
+    const size_t STEP           = 1024;
+    size_t L2_SIZE              = (size_t)l2CacheSize;
+
+    /* Bigger number of rounds increases time measurement precision,
+     * but slows the test down.
+     */
+    const unsigned int ROUNDS = 64;
+
+    /* Repeat each kernel run several times for higher reliability. */
+    const unsigned int RELIABILITY_ROUNDS = 10;
+
+    cl_int err;
+    cl_ulong result = 0;
+
+    cl_uint maxComputeUnits;
+    cl_bool imageSupport;
+
+    /*
+     * Every handle released at 'cleanup' starts as NULL, so that the single
+     * exit path can tell what has actually been created.
+     */
+    cl_platform_id platform;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = NULL;
+    cl_command_queue queue = NULL;
+    cl_program program;
+    cl_kernel kernel = NULL;
+    cl_event event = NULL;
+
+    cl_float *in = NULL;
+    size_t width, height;
+    const cl_image_format format = { CL_RGBA, CL_FLOAT };
+    cl_mem imgIn = NULL;
+    size_t origin[3], region[3];
+
+    cl_float4 out;
+    cl_mem bufOut = NULL;
+
+    size_t global_work_size, local_work_size;
+    cl_ulong start, end, avg;
+    cl_long *times = NULL;
+    cl_double d, max;
+
+    size_t steps;
+    size_t i, t;
+
+    /* Collect device properties. */
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+        sizeof(cl_uint), &maxComputeUnits, NULL);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT,
+        sizeof(cl_bool), &imageSupport, NULL);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+    if (imageSupport == CL_FALSE) {
+        err = CL_INVALID_OPERATION;  /* like clCreateImage2D() does */
+        goto cleanup;
+    }
+
+    /*
+     * The subtraction below is done on unsigned values: a too small L2 size
+     * would wrap around and ask for an absurd number of steps.
+     */
+    if (L2_SIZE < MIN_CACHE_SIZE) {
+        err = CL_INVALID_VALUE;
+        goto cleanup;
+    }
+    steps = 1 + (L2_SIZE - MIN_CACHE_SIZE) / STEP;
+    times = calloc(steps, sizeof(cl_long));
+    if (times == NULL) {
+        err = CL_OUT_OF_HOST_MEMORY;
+        goto cleanup;
+    }
+
+    /* Create necessary OpenCL objects */
+    err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM,
+        sizeof(cl_platform_id), &platform, NULL);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        ctx = NULL;
+        goto cleanup;
+    }
+    queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
+    if (err != CL_SUCCESS) {
+        queue = NULL;
+        goto cleanup;
+    }
+
+    program = clCreateProgramWithSource(ctx, 1, &L1BENCH, NULL, &err);
+    if (err != CL_SUCCESS) {
+        goto cleanup;
+    }
+    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        clReleaseProgram(program);
+        goto cleanup;
+    }
+    kernel = clCreateKernel(program, L1BENCH_NAME, &err);
+    /* The program is needed only to get the kernel out of it. */
+    clReleaseProgram(program);
+    if (err != CL_SUCCESS) {
+        kernel = NULL;
+        goto cleanup;
+    }
+
+    /* Main idea of this test is to run one work-item on each compute unit.
+     * Image region assigned to one work-item consists of two parts:
+     *     - part with size of probable L1 cache
+     *     - part with size of L2 cache
+     * This makes cache misses in L1 to be misses in L2 as well.
+     * It is also assumed, that each Compute Unit has its own L1 cache.
+     */
+    global_work_size = maxComputeUnits;
+    local_work_size = 1;
+
+    /* Prepare output buffer */
+    bufOut = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        sizeof(cl_float4), &out, &err);
+    if (err != CL_SUCCESS) {
+        bufOut = NULL;
+        goto cleanup;
+    }
+
+    for (t = 0; t < steps; t++) {
+
+        width = 64;         /* One image line takes 1KB */
+        height = (L2_SIZE - t * STEP + L2_SIZE) * global_work_size /
+                        (sizeof(cl_float4) * width);
+
+        /* Prepare image buffer */
+        in = calloc(width * height, sizeof(cl_float4));
+        if (in == NULL) {
+            err = CL_OUT_OF_HOST_MEMORY;
+            goto cleanup;
+        }
+        imgIn = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+            &format, width, height, 0, in, &err);
+        if (err != CL_SUCCESS) {
+            imgIn = NULL;
+            goto cleanup;
+        }
+        origin[0] = origin[1] = origin[2] = 0;
+        region[0] = width;
+        region[1] = height;
+        region[2] = 1;
+
+        avg = 0;
+        for (i = 0; i < RELIABILITY_ROUNDS; i++) {
+            err = clEnqueueWriteImage(queue, imgIn, CL_TRUE, origin, region,
+                0, 0, in, 0, NULL, NULL);
+            if (err != CL_SUCCESS) {
+                goto cleanup;
+            }
+            err = clEnqueueWriteBuffer(queue, bufOut, CL_TRUE, 0,
+                sizeof(cl_float4), &out, 0, NULL, NULL);
+            if (err != CL_SUCCESS) {
+                goto cleanup;
+            }
+
+            /*
+             * The arguments are set in order; the first failure is
+             * the one reported.
+             * NOTE(review): argument 1 is passed as a size_t, whose width
+             * depends on the host; the kernel source is not visible here,
+             * so confirm it matches the kernel's parameter type.
+             */
+            err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imgIn);
+            if (err == CL_SUCCESS) {
+                err = clSetKernelArg(kernel, 1, sizeof(L2_SIZE), &L2_SIZE);
+            }
+            if (err == CL_SUCCESS) {
+                err = clSetKernelArg(kernel, 2, sizeof(ROUNDS), &ROUNDS);
+            }
+            if (err == CL_SUCCESS) {
+                err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufOut);
+            }
+            if (err != CL_SUCCESS) {
+                goto cleanup;
+            }
+
+            err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
+                &global_work_size, &local_work_size, 0, NULL, &event);
+            if (err != CL_SUCCESS) {
+                /* no event has been handed out, nothing to release */
+                event = NULL;
+                goto cleanup;
+            }
+            err = clWaitForEvents(1, &event);
+            if (err != CL_SUCCESS) {
+                goto cleanup;
+            }
+
+            start = end = 0UL;
+            err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
+                sizeof(cl_ulong), &start, NULL);
+            if (err == CL_SUCCESS) {
+                err = clGetEventProfilingInfo(event,
+                    CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
+            }
+            if (err != CL_SUCCESS) {
+                goto cleanup;
+            }
+
+            clReleaseEvent(event);
+            event = NULL;
+
+            /* NOTE: Sometimes the difference between start and end times
+             * can be unexpectedly large - a tens of seconds.
+             * This is a wrong behavior.
+             */
+            //assert(end - start < 10000000000UL);
+
+            avg += end - start;
+        }
+
+        times[t] = avg / ((L2_SIZE - t * STEP) * global_work_size);
+
+        /* The image uses 'in' as its storage, so it goes first. */
+        clReleaseMemObject(imgIn);
+        imgIn = NULL;
+        free(in);
+        in = NULL;
+    }
+
+    /* Look for the biggest drop between two neighbouring candidates. */
+    max = 0;
+    i = L2_SIZE + 1;
+    for (t = 1; t < steps; t++) {
+        d = (cl_double)times[t - 1];
+        d /= times[t];
+        if (d > max) {
+            max = d;
+            i = L2_SIZE - t * STEP;
+        }
+    }
+    /* 'i' keeps its initial value if no candidate has been picked. */
+    if (i != L2_SIZE + 1) {
+        result = closestPowerOf2(i);
+    }
+
+cleanup:
+    /* Single exit path: 'err' is CL_SUCCESS here unless a step has failed. */
+    if (event != NULL) {
+        clReleaseEvent(event);
+    }
+    if (imgIn != NULL) {
+        clReleaseMemObject(imgIn);
+    }
+    free(in);
+    if (bufOut != NULL) {
+        clReleaseMemObject(bufOut);
+    }
+    if (kernel != NULL) {
+        clReleaseKernel(kernel);
+    }
+    if (queue != NULL) {
+        clReleaseCommandQueue(queue);
+    }
+    if (ctx != NULL) {
+        clReleaseContext(ctx);
+    }
+    free(times);
+
+    if (error != NULL) {
+        *error = err;
+    }
+    return result;
+}
+
+/*
+ * Report the associativity of the L1 cache.
+ *
+ * Nothing is measured yet (see the TODO below): both arguments are ignored,
+ * the answer is always 32 and 'error', if given, is set to CL_SUCCESS.
+ */
+cl_uint
+deviceL1CacheAssoc(
+    cl_device_id device,
+    cl_ulong l1CacheSize,
+    cl_int *error)
+{
+    /* TODO: Implementation needed. */
+    const cl_uint assoc = 32;
+
+    (void)l1CacheSize;
+    (void)device;
+
+    if (error) {
+        *error = CL_SUCCESS;
+    }
+
+    return assoc;
+}
+
+/*
+ * Round 'x' to the closest power of 2.
+ *
+ * 0 gives 0 and a power of 2 gives itself. A value exactly half-way between
+ * two powers is rounded up. Values above 2^63 give 2^63, the largest power
+ * of 2 a 64-bit cl_ulong can hold.
+ */
+static cl_ulong
+closestPowerOf2(cl_ulong x)
+{
+    const cl_ulong highest = (cl_ulong)1 << 63;
+    cl_ulong below, above;
+
+    if (x == 0) {
+        return 0;
+    }
+    /*
+     * Without this check 'above' would be shifted out to 0 for such values
+     * and the search loop below would never end.
+     */
+    if (x >= highest) {
+        return highest;
+    }
+
+    /* Find the first power of 2 that is not less than x. */
+    for (above = 1; above < x; above <<= 1) {
+        ; /* just iterate */
+    }
+    if (above == x) {
+        return x;
+    }
+    below = above >> 1;
+
+    if ((x - below) < (above - x)) {
+        return below;
+    }
+    return above;
+}
diff --git a/src/library/common/devinfo.c b/src/library/common/devinfo.c
new file mode 100644
index 0000000..dc37426
--- /dev/null
+++ b/src/library/common/devinfo.c
@@ -0,0 +1,312 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <defbool.h>
+
+#include <devinfo.h>
+
+/*
+ * Translate a vendor string into a DeviceVendor value.
+ *
+ * The comparison is exact and case sensitive; any string which is not
+ * listed here gives VENDOR_UNKNOWN.
+ */
+static DeviceVendor
+stringToVendor(const char *str)
+{
+    if (strcmp(str, "Advanced Micro Devices, Inc.") == 0) {
+        return VENDOR_AMD;
+    }
+    if (strcmp(str, "NVIDIA Corporation") == 0) {
+        return VENDOR_NVIDIA;
+    }
+
+    return VENDOR_UNKNOWN;
+}
+
+/*
+ * Translate a device name into a DeviceChip value.
+ *
+ * The name must match one of the known names exactly; anything else
+ * gives CHIP_UNKNOWN.
+ */
+static DeviceChip
+stringToChip(const char *str)
+{
+    /* Known device names, looked through from top to bottom. */
+    static const struct {
+        const char *name;
+        DeviceChip chip;
+    } knownChips[] = {
+        { "Redwood",         REDWOOD },
+        { "Juniper",         JUNIPER },
+        { "Cypress",         CYPRESS },
+        { "Hemlock",         HEMLOCK },
+        { "Cayman",          CAYMAN },
+        { "Tahiti",          TAHITI },
+        { "GeForce GTX 480", GEFORCE_GTX_480 },
+        { "GeForce GTX 580", GEFORCE_GTX_580 }
+    };
+    size_t k;
+
+    for (k = 0; k < sizeof(knownChips) / sizeof(knownChips[0]); k++) {
+        if (strcmp(str, knownChips[k].name) == 0) {
+            return knownChips[k].chip;
+        }
+    }
+
+    return CHIP_UNKNOWN;
+}
+
+/*
+ * Give the device family a chip belongs to.
+ *
+ * Chips without a family assigned here, CAYMAN and TAHITI included,
+ * give DEVICE_FAMILY_UNKNOWN.
+ */
+static DeviceFamily
+devFamily(DeviceChip chip)
+{
+    switch (chip) {
+    case REDWOOD:
+    case JUNIPER:
+    case CYPRESS:
+    case HEMLOCK:
+        return GPU_FAMILY_EVERGREEN;
+
+    case GEFORCE_GTX_480:
+    case GEFORCE_GTX_580:
+        return GPU_FAMILY_FERMI;
+
+    default:
+        return DEVICE_FAMILY_UNKNOWN;
+    }
+}
+
+/*
+ * Fill target->ident for the device target->id.
+ *
+ * The vendor comes from the CL_DEVICE_VENDOR string; the chip, and the family
+ * derived from it, from the CL_DEVICE_NAME string.
+ *
+ * Returns CL_SUCCESS, or the error of the clGetDeviceInfo() call that failed.
+ * If it is the name query that fails, the vendor has already been stored.
+ */
+cl_int
+identifyDevice(TargetDevice *target)
+{
+    char info[4096];
+    DeviceIdent *id = &target->ident;
+    cl_int status;
+
+    status = clGetDeviceInfo(target->id, CL_DEVICE_VENDOR,
+        sizeof(info), info, NULL);
+    if (status == CL_SUCCESS) {
+        id->vendor = stringToVendor(info);
+        /* the same buffer is reused for the device name */
+        status = clGetDeviceInfo(target->id, CL_DEVICE_NAME,
+            sizeof(info), info, NULL);
+    }
+    if (status == CL_SUCCESS) {
+        id->chip = stringToChip(info);
+        id->family = devFamily(id->chip);
+    }
+
+    return status;
+}
+
+/*
+ * Report the wavefront size of a device.
+ *
+ * The device is not consulted: the answer is always 64 and 'error',
+ * if given, is set to CL_SUCCESS.
+ */
+cl_uint
+deviceWavefront(
+    cl_device_id device,
+    cl_int *error)
+{
+    const cl_uint wavefront = 64;
+
+    (void)device;
+
+    if (error) {
+        *error = CL_SUCCESS;
+    }
+
+    return wavefront;
+}
+
+/*
+ * Tell whether a device has native support of complex numbers.
+ *
+ * The device is not consulted: the answer is always 'false' and 'error',
+ * if given, is set to CL_SUCCESS.
+ */
+bool
+deviceHasNativeComplex(
+    cl_device_id device,
+    cl_int *error)
+{
+    const bool nativeComplex = false;
+
+    (void)device;
+
+    if (error) {
+        *error = CL_SUCCESS;
+    }
+
+    return nativeComplex;
+}
+
+/*
+ * Query CL_DEVICE_MAX_COMPUTE_UNITS of a device.
+ *
+ * 'error', if given, receives the status of the query. The result is preset
+ * to 0, so this is what comes back if the query does not store a value.
+ */
+cl_uint
+deviceComputeUnits(
+    cl_device_id device,
+    cl_int *error)
+{
+    cl_uint units = 0;
+    cl_int status;
+
+    status = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
+        sizeof(units), &units, NULL);
+    if (error) {
+        *error = status;
+    }
+
+    return units;
+}
+
+/*
+ * Query CL_DEVICE_MAX_WORK_GROUP_SIZE of a device.
+ *
+ * 'error', if given, receives the status of the query. The result is preset
+ * to 64, so this is what comes back if the query does not store a value.
+ */
+size_t
+deviceMaxWorkgroupSize(
+    cl_device_id device,
+    cl_int *error)
+{
+    size_t wgSize = 64;
+    cl_int status;
+
+    status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+        sizeof(wgSize), &wgSize, NULL);
+    if (error) {
+        *error = status;
+    }
+
+    return wgSize;
+}
+
+/*
+ * Query CL_DEVICE_LOCAL_MEM_SIZE of a device.
+ *
+ * 'error', if given, receives the status of the query. The result is preset
+ * to 0, so this is what comes back if the query does not store a value.
+ */
+cl_ulong
+deviceLDSSize(
+    cl_device_id device,
+    cl_int *error)
+{
+    cl_int err;
+    /* unsigned, as both the queried parameter and the return type are */
+    cl_ulong v;
+
+    v = 0;
+    err = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
+        sizeof(v), &v, NULL);
+    if (error != NULL) {
+        *error = err;
+    }
+    return v;
+}
+
+/*
+ * Query CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE of a device.
+ *
+ * 'error', if given, receives the status of the query. The result is preset
+ * to 0, so this is what comes back if the query does not store a value.
+ */
+cl_uint
+deviceDataAlignment(
+    cl_device_id device,
+    cl_int *error)
+{
+    cl_uint alignment = 0;
+    cl_int status;
+
+    status = clGetDeviceInfo(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE,
+        sizeof(alignment), &alignment, NULL);
+    if (error) {
+        *error = status;
+    }
+
+    return alignment;
+}
+
+/*
+ * Query CL_DEVICE_ADDRESS_BITS of a device.
+ *
+ * 'error', if given, receives the status of the query. The result is preset
+ * to 0, so this is what comes back if the query does not store a value.
+ */
+cl_uint
+deviceAddressBits(
+    cl_device_id device,
+    cl_int *error)
+{
+    cl_uint bits = 0;
+    cl_int status;
+
+    status = clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS,
+        sizeof(bits), &bits, NULL);
+    if (error) {
+        *error = status;
+    }
+
+    return bits;
+}
+
+/*
+ * Tell whether 'name' is one of the entries of the space separated
+ * extension list 'list'. Only whole entries count: a longer name which
+ * merely contains 'name' does not match.
+ */
+static bool
+extensionListed(const char *list, const char *name)
+{
+    const size_t nameLen = strlen(name);
+    const char *s;
+
+    for (s = strstr(list, name); s != NULL; s = strstr(s + 1, name)) {
+        if ((s != list) && (s[-1] != ' ')) {
+            continue;   /* tail of some longer name */
+        }
+        if ((s[nameLen] == ' ') || (s[nameLen] == '\0')) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/*
+ * Tell whether a device supports double precision natively.
+ *
+ * This is the case if the device reports a non-zero preferred vector width
+ * for double or lists the cl_amd_fp64 extension.
+ *
+ * 'error', if given, receives CL_SUCCESS or the code of the failure;
+ * on failure the result is 'false'.
+ */
+bool
+deviceHasNativeDouble(
+    cl_device_id device,
+    cl_int *error)
+{
+    cl_int err;
+    cl_uint v;
+    size_t len;
+    char *extensions;
+    bool found;
+
+    /*
+     * Since OpenCL 1.1 the preferred vector width for double is 0 when
+     * cl_khr_fp64 is not supported, so a non-zero width is taken as
+     * double precision support.
+     */
+
+    err = clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
+        sizeof(cl_uint), &v, NULL);
+    if (err != CL_SUCCESS) {
+        if (error != NULL) {
+            *error = err;
+        }
+        return false;
+    }
+    if (v != 0) {
+        if (error != NULL) {
+            *error = CL_SUCCESS;
+        }
+        return true;
+    }
+
+    /* Check extensions */
+
+    err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &len);
+    if (err != CL_SUCCESS) {
+        if (error != NULL) {
+            *error = err;
+        }
+        return false;
+    }
+    /*
+     * One spare zeroed byte keeps the list terminated whatever the reported
+     * length is, and makes the allocation size non-zero.
+     */
+    extensions = calloc(len + 1, 1);
+    if (extensions == NULL) {
+        if (error != NULL) {
+            *error = CL_OUT_OF_HOST_MEMORY;
+        }
+        return false;
+    }
+    err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, len, extensions, NULL);
+    if (err != CL_SUCCESS) {
+        free(extensions);
+        if (error != NULL) {
+            *error = err;
+        }
+        return false;
+    }
+
+    /* Check for cl_amd_fp64 extension */
+    found = extensionListed(extensions, "cl_amd_fp64");
+
+    free(extensions);
+    if (error != NULL) {
+        *error = CL_SUCCESS;
+    }
+    return found;
+}
diff --git a/src/library/common/gens/dblock_kgen.c b/src/library/common/gens/dblock_kgen.c
new file mode 100644
index 0000000..b30b391
--- /dev/null
+++ b/src/library/common/gens/dblock_kgen.c
@@ -0,0 +1,1497 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <stdio.h>
+
+#include <dis_warning.h>
+#include <dblock_kgen.h>
+
+/*
+ * TODO:
+ * 1) barriers in the case when work group size is greater than the
+ *    wavefront size
+ * 2) 2D dimensional work group size
+ * 3) Try version with array indexing
+ * 4) Option to avoid unaligned access to vector data (?)
+ */
+
+// Description of the work performed by work items
+typedef struct ItemWork {
+    // number of rows to be processed by a single work item
+    size_t nrRows;
+    // number of columns to be processed by a single work item
+    size_t nrCols;
+    // number of items processing the same row
+    unsigned int itemsPerRow;
+    // total number of items performing the work
+    unsigned int nrItems;
+    // reduced number of rows at the block tail
+    size_t blockTail;
+    // size of the work for the row tail which is not packed in float4
+    size_t tail;
+} ItemWork;
+
+/*
+ * Private data for loop unrolling
+ *
+ * NOTE: lmemLD is not used if 'locLDName' is initialized.
+ * NOTE(review): the original note read "if both 'locLDName' is initialized";
+ *       a second name seems to be missing from it - to be confirmed.
+ */
+typedef struct GenPriv {
+    DBlockCopyDirection dir;
+    bool transp;
+    bool packed;
+    bool conjugate;
+    bool notVectorize;
+    // local memory block leading dimension
+    size_t lmemLD;
+    // local memory leading dimension variable name
+    const char *locLDName;
+    // global memory leading dimension variable name
+    const char *globLDName;
+    DataType dtype;
+    // typeSize / sizeof(float), as set by initGenPriv()
+    unsigned int nfloats;
+    unsigned int typeSize;
+    const SubproblemDim *dim;
+    const ItemWork *work;
+    const char *srcName;
+    const char *dstName;
+    // variables names used while copying to images
+    const char *imgXName;
+    const char *imgYName;
+    // reset to 0 by initGenPriv()
+    size_t cnt;
+    // The block size used for copying.
+    // The default is 4.
+    unsigned int vecLen;
+
+} GenPriv;
+
+
+/*
+ *  Declaration templates of the block copying functions. The templates
+ *  contain printf-style conversion specifications to be substituted
+ *  when a function is generated.
+ *
+ *  'ld' in the list of arguments is matrix leading dimension
+ *
+ *  Common name forming rule:
+ *  (type prefix)(generic part)['Transp']['Conj']['Nvec'][src mem][dst mem][block height][block width]
+ */
+const char *copyMemDBlockDecl =
+    "void\n"
+    "%ccopyDBlock%s%s%s%c%c%lu%lu(\n"
+    "    %cPtr dst,\n"
+    "    %cPtr src,\n"
+    "    uint startRow,\n"
+    "    uint startCol,\n"
+    "    uint ld)\n";
+
+/*
+ * The same for a function taking a 'GPtr' source and a write-only image
+ * as the destination
+ */
+const char *copyMemGImgDBlockDecl =
+    "void\n"
+    "%ccopyDBlock%sGI%lux%lu(\n"
+    "    __write_only image2d_t dst,\n"
+    "    int startX,\n"
+    "    int startY,\n"
+    "    GPtr src,\n"
+    "    uint startRow,\n"
+    "    uint startCol,\n"
+    "    uint ld)\n";
+
+/*
+ * The same for a function taking an 'LPtr' source and a write-only image
+ * as the destination
+ */
+const char *copyMemLImgDBlockDecl =
+    "void\n"
+    "%ccopyDBlock%sLI%lux%lu(\n"
+    "    __write_only image2d_t dst,\n"
+    "    int startX,\n"
+    "    int startY,\n"
+    "    LPtr src)\n";
+
+/*
+ * declaration for function performing slow data block copying;
+ * unlike the declarations with the block sizes in the name, it takes
+ * the number of rows and columns as arguments
+ */
+const char *copyMemDBlockSlowDecl =
+    "void\n"
+    "%ccopyDBlock%s%s%s%c%c(\n"
+    "    %cPtr dst,\n"
+    "    %cPtr src,\n"
+    "    uint startRow,\n"
+    "    uint startCol,\n"
+    "    uint nrRows,\n"
+    "    uint nrCols,\n"
+    "    uint dstLD,\n"
+    "    uint srcLD)\n";
+
+/*
+ * declaration for function performing slow data to image block copying
+ * from a 'GPtr' source
+ */
+const char *copyMemGImgDBlockSlowDecl =
+    "void\n"
+    "%ccopyDBlock%sGI(\n"
+    "    __write_only image2d_t dst,\n"
+    "    int startX,\n"
+    "    int startY,\n"
+    "    GPtr src,\n"
+    "    uint startRow,\n"
+    "    uint startCol,\n"
+    "    uint nrRows,\n"
+    "    uint nrCols,\n"
+    "    uint srcLD)\n";
+
+/*
+ * declaration for function performing slow data to image block copying
+ * from an 'LPtr' source
+ */
+const char *copyMemLImgDBlockSlowDecl =
+    "void\n"
+    "%ccopyDBlock%sLI(\n"
+    "    __write_only image2d_t dst,\n"
+    "    int startX,\n"
+    "    int startY,\n"
+    "    LPtr src,\n"
+    "    uint nrRows,\n"
+    "    uint nrCols,\n"
+    "    uint srcLD)\n";
+
+/*
+ * local variables for slow copying between the global and
+ * the local memory
+ */
+
+const char *copyMemSlowLvars =
+    "uint i, j, n;\n"
+    /*
+     * end counters for copying with vector blocks and just vectors
+     * depending on copying type and direction
+     */
+    "uint jb, jv;\n"
+    // end counter for copying single data items smaller than float4
+    "%s"
+    // temporary float4 variable for the transposing version
+    "%s"
+    "%cPtr dst1;\n"
+    "%cPtr src1;\n\n";
+
+/*
+ * Two versions of the code starting the slow copying: the first one
+ * passes over the rows, and the second one passes over the columns.
+ * The second variant is used for transposed copying from the local
+ * to the global memory.
+ */
+const char *copyMemDBlockSlowStart[2] = {
+    "if (nrRows %% lsize) {\n"
+    "    n = nrRows / lsize + 1;\n"
+    "}\n"
+    "else {\n"
+    "    n = nrRows / lsize;\n"
+    "}\n"
+    "\n"
+    // set counters for vector copying
+    "jb = nrCols / %u;\n"
+    "jv = (nrCols - jb * %u) / %u;\n"
+    // set counter end for copying data whose size is less than float4
+    "%s"
+    // set pointers to initial position
+    "%s"
+    "%s"
+    "n = (n * lid >= nrRows) ? 0 : n;\n"
+    "n = (n * lid + n > nrRows) ? (n - 1) : n;\n"
+    "\n",
+
+    "if (nrCols %% lsize) {\n"
+    "    n = nrCols / lsize + 1;\n"
+    "}\n"
+    "else {\n"
+    "    n = nrCols / lsize;\n"
+    "}\n"
+    "\n"
+    // set counters for vector copying
+    "jb = nrRows / %u;\n"
+    "jv = (nrRows - jb * %u) / %u;\n"
+    // set counter end for copying data whose size is less than float4
+    "%s"
+    // set pointers to initial position
+    "%s"
+    "%s"
+    "n = (n * lid >= nrCols) ? 0 : n;\n"
+    "n = (n * lid + n > nrCols) ? (n - 1) : n;\n"
+    "\n"
+};
+
+/*
+ * declaration for function zeroing float4 aligned
+ * block of data
+ */
+const char *f4zeroDecl =
+    "void\n"
+    "%cf4zero%lu(%s float4 *data)\n";
+
+/*
+ * declaration for the zeroing function which takes the size
+ * as the 'cnt' argument
+ */
+const char *fzeroSlowDecl = "void\n"
+                            "%cf4zero(%s float4 *buf, size_t cnt)\n";
+
+/*
+ * Loop of slow copying to an image: each of 'n' passes writes 'jb' groups
+ * of four float4 values and then 'jv' single float4 values with
+ * write_imageui(), then steps to the next image line and advances
+ * the source by 'srcLD'
+ */
+const char *copyMemImgDBlockSlow =
+    "for (i = 0; i < n; i++) {\n"
+    "    int x1 = x;\n"
+    "    int y1 = y;\n"
+    "    %cPtr src1 = src;\n"
+    "\n"
+    "    for (j = 0; j < jb; j++) {\n"
+    "        write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
+    "        write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
+    "        write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
+    "        write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
+    "    }\n"
+    "    for (j = 0; j < jv; j++) {\n"
+    "        write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
+    "    }\n"
+    "\n"
+    "    y++;\n"
+    "    src.%s += srcLD;\n"
+    "}\n";
+
+
+/*
+ * The packed version of the loop above: the image coordinates are
+ * recomputed at every pass from 'index', 'nLines' and 'nrCols'
+ */
+const char *copyMemImgDBlockPackedSlow =
+    "for (i = 0; i < n; i++) {\n"
+    "    %cPtr src1 = src;\n"
+    "    x = startX + ((index + i) %% nLines) * nrCols / %lu;\n"
+    "    y = startY + (index + i) / nLines;\n"
+    "\n"
+    "    for (j = 0; j < jb; j++) {\n"
+    "        write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
+    "        write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
+    "        write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
+    "        write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
+    "    }\n"
+    "    for (j = 0; j < jv; j++) {\n"
+    "        write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
+    "    }\n"
+    "\n"
+    "    src.%s += srcLD;\n"
+    "}\n";
+
+/*
+ * Statement template setting the "nrows" loop bound of a work item when the
+ * bound differs between work items.
+ *
+ * All four placeholders are %u. As filled in by prepareLoop(), they are,
+ * in order:
+ *   - index of the last working item (items with a greater "lid" get 0 rows)
+ *   - the same index again, selecting the item that gets the block tail
+ *   - number of rows for that last item
+ *   - number of rows for every other working item
+ */
+const char *setLoopBoundStmt =
+        "if (lid > %u) {\n"
+        "   nrows = 0;\n"
+        "}\n"
+        "else {\n"
+        "   nrows = (lid == %u) ? %u : %u;\n"
+        "}\n";
+
+/*
+ * Declarations of the per work item pointers "src1" and "dst1" used inside
+ * the unrolled loops. Each %c is the prefix of the respective pointer type
+ * name ("%cPtr").
+ */
+const char *privatePtrs =
+        "%cPtr src1;\n"
+        "%cPtr dst1;\n";
+
+// name of the generated variable holding the outer loop bound
+const char *lboundVarName = "nrows";
+// name of the generated variable holding the local id of the work item
+const char *lidVarName = "lid";
+
+
+/*
+ * Fill in the part of the generator private data that follows directly
+ * from the arguments.
+ *
+ * The vector length defaults to FLOAT4_VECLEN. For copying between the
+ * global and the local memory it is instead derived from the block size in
+ * floats divided by the work group size; a result below 1 becomes 1 and a
+ * result above 4 becomes FLOAT4_VECLEN.
+ */
+static void
+initGenPriv(
+    GenPriv *priv,
+    DataType dtype,
+    unsigned int typeSize,
+    const SubproblemDim *dim,
+    DBlockCopyDirection dir,
+    const ItemWork *work,
+    const PGranularity *pgran)
+{
+    unsigned int wgItems;
+    const bool memToMem = (dir == DBLOCK_GLOBAL_TO_LOCAL) ||
+                          (dir == DBLOCK_LOCAL_TO_GLOBAL);
+
+    priv->dtype = dtype;
+    priv->typeSize = typeSize;
+    priv->nfloats = typeSize / sizeof(float);
+    priv->dim = dim;
+    priv->dir = dir;
+    priv->work = work;
+    priv->cnt = 0;
+    priv->vecLen = FLOAT4_VECLEN;
+
+    if (!memToMem) {
+        return;
+    }
+
+    // floats falling to each work item of the group
+    wgItems = pgran->wgSize[0] * pgran->wgSize[1];
+    priv->vecLen = (unsigned int)(dim->x * dim->y * priv->nfloats / wgItems);
+
+    if (priv->vecLen > 4) {
+        priv->vecLen = FLOAT4_VECLEN;
+    }
+    else if (priv->vecLen < 1) {
+        priv->vecLen = 1;
+    }
+}
+
+/*
+ * Split a block among the items of a work group.
+ *
+ * The structure is zeroed first, so "nrItems" and "blockTail" stay 0
+ * unless they are set below. The number of columns processed by an item is
+ * finally cut down so that it takes a whole number of vectors of "vecLen"
+ * floats; the columns of a row not covered this way are reported in
+ * "tail", for which code is generated right after the vectorized loop.
+ */
+static void
+getItemWork(ItemWork *work, const SubproblemDim *dim,
+            const PGranularity *pgran, size_t nfloats,
+            unsigned int vecLen)
+{
+    size_t wgItems;
+    size_t leftover;
+
+    memset(work, 0, sizeof(ItemWork));
+    wgItems = pgran->wgSize[0] * pgran->wgSize[1];
+
+    if (dim->y >= wgItems) {
+        // a work item takes whole rows, typically several (or none at all)
+        work->itemsPerRow = 1;
+        work->nrCols = dim->x;
+        work->nrRows = dim->y / wgItems;
+        if (dim->y % wgItems != 0) {
+            // the rows do not divide evenly: round the per item count up
+            work->nrRows++;
+            work->nrItems = (unsigned int)(dim->y / work->nrRows);
+            leftover = dim->y - work->nrItems * work->nrRows;
+            if (leftover != 0) {
+                // one more item is needed to transfer the rows left over
+                work->blockTail = leftover;
+                work->nrItems++;
+            }
+        }
+    }
+    else {
+        // a row is shared by several work items (some may get nothing)
+        work->itemsPerRow = (unsigned int)(wgItems / dim->y);
+        work->nrCols = dim->x / work->itemsPerRow;
+        work->nrRows = 1;
+        if (work->itemsPerRow * dim->y < wgItems) {
+            work->nrItems = (unsigned int)(work->itemsPerRow * dim->y);
+        }
+    }
+
+    // drop the columns that do not fill a whole vector ...
+    work->nrCols -= (work->nrCols * nfloats % vecLen) / nfloats;
+    // ... and account for everything in a row the items do not cover
+    work->tail = dim->x - work->nrCols * work->itemsPerRow;
+}
+
+/*
+ * Prepare generator outer loop
+ *
+ * Declares the outer loop counter "n" in the generated code and fills
+ * the outer loop bound of "loopCtl":
+ *  - if only a part of the work items takes part in the copying
+ *    (work->nrItems != 0), the bound is the generated variable named by
+ *    lboundVarName, and the code initializing it is emitted here;
+ *  - otherwise the bound is the constant work->nrRows.
+ */
+static void
+prepareLoop(struct KgenContext *ctx, ItemWork *work, LoopCtl *loopCtl)
+{
+    char tmp[1024];
+
+    kgenAddStmt(ctx, "size_t n;\n");
+    loopCtl->ocName = "n";
+
+    if (work->nrItems) {
+        sprintf(tmp, "size_t %s;\n\n", lboundVarName);
+        kgenAddStmt(ctx, tmp);
+
+        /*
+         * set number of rows to be processed by the work item;
+         * in the case it is not a constant
+         */
+        if (work->blockTail) {
+            /*
+             * All conversions in setLoopBoundStmt are %u, so the row
+             * counters are cast explicitly to match them.
+             */
+            sprintf(tmp, setLoopBoundStmt, work->nrItems - 1, work->nrItems - 1,
+                    (unsigned int)work->blockTail,
+                    (unsigned int)work->nrRows);
+            kgenAddStmt(ctx, tmp);
+        }
+        else {
+            // the cast keeps the argument in step with the %lu conversion
+            sprintf(tmp, "nrows = (%s >= %u) ? 0 : %lu;\n", lidVarName,
+                    work->nrItems, (unsigned long)work->nrRows);
+            kgenAddStmt(ctx, tmp);
+        }
+
+        loopCtl->outBound.name = lboundVarName;
+    }
+    else {
+        loopCtl->outBound.val = (unsigned long)work->nrRows;
+        loopCtl->obConst = true;
+    }
+}
+
+// unrolling callback reporting the vector length kept in the private data
+static int
+getVecLen(struct KgenContext *ctx, void *priv)
+{
+    const GenPriv *info = (const GenPriv*)priv;
+
+    (void) ctx;
+
+    return info->vecLen;
+}
+
+/*
+ * Common function generating the code for a row tail.
+ *
+ * The tail (work->tail elements) is unrolled with the passed generators.
+ * When a row is shared by several work items, the generated code is put
+ * under a condition selecting the last item of each row, additionally
+ * restricted to the working items if work->nrItems is set.
+ */
+static void
+addTailCode(
+    struct KgenContext *ctx,
+    GenPriv *gpriv,
+    LoopUnrollGen genSingleVec,
+    LoopUnrollGen genSingle)
+{
+    char cond[1024];
+    const ItemWork *work = gpriv->work;
+    const bool sharedRow = (work->itemsPerRow > 1);
+    LoopCtl tailLoop;
+    LoopUnrollers gens;
+
+    memset(&tailLoop, 0, sizeof(tailLoop));
+    tailLoop.inBound = (unsigned long)work->tail;
+
+    memset(&gens, 0, sizeof(gens));
+    gens.genSingleVec = genSingleVec;
+    gens.genSingle = genSingle;
+    gens.getVecLen = getVecLen;
+
+    if (sharedRow) {
+        if (work->nrItems) {
+            sprintf(cond, "if ((%s %% %u == %u) && (%s < %u))",
+                    lidVarName, work->itemsPerRow, work->itemsPerRow - 1,
+                    lidVarName, work->nrItems);
+        }
+        else {
+            sprintf(cond, "if (%s %% %u == %u)",
+                    lidVarName, work->itemsPerRow, work->itemsPerRow - 1);
+        }
+        kgenBeginBranch(ctx, cond);
+    }
+
+    kgenLoopUnroll(ctx, &tailLoop, gpriv->dtype, &gens, gpriv);
+
+    if (sharedRow) {
+        kgenEndBranch(ctx, NULL);
+    }
+}
+
+/*
+ * Pre-unroll callback for memory copying: reset the private pointers
+ * "src1" and "dst1" to the current "src" and "dst". Only the status of the
+ * second statement is returned.
+ */
+static int
+copyMemPreUnroll(struct KgenContext *ctx, void *priv)
+{
+    int ret;
+
+    DUMMY_ARG_USAGE(priv);
+
+    kgenAddStmt(ctx, "src1 = src;\n");
+    ret = kgenAddStmt(ctx, "dst1 = dst;\n\n");
+
+    return ret;
+}
+
+/*
+ * Pre-unroll callback for copying to an image: set the private image
+ * coordinates and the private source pointer before a row is processed.
+ * In the packed case the coordinates are computed from "index" and
+ * "pLine", otherwise they are taken from "x" and "y".
+ */
+static int
+copyImgPreUnroll(struct KgenContext *ctx, void *priv)
+{
+    char stmt[1024];
+    GenPriv *gpriv = (GenPriv*)priv;
+
+    if (!gpriv->packed) {
+        sprintf(stmt, "%s = x;\n" "%s = y;\n" "%s = src;\n\n", gpriv->imgXName,
+                gpriv->imgYName, gpriv->srcName);
+    }
+    else {
+        sprintf(stmt, "%s = startX + (index * %lu) %% pLine / %u;\n"
+                "%s = startY + (index * %lu) / pLine;\n" "%s = src;\n\n",
+                gpriv->imgXName, gpriv->dim->x, FLOAT4_VECLEN / gpriv->nfloats,
+                gpriv->imgYName, gpriv->dim->x, gpriv->srcName);
+    }
+
+    return kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Unroll callback: emit a single write of one float4 word from the source
+ * pointer to the image, advancing both the source pointer and the image
+ * x coordinate.
+ */
+static int
+copyImgVec(struct KgenContext *ctx, void *priv)
+{
+    char stmt[1024];
+    GenPriv *gpriv = (GenPriv*)priv;
+
+    // the returned field name is not used here; the call is kept as it was
+    (void)dtypeUPtrField(gpriv->dtype);
+
+    sprintf(stmt, "write_imageui(%s, (int2)(%s++,%s), as_uint4(*%s.f4v++));\n",
+            gpriv->dstName, gpriv->imgXName, gpriv->imgYName, gpriv->srcName);
+
+    return kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Unroll callback for a single element written to an image. Only double
+ * complex elements are accepted, in which case the float4 write of
+ * copyImgVec() is emitted; any other type yields -EINVAL.
+ */
+static int
+copyImgSingle(struct KgenContext *ctx, void *priv)
+{
+    const GenPriv *gpriv = (const GenPriv*)priv;
+
+    return (gpriv->dtype == TYPE_COMPLEX_DOUBLE) ? copyImgVec(ctx, priv)
+                                                 : -EINVAL;
+}
+
+/*
+ * Unroll callback: emit copying of one vector between memory pointers.
+ *
+ * The pointer union field is "f" for the vector length 1 and "f<len>v"
+ * otherwise. With conjugation the vector goes through the "tmp" variable
+ * of the generated code, where the imaginary components are negated.
+ */
+static int
+copyMemVec(struct KgenContext *ctx, void *priv)
+{
+    char stmt[1024];
+    char field[64];
+    GenPriv *gpriv = (GenPriv*)priv;
+
+    if (gpriv->vecLen != 1) {
+        sprintf(field,"f%dv", gpriv->vecLen);
+    }
+    else {
+        sprintf(field,"f");
+    }
+
+    if (!gpriv->conjugate) {
+        // plain copy
+        sprintf(stmt, "*%s.%s++ = *%s.%s++;\n", gpriv->dstName, field,
+                gpriv->srcName, field);
+        return kgenAddStmt(ctx, stmt);
+    }
+
+    // load, negate the imaginary part(s), store
+    sprintf(stmt, "tmp = *%s.%s++;\n", gpriv->srcName, field);
+    kgenAddStmt(ctx, stmt);
+    if (gpriv->dtype == TYPE_COMPLEX_FLOAT) {
+        kgenAddStmt(ctx, "tmp.y = -tmp.y;\n"
+                         "tmp.w = -tmp.w;\n");
+    }
+    else {
+        kgenAddStmt(ctx, "tmp.y = -tmp.y;\n");
+    }
+    sprintf(stmt, "*%s.%s++ = tmp;\n", gpriv->dstName, field);
+
+    return kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Unroll callback: emit copying of a single element between memory
+ * pointers, using the pointer union field matching the data type.
+ * With conjugation the element is stored first, its "y" component is then
+ * negated in place, and only after that the destination pointer advances.
+ */
+static int
+copyMemSingle(struct KgenContext *ctx, void *priv)
+{
+    char stmt[1024];
+    GenPriv *gpriv = (GenPriv*)priv;
+    const char *field = dtypeUPtrField(gpriv->dtype);
+
+    if (!gpriv->conjugate) {
+        sprintf(stmt, "*%s.%s++ = *%s.%s++;\n",
+                gpriv->dstName, field, gpriv->srcName, field);
+        return kgenAddStmt(ctx, stmt);
+    }
+
+    sprintf(stmt, "*%s.%s = *%s.%s++;\n",
+            gpriv->dstName, field, gpriv->srcName, field);
+    kgenAddStmt(ctx, stmt);
+
+    sprintf(stmt, "(*%s.%s).y = -(*%s.%s).y;\n",
+            gpriv->dstName, field, gpriv->dstName, field);
+    kgenAddStmt(ctx, stmt);
+
+    sprintf(stmt, "%s.%s++;\n", gpriv->dstName, field);
+
+    return kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Unroll callback: emit transposed copying of one float4 word.
+ *
+ * For the global to local direction a float4 is loaded to "tmp"
+ * (and conjugated if requested), then each element held in it is stored
+ * with a stride of the local lead dimension. For the opposite direction
+ * the elements are gathered with that stride into "tmp", which is then
+ * stored as a float4. Finally the strided pointer is advanced past the
+ * elements just processed.
+ *
+ * The lead dimension is either the generated variable gpriv->locLDName or,
+ * if that name is not set, the constant gpriv->lmemLD.
+ *
+ * Returns the status of the last added statement, or -EINVAL if the
+ * element does not consist of 1 or 2 floats.
+ */
+static int
+copyMemVecTransp(struct KgenContext *ctx, void *priv)
+{
+    char tmp[1024];
+    size_t i;
+    GenPriv *gpriv = (GenPriv*)priv;
+    unsigned int n = gpriv->nfloats;
+    const char *tmpSuff[2][4] = {
+            {"x", "y", "z", "w"},
+            {"xy", "zw", NULL, NULL}};
+    const char *dstSuff[4] = {"f", "f2v", NULL, "f4v"};
+    const char *vfield;
+    const char *s;
+
+    /*
+     * The tables above are indexed with n - 1 and only have entries for
+     * elements of 1 or 2 floats. Anything else would index them out of
+     * bounds (or divide by zero below), so reject it before any code
+     * is emitted.
+     */
+    if ((n != 1) && (n != 2)) {
+        return -EINVAL;
+    }
+
+    vfield = dtypeUPtrField(gpriv->dtype);
+    kgenAddBlankLine(ctx);
+
+    if (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) {
+        sprintf(tmp, "tmp = *%s.f4v++;\n", gpriv->srcName);
+        kgenAddStmt(ctx, tmp);
+
+        if (gpriv->conjugate) {
+            /*
+             * Only complex float elements can be conjugated here,
+             * those of double complex type are processed with a non
+             * vectorized function
+             */
+            kgenAddStmt(ctx, "tmp.y = -tmp.y;\n"
+                             "tmp.w = -tmp.w;\n");
+        }
+
+        // scatter the elements of the loaded float4
+        for (i = 0; i < FLOAT4_VECLEN / n; i++) {
+            if (gpriv->locLDName) {
+                sprintf(tmp, "%s.%s[%s * %lu] = tmp.%s;\n",
+                        gpriv->dstName, dstSuff[n - 1],
+                        gpriv->locLDName, (unsigned long)i,
+                        tmpSuff[n - 1][i]);
+            }
+            else {
+                sprintf(tmp, "%s.%s[%lu] = tmp.%s;\n", gpriv->dstName,
+                        dstSuff[n - 1], (unsigned long)(gpriv->lmemLD * i),
+                        tmpSuff[n - 1][i]);
+            }
+            kgenAddStmt(ctx, tmp);
+        }
+        s = gpriv->dstName;
+    }
+    else {
+        // gather the elements into a float4
+        for (i = 0; i < FLOAT4_VECLEN / n; i++) {
+            if (gpriv->locLDName) {
+                sprintf(tmp, "tmp.%s = %s.%s[%s * %lu];\n", tmpSuff[n - 1][i],
+                        gpriv->srcName, dstSuff[n - 1], gpriv->locLDName,
+                        (unsigned long)i);
+            }
+            else {
+                sprintf(tmp, "tmp.%s = %s.%s[%lu];\n", tmpSuff[n - 1][i],
+                        gpriv->srcName, dstSuff[n - 1],
+                        (unsigned long)(gpriv->lmemLD * i));
+            }
+            kgenAddStmt(ctx, tmp);
+        }
+
+        sprintf(tmp, "*%s.f4v++ = tmp;\n", gpriv->dstName);
+        kgenAddStmt(ctx, tmp);
+
+        s = gpriv->srcName;
+    }
+
+    /*
+     * "i" now holds the number of elements processed; advance the strided
+     * pointer past them
+     */
+    if (gpriv->locLDName) {
+        sprintf(tmp, "%s.%s += %s * %lu;\n", s, vfield, gpriv->locLDName,
+                (unsigned long)i);
+    }
+    else {
+        sprintf(tmp, "%s.%s += %lu;\n", s, vfield,
+                (unsigned long)(gpriv->lmemLD * i));
+    }
+
+    return kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Unroll callback: emit transposed copying of a single element.
+ *
+ * The strided side is the destination for the global to local direction
+ * and the source otherwise. If gpriv->locLDName is set, the strided
+ * pointer is moved by that generated variable after each element;
+ * otherwise the element is addressed with the constant index
+ * gpriv->lmemLD * gpriv->cnt, and gpriv->cnt is incremented on every call
+ * (in both cases).
+ *
+ * Conjugation is only emitted for the global to local direction.
+ *
+ * Returns the status of the last added statement.
+ */
+static int
+copyMemSingleTransp(struct KgenContext *ctx, void *priv)
+{
+    char tmp[1024];
+    GenPriv *gpriv = (GenPriv*)priv;
+    const char *vfield;
+
+    vfield = dtypeUPtrField(gpriv->dtype);
+    kgenAddBlankLine(ctx);
+
+    if (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) {
+        if (gpriv->locLDName) {
+            // store, optionally negate "y" in place, then step by the LD
+            sprintf(tmp, "*%s.%s = *%s.%s++;\n",
+                    gpriv->dstName, vfield,
+                    gpriv->srcName, vfield);
+            kgenAddStmt(ctx, tmp);
+
+            if (gpriv->conjugate) {
+                sprintf(tmp, "(*%s.%s).y = -(*%s.%s).y;\n",
+                        gpriv->dstName, vfield, gpriv->dstName,
+                        vfield);
+                kgenAddStmt(ctx, tmp);
+            }
+            sprintf(tmp, "%s.%s += %s;\n",
+                    gpriv->dstName, vfield, gpriv->locLDName);
+        }
+        else {
+            // store at a constant offset; the pointer itself is not moved
+            sprintf(tmp, "%s.%s[%lu] = *%s.%s++;\n",
+                    gpriv->dstName, vfield,
+                    gpriv->lmemLD * gpriv->cnt, gpriv->srcName,
+                    vfield);
+            if (gpriv->conjugate) {
+                // flush the store, the negation becomes the last statement
+                kgenAddStmt(ctx, tmp);
+                sprintf(tmp, "%s.%s[%lu].y = -%s.%s[%lu].y;\n",
+                        gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt,
+                        gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt);
+            }
+        }
+    }
+    else {
+        if (gpriv->locLDName) {
+            // load, then step the source by the LD
+            sprintf(tmp, "*%s.%s++ = *%s.%s;\n"
+                         "%s.%s += %s;\n",
+                    gpriv->dstName, vfield,
+                    gpriv->srcName, vfield,
+                    gpriv->srcName, vfield, gpriv->locLDName);
+        }
+        else {
+            // load from a constant offset
+            sprintf(tmp, "*%s.%s++ = %s.%s[%lu];\n",
+                    gpriv->dstName, vfield, gpriv->srcName, vfield,
+                    gpriv->lmemLD * gpriv->cnt);
+        }
+    }
+    gpriv->cnt++;
+
+    // whatever was left in tmp by the branch above is added here
+    return kgenAddStmt(ctx, tmp);
+}
+
+/*
+ * Generate the code transferring the row tail, i.e. the elements that are
+ * not covered by the main loop.
+ *
+ * The generators are chosen by the copying target (image or memory) and,
+ * for memory, by the transposition flag. The vector generator is dropped
+ * if vectorization is disabled.
+ */
+static void
+addCopyTailCode(struct KgenContext *ctx, GenPriv *gpriv)
+{
+    const bool toImage = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) ||
+                         (gpriv->dir == DBLOCK_LOCAL_TO_IMAGE);
+    LoopUnrollGen vecGen;
+    LoopUnrollGen elemGen;
+
+    if (toImage) {
+        vecGen = copyImgVec;
+        elemGen = copyImgSingle;
+    }
+    else if (gpriv->transp) {
+        vecGen = copyMemVecTransp;
+        elemGen = copyMemSingleTransp;
+    }
+    else {
+        vecGen = copyMemVec;
+        elemGen = copyMemSingle;
+    }
+
+    addTailCode(ctx, gpriv, (gpriv->notVectorize) ? NULL : vecGen, elemGen);
+}
+
+/*
+ * Post-unroll callback for memory copying.
+ *
+ * Emits the row tail code if there is a tail, then moves "src" and "dst"
+ * to the next row: the global side advances by the global lead dimension,
+ * the local side by one element when transposing, otherwise by the local
+ * lead dimension (a generated variable if its name is set, or the
+ * constant gpriv->lmemLD).
+ */
+static int
+copyMemPostUnroll(struct KgenContext *ctx, void *priv)
+{
+    char stmt[1024];
+    GenPriv *gpriv = (GenPriv*)priv;
+    const char *ptrs[2] = {"src", "dst"};
+    const char *field;
+    // index of the pointer to the global memory within ptrs
+    const int glob = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;
+    const int loc = 1 - glob;
+
+    if (gpriv->work && gpriv->work->tail) {
+        addCopyTailCode(ctx, gpriv);
+    }
+
+    if (!gpriv->transp) {
+        kgenAddBlankLine(ctx);
+    }
+
+    field = dtypeUPtrField(gpriv->dtype);
+
+    // global side
+    sprintf(stmt, "%s.%s += %s;\n", ptrs[glob], field, gpriv->globLDName);
+    kgenAddStmt(ctx, stmt);
+
+    // local side
+    if (gpriv->transp) {
+        sprintf(stmt, "%s.%s++;\n", ptrs[loc], field);
+    }
+    else if (gpriv->locLDName) {
+        sprintf(stmt, "%s.%s += %s;\n", ptrs[loc],
+                field, gpriv->locLDName);
+    }
+    else {
+        sprintf(stmt, "%s.%s += %lu;\n", ptrs[loc],
+                field, gpriv->lmemLD);
+    }
+
+    return kgenAddStmt(ctx, stmt);
+}
+
+/*
+ * Post-unroll callback for copying to an image.
+ *
+ * Emits the row tail code if there is a tail, advances "src" to the next
+ * row (by the global lead dimension variable or by the constant
+ * gpriv->lmemLD, depending on the direction) and then moves to the next
+ * image row: "index" is incremented in the packed case, "y" otherwise.
+ *
+ * Returns the status of the last added statement, or -EINVAL if the
+ * copying direction is not one of the image related ones. Nothing is
+ * emitted in the latter case.
+ */
+static int
+copyImgPostUnroll(struct KgenContext *ctx, void *priv)
+{
+    char tmp[1024];
+    GenPriv *gpriv = (GenPriv*)priv;
+    const char *vfield = dtypeUPtrField(gpriv->dtype);
+
+    /*
+     * The statement buffer is only filled for the two image directions;
+     * refuse anything else instead of adding an uninitialized buffer.
+     */
+    if ((gpriv->dir != DBLOCK_GLOBAL_TO_IMAGE) &&
+        (gpriv->dir != DBLOCK_LOCAL_TO_IMAGE)) {
+
+        return -EINVAL;
+    }
+
+    if (gpriv->work && gpriv->work->tail) {
+        addCopyTailCode(ctx, gpriv);
+    }
+
+    kgenAddBlankLine(ctx);
+
+    if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
+        sprintf(tmp, "src.%s += %s;\n", vfield, gpriv->globLDName);
+    }
+    else {
+        sprintf(tmp, "src.%s += %lu;\n", vfield,
+                (unsigned long)gpriv->lmemLD);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    return kgenAddStmt(ctx, (gpriv->packed) ? "index++;\n" : "y++;\n");
+}
+
+// unrolling generator for the f4zero function: zero one word, step forward
+static int
+f4zeroSingle(struct KgenContext *ctx, void *priv)
+{
+    int ret;
+
+    DUMMY_ARG_USAGE(priv);
+
+    ret = kgenAddStmt(ctx, "*data++ = 0;\n");
+
+    return ret;
+}
+
+/*
+ * Add statement setting the initial pointer for the work item
+ *
+ * @ptrName: name of the pointer variable in the generated code
+ * @ld: lead dimension for the local block in float words;
+ *       if it's zero, the pointer is treated as a global one: the
+ *       generated variable named by gpriv->globLDName is used as the lead
+ *       dimension, and "startRow"/"startCol" are added to the offset
+ * @transpose: with a non zero @ld, swaps the distance between rows and the
+ *       distance between elements of a row; not used otherwise
+ *
+ * The offset depends on how the block is split among the work items
+ * (see gpriv->work): either several items share a row, or each item
+ * starts at its own group of work->nrRows rows.
+ */
+static void
+addSettingPtrCode(
+    struct KgenContext *ctx,
+    const char *ptrName,
+    size_t ld,
+    bool transpose,
+    const PGranularity *pgran,
+    GenPriv *gpriv)
+{
+    char tmp[4096];
+    const char *vfield;
+    const SubproblemDim *dim = gpriv->dim;
+    const ItemWork *work = gpriv->work;
+    size_t gsize;
+
+    vfield = dtypeUPtrField(gpriv->dtype);
+    gsize = pgran->wgSize[0] * pgran->wgSize[1];
+
+    if (ld) {
+        // offset between two rows and two elements in each row
+        size_t roff, eoff;
+
+        if (transpose) {
+            roff = 1;
+            eoff = ld;
+        }
+        else {
+            roff = ld;
+            eoff = 1;
+        }
+
+        if (dim->y < gsize) {
+            // row = lid / itemsPerRow, column = (lid % itemsPerRow) * nrCols
+            sprintf(tmp, "%s.%s += (%s / %u) * %lu + (%s %% %u * %lu) * %lu;\n",
+                    ptrName, vfield, lidVarName, work->itemsPerRow,
+                    roff, lidVarName, work->itemsPerRow, work->nrCols, eoff);
+        }
+        else {
+            // each item starts at the row lid * nrRows
+            sprintf(tmp, "%s.%s += %s * %lu * %lu;\n",
+                    ptrName, vfield, lidVarName, work->nrRows, roff);
+        }
+    }
+    else {
+        if (dim->y < gsize) {
+            // same split as above, relative to startRow/startCol
+            sprintf(tmp, "%s.%s += (startRow + %s / %u) * %s + "
+                                   "startCol + %s %% %u * %lu;\n",
+                    ptrName, vfield, lidVarName, work->itemsPerRow,
+                    gpriv->globLDName, lidVarName, work->itemsPerRow, work->nrCols);
+        }
+        else {
+            sprintf(tmp, "%s.%s += (startRow + %s * %lu) * %s + startCol;\n",
+                    ptrName, vfield, lidVarName, work->nrRows, gpriv->globLDName);
+        }
+    }
+
+    kgenAddStmt(ctx, tmp);
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Add statements setting the initial image coordinates for the work item
+ *
+ * In the packed case the generated code computes "pLine" and "index" and
+ * derives the coordinates from them; otherwise the coordinates are derived
+ * from "startX", "startY" and the local id directly. Both cases depend on
+ * whether a row is shared by several work items (dim->y is less than the
+ * work group size) or not.
+ *
+ * NOTE(review): @xName and @yName are only used in the not packed case;
+ * the packed case always writes to the variables literally named "x"
+ * and "y".
+ */
+static void
+addSettingImageXYCode(
+    struct KgenContext *ctx,
+    const char *xName,
+    const char *yName,
+    const PGranularity *pgran,
+    GenPriv *gpriv)
+{
+    char tmp[4096];
+    const ItemWork *work = gpriv->work;
+    size_t gsize = pgran->wgSize[0] * pgran->wgSize[1];
+
+    if (gpriv->packed) {
+        sprintf(tmp, "pLine = ((get_image_width(dst) - startX) * %d / %lu) * %lu;\n",
+                FLOAT4_VECLEN / gpriv->nfloats, gpriv->dim->x, gpriv->lmemLD);
+        kgenAddStmt(ctx, tmp);
+        // index of the first block row processed by the work item
+        if (gpriv->dim->y < gsize) {
+            sprintf(tmp, "index = %s / %u;\n", lidVarName,
+                    work->itemsPerRow);
+        }
+        else {
+            sprintf(tmp, "index = %s * %lu;\n", lidVarName,
+                    work->nrRows);
+        }
+        kgenAddStmt(ctx, tmp);
+        sprintf(tmp, "x = startX + (index * %lu) %% pLine / %u;\n", gpriv->dim->x,
+                FLOAT4_VECLEN / gpriv->nfloats);
+        kgenAddStmt(ctx, tmp);
+        if (gpriv->dim->y < gsize) {
+            // shift within the row for the items sharing it
+            sprintf(tmp, "x += (%s %% %u) * (%lu / %u / %u);\n", lidVarName,
+                    work->itemsPerRow, gpriv->dim->x,
+                    (FLOAT4_VECLEN / gpriv->nfloats), work->itemsPerRow);
+            kgenAddStmt(ctx, tmp);
+        }
+        sprintf(tmp, "y = startY + (index * %lu) / pLine;\n", gpriv->dim->x);
+        kgenAddStmt(ctx, tmp);
+    }
+    else {
+        if (gpriv->dim->y < gsize) {
+            // several items per row: x depends on the position in the row
+            sprintf(tmp, "%s = startX + %s %% %u * %lu / %d;\n",
+                    xName, lidVarName, work->itemsPerRow, work->nrCols,
+                    FLOAT4_VECLEN/gpriv->nfloats);
+            kgenAddStmt(ctx, tmp);
+            sprintf(tmp, "%s = startY + %s / %u;\n", yName, lidVarName,
+                    work->itemsPerRow);
+            kgenAddStmt(ctx, tmp);
+        }
+        else {
+            // whole rows per item: start at the beginning of the own rows
+            sprintf(tmp, "%s = startX;\n", xName);
+            kgenAddStmt(ctx, tmp);
+            sprintf(tmp, "%s = startY + %s * %lu;\n", yName, lidVarName,
+                    gpriv->work->nrRows);
+            kgenAddStmt(ctx, tmp);
+        }
+    }
+
+    kgenAddBlankLine(ctx);
+}
+
+/*
+ * Generator working with subproblems of any dimension.
+ *
+ * The block sizes are arguments of the generated function ("nrRows",
+ * "nrCols"), so the generated code distributes the rows (or the columns,
+ * for the transposed local to global copying) among the work items at
+ * run time.
+ *
+ * For the image directions the function body is built from the
+ * copyMemImgDBlock*Slow templates. For the memory directions the body is
+ * an outer "for" loop with up to three unrolled inner loops: 4 float4
+ * words at once ("jb" iterations), single float4 words ("jv") and single
+ * elements ("js").
+ *
+ * Note that gpriv->notVectorize may be forced to true here, and that the
+ * names in gpriv used by the unrolling callbacks are set.
+ *
+ * Returns -EINVAL for a data type without a usable function prefix,
+ * otherwise the status of closing the function body.
+ */
+static int
+copyDBlockGenericGen(
+    struct KgenContext *ctx,
+    const PGranularity *pgran,
+    GenPriv *gpriv)
+{
+    char fpref;
+    const char varPref[2] = {'G', 'L'};
+    char tmp[1024];
+    bool image;
+    const char *s[3];
+    int gdir;
+    unsigned int i, n, gsize;
+    const char *vfield;
+    DataType dtype = gpriv->dtype;
+
+    fpref = dtypeToPrefix(dtype);
+    if (!fpref || (fpref == 'i')) {
+        return -EINVAL;
+    }
+
+    image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE ||
+             gpriv->dir == DBLOCK_LOCAL_TO_IMAGE);
+    s[0] = (gpriv->transp) ? "Transp" : "";
+    vfield = dtypeUPtrField(dtype);
+    // number of elements in a float4 word
+    n = FLOAT4_VECLEN / gpriv->nfloats;
+    gsize = pgran->wgSize[0] * pgran->wgSize[1];
+
+    if (image) {
+        char srcStr[1024];
+        s[1] = (gpriv->packed) ? "Pack" : "";
+        if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
+            sprintf(srcStr, "src.%s += (startRow + lid * n) *"
+                    " srcLD + startCol;\n", vfield);
+            sprintf(tmp, copyMemGImgDBlockSlowDecl, fpref, s[1]);
+        }
+        else {
+            sprintf(srcStr, "src.%s += srcLD * lid * n;\n", vfield);
+            sprintf(tmp, copyMemLImgDBlockSlowDecl, fpref, s[1]);
+        }
+        kgenDeclareFunction(ctx, tmp);
+        kgenBeginFuncBody(ctx);
+        sprintf(tmp, "int x, y;\n"
+                     "uint i, j, n, jb, jv;\n"
+                     "int lsize = %u;\n", gsize);
+        kgenAddStmt(ctx, tmp);
+        kgenDeclareLocalID(ctx, "lid", pgran);
+        if (gpriv->packed) {
+            char nLinesStr[1024];
+            sprintf(nLinesStr,
+                    "nLines = (get_image_width(dst) - startX) * %d / nrCols;\n"
+                    "index = lid * n;\n", FLOAT4_VECLEN / gpriv->nfloats);
+            sprintf(tmp, "int nLines, index;\n");
+            kgenAddStmt(ctx, tmp);
+            sprintf(tmp, copyMemDBlockSlowStart[0], 4 * n, 4 * n, n,"",
+                    nLinesStr, srcStr);
+        }
+        else {
+            sprintf(tmp, copyMemDBlockSlowStart[0], 4 * n, 4 * n, n, "",
+                    "x = startX;\n" "y = startY + lid * n;\n", srcStr);
+        }
+        kgenAddStmt(ctx, tmp);
+
+        gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) ? 0 : 1;
+        if (gpriv->packed) {
+            /*
+             * The divisor is consumed by a %lu conversion of the template,
+             * hence the explicit cast.
+             */
+            sprintf(tmp, copyMemImgDBlockPackedSlow, varPref[gdir],
+                    (unsigned long)(FLOAT4_VECLEN / gpriv->nfloats), vfield);
+        }
+        else {
+            sprintf(tmp, copyMemImgDBlockSlow, varPref[gdir], vfield);
+        }
+        kgenAddStmt(ctx, tmp);
+    }
+    else {
+        LoopCtl loopCtl;
+        LoopUnrollers unrollers;
+        char buf[3][256];
+
+        memset(&loopCtl, 0, sizeof(loopCtl));
+        memset(&unrollers, 0, sizeof(unrollers));
+
+        s[1] = (gpriv->conjugate) ? "Conj" : "";
+        s[2] = (gpriv->notVectorize) ? "Nvec" : "";
+        gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;
+        sprintf(tmp, copyMemDBlockSlowDecl,
+                fpref, s[0], s[1], s[2], varPref[gdir], varPref[1 - gdir],
+                varPref[1 - gdir], varPref[gdir]);
+        kgenDeclareFunction(ctx, tmp);
+        kgenBeginFuncBody(ctx);
+        kgenDeclareLocalID(ctx, "lid", pgran);
+        sprintf(tmp, "int lsize = %u;\n", gsize);
+        kgenAddStmt(ctx, tmp);
+
+        // from here on s[0] and s[1] hold optional local declarations
+        if (dtype == TYPE_COMPLEX_DOUBLE) {
+            s[0] = "";
+            s[1] = "";
+        }
+        else {
+            s[0] = "uint js;\n";
+            s[1] = (gpriv->transp || gpriv->conjugate) ? "float4 tmp;\n" : "";
+        }
+
+        // pass over rows or columns?
+        i = (gpriv->transp && gdir) ? 1 : 0;
+
+        if (dtype == TYPE_COMPLEX_DOUBLE) {
+            buf[0][0] = '\0';
+        }
+        else {
+            const char *boundName;
+
+            // set counter bound to copy tail part, each work less than float4
+            boundName = (i) ? "nrRows" : "nrCols";
+
+            /*
+             * FIXME: the kludge is introduced due to strange
+             * runtime segfault at block transferring for another
+             * data types. Verify it later. Now, for non float types
+             * keep only simple loop.
+             */
+            if (i && (dtype != TYPE_FLOAT)) {
+                gpriv->notVectorize = true;
+            }
+
+            if (gpriv->notVectorize) {
+                sprintf(buf[0], "jb = 0;\n"
+                                "jv = 0;\n"
+                                "js = %s;\n",
+                        boundName);
+            }
+            else {
+                sprintf(buf[0], "js = %s - jb * %u - jv * %u;\n",
+                        boundName, 4 * n, n);
+            }
+        }
+
+        // set initial pointers
+        if (!gdir) {
+            sprintf(buf[1], "src.%s += (startRow + lid * n) * srcLD + "
+                                       "startCol;\n", vfield);
+            if (gpriv->transp) {
+                sprintf(buf[2], "dst.%s += lid * n;\n", vfield);
+            }
+            else {
+                sprintf(buf[2], "dst.%s += dstLD * lid * n;\n", vfield);
+            }
+        }
+        else {
+            if (gpriv->transp) {
+                sprintf(buf[1], "src.%s += lid * n;\n", vfield);
+            }
+            else {
+                sprintf(buf[1], "src.%s += srcLD * lid * n;\n", vfield);
+            }
+            sprintf(buf[2], "dst.%s += (startRow + lid * n) * dstLD + "
+                                       "startCol;\n", vfield);
+        }
+
+        sprintf(tmp, copyMemSlowLvars, s[0], s[1],
+                varPref[1 - gdir], varPref[gdir]);
+        kgenAddStmt(ctx, tmp);
+
+        sprintf(tmp, copyMemDBlockSlowStart[i],
+                4 * n, 4 * n, n, buf[0], buf[1], buf[2]);
+        kgenAddStmt(ctx, tmp);
+
+        // prepare to loop unrolling
+        gpriv->srcName = "src1";
+        gpriv->dstName = "dst1";
+        if (gdir) {
+            gpriv->locLDName = "srcLD";
+            gpriv->globLDName = "dstLD";
+        }
+        else {
+            gpriv->locLDName = "dstLD";
+            gpriv->globLDName = "srcLD";
+        }
+
+        loopCtl.ocName = "j";
+
+        // double complex elements get no vectorized generator
+        if (gpriv->transp) {
+            unrollers.genSingle = copyMemSingleTransp;
+            if (dtype != TYPE_COMPLEX_DOUBLE) {
+                unrollers.genSingleVec = copyMemVecTransp;
+            }
+        }
+        else {
+            unrollers.genSingle = copyMemSingle;
+            if (dtype != TYPE_COMPLEX_DOUBLE) {
+                unrollers.genSingleVec = copyMemVec;
+            }
+        }
+
+        // external loop
+        kgenBeginBranch(ctx, "for (i = 0; i < n; i++)");
+        copyMemPreUnroll(ctx, gpriv);
+
+        // finally, unroll all loops
+        unrollers.getVecLen = getVecLen;
+
+        // copying with 4 float4 words
+        if (!gpriv->notVectorize) {
+            loopCtl.outBound.name = "jb";
+            loopCtl.inBound = 4 * n;
+            kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);
+
+            // copying with float4 words
+            loopCtl.outBound.name = "jv";
+            loopCtl.inBound = n;
+            kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);
+        }
+
+        // copying the remaining tail
+        if (dtype != TYPE_COMPLEX_DOUBLE) {
+            unrollers.genSingleVec = NULL;
+            loopCtl.outBound.name = "js";
+            loopCtl.inBound = 1;
+            kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);
+        }
+
+        copyMemPostUnroll(ctx, gpriv);
+        kgenEndBranch(ctx, NULL);
+    }
+
+    return kgenEndFuncBody(ctx);
+}
+
+// generator optimizing to a subproblem size
+static int
+copyDBlockOptimGen(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    const PGranularity *pgran,
+    GenPriv *gpriv)
+{
+    char fpref;
+    const char varPref[2] = {'G', 'L'};
+    char tmp[1024];
+    // lead dimension for right and transposed local block in float words
+    ItemWork work;
+    LoopCtl loopCtl;
+    LoopUnrollers unrollers;
+    const char *s, *s1, *s2;
+    bool image;
+    SubproblemDim newDim;
+    // copying direction within the memory or image related function group
+    int gdir = 0;
+    int r;
+
+    fpref = dtypeToPrefix(gpriv->dtype);
+    if (!fpref || (fpref == 'i')) {
+        return -EINVAL;
+    }
+
+    image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE ||
+             gpriv->dir == DBLOCK_LOCAL_TO_IMAGE);
+
+    memset(&unrollers, 0, sizeof(unrollers));
+    memset(&loopCtl, 0, sizeof(loopCtl));
+    memset(&newDim, 0, sizeof(newDim));
+
+    gpriv->dim = &newDim;
+    gpriv->work = (const ItemWork*)&work;
+    gpriv->globLDName = "ld";
+    s = (gpriv->transp) ? "Transp" : "";
+    s1 = (gpriv->conjugate) ? "Conj" : "";
+    s2 = (gpriv->notVectorize) ? "Nvec" : "";
+
+    if ((gpriv->dir == DBLOCK_LOCAL_TO_GLOBAL) && gpriv->transp) {
+        // pass over columns of the block stored in the local memory
+        newDim.x = dim->y;
+        newDim.y = dim->x;
+    }
+    else {
+        // pass over rows
+        newDim.x = dim->x;
+        newDim.y = dim->y;
+    }
+
+    getItemWork(&work, &newDim, pgran, gpriv->nfloats, gpriv->vecLen);
+
+    if (image) {
+        s = (gpriv->packed) ? "Pack" : "";
+        if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
+            sprintf(tmp, copyMemGImgDBlockDecl, fpref, s, dim->y, dim->x);
+        }
+        else {
+            sprintf(tmp, copyMemLImgDBlockDecl, fpref, s, dim->y, dim->x);
+        }
+
+    }
+    else {
+        gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;
+        sprintf(tmp, copyMemDBlockDecl, fpref, s, s1, s2, varPref[gdir],
+                varPref[1 - gdir], dim->y, dim->x, varPref[1 - gdir],
+                varPref[gdir]);
+    }
+
+    kgenDeclareFunction(ctx, tmp);
+    kgenBeginFuncBody(ctx);
+
+    kgenDeclareLocalID(ctx, lidVarName, pgran);
+
+    if (image) {
+        // data for loop unrolling
+        if (work.nrRows > 1) {
+            gpriv->srcName = "src1";
+            gpriv->dstName = "dst";
+            gpriv->imgXName="x1";
+            gpriv->imgYName="y1";
+            if(gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
+                kgenAddStmt(ctx, "GPtr src1;\n");
+            }
+            else if(gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) {
+                kgenAddStmt(ctx, "LPtr src1;\n");
+            }
+            kgenAddStmt(ctx, "int x1, y1;\n");
+
+            unrollers.preUnroll = copyImgPreUnroll;
+            unrollers.postUnroll = copyImgPostUnroll;
+        }
+        else {
+            gpriv->srcName = "src";
+            // dst has image2d_t type here
+            gpriv->dstName = "dst";
+            gpriv->imgXName="x";
+            gpriv->imgYName="y";
+        }
+    }
+    else {
+        if ((gpriv->nfloats != FLOAT4_VECLEN) &&
+            (gpriv->transp || gpriv->conjugate)) {
+
+            /*
+             * temporary variable to transpose or conjugate non double
+             * complex elements
+             */
+            kgenAddStmt(ctx, "float4 tmp;\n");
+        }
+
+        if (work.nrRows > 1) {
+            sprintf(tmp, privatePtrs, varPref[gdir], varPref[1 - gdir]);
+            kgenAddStmt(ctx, tmp);
+
+            // data for loop unrolling
+            unrollers.preUnroll = copyMemPreUnroll;
+            unrollers.postUnroll = copyMemPostUnroll;
+            gpriv->srcName = "src1";
+            gpriv->dstName = "dst1";
+        }
+        else {
+            gpriv->srcName = "src";
+            gpriv->dstName = "dst";
+        }
+    }
+
+    if ((work.nrRows > 1) || work.nrItems) {
+        prepareLoop(ctx, &work, &loopCtl);
+    }
+    kgenAddBlankLine(ctx);
+    loopCtl.inBound = (unsigned long)work.nrCols;
+
+    // now, prepare all needed for loop unrolling
+
+    if (image) {
+        kgenAddStmt(ctx, "int x, y;\n");
+        if (gpriv->packed) {
+            kgenAddStmt(ctx, "int pLine, index;\n");
+        }
+        gpriv->lmemLD = fl4RowWidth(dim->x, gpriv->typeSize) *
+                           FLOAT4_VECLEN / gpriv->nfloats;
+        // set up starting x and y in image
+        addSettingImageXYCode(ctx, "x", "y", pgran, gpriv);
+
+        if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
+            // set initial global pointer
+            addSettingPtrCode(ctx, "src", 0, false, pgran, gpriv);
+        }
+        else if (gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) {
+            // set initial local pointer
+            addSettingPtrCode(ctx, "src", gpriv->lmemLD, gpriv->transp,
+                              pgran, gpriv);
+        }
+
+        unrollers.genSingleVec = copyImgVec;
+        unrollers.genSingle = copyImgSingle;
+    }
+    else {
+        // set initial global pointer
+        s = (gdir) ? "dst" : "src";
+        addSettingPtrCode(ctx, s, 0, false, pgran, gpriv);
+
+        s = (gdir) ? "src" : "dst";
+
+        if (!gdir && gpriv->transp) {
+            gpriv->lmemLD = fl4RowWidth(dim->y, gpriv->typeSize) *
+                           FLOAT4_VECLEN / gpriv->nfloats;
+        }
+        else {
+            gpriv->lmemLD = fl4RowWidth(dim->x, gpriv->typeSize) *
+                           FLOAT4_VECLEN / gpriv->nfloats;
+        }
+
+        if (gpriv->transp) {
+            unrollers.genSingleVec = (gpriv->notVectorize) ? NULL :
+                                                             copyMemVecTransp;
+            unrollers.genSingle = copyMemSingleTransp;
+        }
+        else {
+            unrollers.genSingleVec = (gpriv->notVectorize) ? NULL : copyMemVec;
+            unrollers.genSingle = copyMemSingle;
+        }
+
+        addSettingPtrCode(ctx, s, gpriv->lmemLD, gpriv->transp,
+                          pgran, gpriv);
+    }
+    unrollers.getVecLen = getVecLen;
+
+    // unroll for float4 aligned data chunk
+    kgenLoopUnroll(ctx, &loopCtl, gpriv->dtype, &unrollers, gpriv);
+
+    /*
+     * Unroll for remaining data tail.
+     * Block tail reading/writing is done separately
+     * when many work items process single row
+     * because the compiler doesn't like any conditional
+     * branches in loops
+     */
+    if ((unrollers.postUnroll == NULL) && work.tail) {
+        addCopyTailCode(ctx, gpriv);
+    }
+
+    r = kgenEndFuncBody(ctx);
+
+    return r ? -EOVERFLOW : 0;
+}
+
+int
+copyDataBlockGen(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    const PGranularity *pgran,
+    DataType dtype,
+    DBlockCopyDirection dir,
+    DBlockCopyFlags flags)
+{
+    /*
+     * Validate the request, fill the generator private data from the
+     * flags and run copyDBlockOptimGen() if 'dim' is given or
+     * copyDBlockGenericGen() otherwise. Returns the generator result,
+     * or -EINVAL for a copy to an image with rows not aligned to
+     * float4 or with the transposition requested.
+     */
+    GenPriv gpriv;
+    const unsigned int tsize = dtypeSize(dtype);
+
+    if ((dir == DBLOCK_LOCAL_TO_IMAGE) || (dir == DBLOCK_GLOBAL_TO_IMAGE)) {
+        if (dim != NULL) {
+            const size_t rowSize = tsize * dim->x;
+
+            // only float4 aligned rows are supported
+            if (rowSize % sizeof(cl_float4) != 0) {
+                return -EINVAL;
+            }
+        }
+        // the transposition is rejected for any copy to an image
+        if (flags & DBLOCK_COPY_TRANSPOSE) {
+            return -EINVAL;
+        }
+    }
+
+    memset(&gpriv, 0, sizeof(gpriv));
+    gpriv.transp = (flags & DBLOCK_COPY_TRANSPOSE);
+    gpriv.packed = (flags & DBLOCK_COPY_PACKED_IMAGE);
+    // notVectorize stays cleared for the double complex type
+    if (dtype != TYPE_COMPLEX_DOUBLE) {
+        gpriv.notVectorize = (flags & DBLOCK_COPY_NOT_VECTORIZE);
+    }
+    // the conjugate flag is honoured for complex types only
+    if ((flags & DBLOCK_COPY_CONJUGATE) && isComplexType(dtype)) {
+        gpriv.conjugate = true;
+    }
+    initGenPriv(&gpriv, dtype, tsize, dim, dir, NULL, pgran);
+
+    return (dim != NULL) ? copyDBlockOptimGen(ctx, dim, pgran, &gpriv) :
+                           copyDBlockGenericGen(ctx, pgran, &gpriv);
+}
+
+// Generate the function declared with f4zeroDecl for a one row block.
+// -EINVAL: NULL 'dim' or 'memPrefix', unknown prefix, more than one row.
+int
+f4zeroBlockGen(
+    struct KgenContext *ctx,
+    const SubproblemDim *dim,
+    const PGranularity *pgran,
+    const char *memPrefix)
+{
+    char tmp[1024];
+    ItemWork work;
+    LoopCtl loopCtl;
+    GenPriv priv;
+    char pref;
+    LoopUnrollers unrollers;
+
+    if ((dim == NULL) || (memPrefix == NULL) || (dim->y != 1)) {
+        return -EINVAL;
+    }
+    if (!strcmp(memPrefix, "__local")) {
+        pref = 'l';
+    }
+    else if (!strcmp(memPrefix, "__global")) {
+        pref = 'g';
+    }
+    else {
+        return -EINVAL;
+    }
+
+    memset(&loopCtl, 0, sizeof(loopCtl));
+    memset(&unrollers, 0, sizeof(unrollers));
+    memset(&priv, 0, sizeof(GenPriv));
+    initGenPriv(&priv, TYPE_COMPLEX_DOUBLE, FLOAT4_VECLEN * sizeof(cl_float),
+                dim, 0, (const ItemWork*)&work, pgran);
+    getItemWork(&work, dim, pgran, priv.nfloats, priv.vecLen);
+
+    sprintf(tmp, f4zeroDecl, pref, dim->x, memPrefix);
+    kgenDeclareFunction(ctx, tmp);
+    kgenBeginFuncBody(ctx);
+
+    // declare local ID variable and set data offset
+    kgenDeclareLocalID(ctx, lidVarName, pgran);
+    // the cast makes the argument match %lu whatever integer type nrCols has
+    sprintf(tmp, "\ndata += %s * %lu;\n\n",
+            lidVarName, (unsigned long)work.nrCols);
+    kgenAddStmt(ctx, tmp);
+    unrollers.genSingle = f4zeroSingle;
+    loopCtl.inBound = (unsigned int)work.nrCols;
+    unrollers.getVecLen = getVecLen;
+
+    kgenLoopUnroll(ctx, &loopCtl, TYPE_COMPLEX_DOUBLE, &unrollers, &priv);
+    if (work.tail) {
+        addTailCode(ctx, &priv, NULL, f4zeroSingle);
+    }
+    return kgenEndFuncBody(ctx);
+}
diff --git a/src/library/common/kern_cache.c b/src/library/common/kern_cache.c
new file mode 100644
index 0000000..787d139
--- /dev/null
+++ b/src/library/common/kern_cache.c
@@ -0,0 +1,443 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Kernel cache implementation
+ */
+
+/*
+ *  TODO: more efficient data structure to search
+ *        by dimensions (red black tree, for example) (?)
+ */
+
+
+#include<stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <kern_cache.h>
+#include <kerngen.h>
+#include <mempat.h>
+
+#define KCACHE_LOCK(kcache)     mutexLock((kcache)->mutex)   /* lock cache */
+#define KCACHE_UNLOCK(kcache)   mutexUnlock((kcache)->mutex) /* unlock cache */
+#define UNLIMITED_CACHE_SIZE    (~0UL)
+
+enum {
+    KNODE_MAGIC = 0x3CED50C5,   // stamp checked by assertions on a KernelNode
+    TRUNC_AHEAD_FACTOR = 4,     // evict this many times the needed size
+    MAX_OPENCL_DEVICES = 64     // capacity of the binary sizes query buffer
+};
+
+// kernHash() multiplier chosen so that the product is very likely to overflow
+const unsigned long long prime = 100000000000000889LL;
+
+typedef struct KernelNode {
+    unsigned long magic;        // KNODE_MAGIC for a valid node
+    unsigned long refcnt;       // number of references held to the kernel
+    Kernel kern;
+    unsigned long hash;         // kernHash() of the key dimensions
+    // key data the kernel is based on
+    KernelKey key;
+    // function comparing kernel extra information
+    KernelExtraCmpFn extraCmp;
+    // node to store in the list of kernels of a solver
+    ListNode dimNode;
+    ListNode lruNode;           // node in the least recently used list
+} KernelNode;
+
+typedef struct KcacheKey {
+    unsigned long hash;     // kernHash() of the dimensions being looked for
+    KernelKey key;
+    const void *extra;      // passed to the extraCmp function of a node
+} KcacheKey;
+
+struct KernelCache {
+    size_t totalSize;       // sum of fullKernelSize() of the cached kernels
+    size_t sizeLimit;       // upper bound for totalSize, 0 if not limited
+    // total amount of solvers
+    unsigned int nrSolvers;
+    // per solver lists of kernels, searched by subproblem dimensions
+    ListHead *dimKern;
+    // least recently used kernels list
+    ListHead lruKern;
+    mutex_t *mutex;         // taken by KCACHE_LOCK()
+};
+
+// fold one dimension size into the kernel hash, skipping unused sizes
+static __inline unsigned long
+updateHash(unsigned long hash, unsigned long size)
+{
+    if (size == SUBDIM_UNUSED) {
+        return hash;
+    }
+
+    return (hash << 5) | size;
+}
+
+// compute the hash of a set of kernel subproblem dimensions
+static unsigned long
+kernHash(const SubproblemDim *subdims, unsigned int nrDims)
+{
+    unsigned long acc = 0;
+    unsigned int k = 0;
+
+    while (k < nrDims) {
+        const SubproblemDim *d = &subdims[k++];
+        acc = updateHash(acc, (unsigned long)d->x);
+        acc = updateHash(acc, (unsigned long)d->y);
+        acc = updateHash(acc, (unsigned long)d->bwidth);
+        acc = updateHash(acc, (unsigned long)d->itemX);
+        acc = updateHash(acc, (unsigned long)d->itemY);
+    }
+    return (unsigned long)(acc * prime);
+}
+
+// list search callback; returns 0 if the node matches the cache key
+static int
+knodeCmp(const ListNode *node, const void *key)
+{
+    KcacheKey *kkey = (KcacheKey*)key;
+    KernelNode *knode = container_of(node, dimNode, KernelNode);
+    KernelKey *wanted = &(kkey->key);
+    KernelKey *stored = &(knode->key);
+    int differ;
+
+    differ = (wanted->device != stored->device) ||
+             (wanted->context != stored->context) ||
+             (wanted->nrDims != stored->nrDims);
+    if (!differ) {
+        differ = (memcmp(wanted->subdims, stored->subdims,
+                         wanted->nrDims * sizeof(SubproblemDim)) != 0);
+    }
+    if (differ) {
+        return 1;
+    }
+    // the base key matches; the extra data, if compared, has the last word
+    return (knode->extraCmp == NULL) ? 0 :
+           knode->extraCmp(knode->kern.extra, kkey->extra);
+}
+
+// move least recently used kernels to truncList until truncSize is freed
+static void
+removeKernels(ListHead *truncList, struct KernelCache *kcache, size_t truncSize)
+{
+    size_t freed = 0;
+
+    listInitHead(truncList);
+    while (freed < truncSize) {
+        ListNode *victim = listNodeLast(&kcache->lruKern);
+        KernelNode *knode;
+        size_t ksize;
+
+        if (victim == &kcache->lruKern) {
+            // nothing is left in the LRU list
+            break;
+        }
+        knode = container_of(victim, lruNode, KernelNode);
+        listDel(victim);
+        listDel(&knode->dimNode);
+        listAddToTail(truncList, &knode->lruNode);
+        ksize = fullKernelSize(&knode->kern);
+        freed += ksize;
+        kcache->totalSize -= ksize;
+    }
+}
+
+// put every kernel collected in truncList, emptying the list
+static void
+putRemovedKernels(struct KernelCache *kcache, ListHead *truncList)
+{
+    struct ListNode *first;
+    struct KernelNode *knode;
+
+    // the node is unlinked before putKernel(), which may free it
+    for (first = listNodeFirst(truncList);
+         first != truncList;
+         first = listNodeFirst(truncList)) {
+
+        knode = container_of(first, lruNode, KernelNode);
+        listDel(first);
+        putKernel(kcache, &knode->kern);
+    }
+}
+
+// allocate a standalone kernel object holding a single reference;
+// returns NULL if there is not enough memory
+Kernel
+*allocKernel(void)
+{
+    KernelNode *knode = malloc(sizeof(KernelNode));
+
+    if (knode == NULL) {
+        return NULL;
+    }
+    memset(knode, 0, sizeof(KernelNode));
+    knode->magic = KNODE_MAGIC;
+    knode->refcnt = 1;
+
+    return &knode->kern;
+}
+
+// take one more reference to the kernel; no lock is taken here
+void
+getKernel(Kernel *kern)
+{
+    KernelNode *knode = container_of(kern, kern, KernelNode);
+
+    assert(knode->magic == KNODE_MAGIC);
+    knode->refcnt += 1;
+}
+
+// drop one reference; the last one destroys the kernel (kcache may be NULL)
+void
+putKernel(struct KernelCache *kcache, Kernel *kern)
+{
+    KernelNode *knode;
+    unsigned long refcnt;
+
+    if (kern == NULL) {
+        return;
+    }
+    knode = container_of(kern, kern, KernelNode);
+    assert(knode->magic == KNODE_MAGIC);
+    if (kcache) {
+        KCACHE_LOCK(kcache);
+    }
+    refcnt = --knode->refcnt;
+    if (kcache) {
+        KCACHE_UNLOCK(kcache);
+    }
+    if (refcnt) {
+        return;
+    }
+    if (kern->dtor) {
+        kern->dtor(kern);
+    }
+    // allocKernel() zeroes both handles; only set ones may be released
+    if (kern->program != NULL) { clReleaseProgram(kern->program); }
+    if (knode->key.context != NULL) { clReleaseContext(knode->key.context); }
+    free(knode);
+}
+
+/*
+ * Create an empty kernel cache for 'nrSolvers' solvers. 'sizeLimit' bounds
+ * the total size of the kernels, zero meaning no limit. Returns NULL if
+ * any of the allocations or the mutex creation fails.
+ */
+struct KernelCache
+*createKernelCache(
+    unsigned int nrSolvers,
+    size_t sizeLimit)
+{
+    unsigned int i;
+    struct KernelCache *kcache;
+
+    kcache = malloc(sizeof(struct KernelCache));
+    if (kcache == NULL) {
+        return NULL;
+    }
+    memset(kcache, 0, sizeof(struct KernelCache));
+
+    // one list of kernels per solver
+    kcache->nrSolvers = nrSolvers;
+    kcache->dimKern = malloc(kcache->nrSolvers * sizeof(ListHead));
+    if (kcache->dimKern == NULL) {
+        free(kcache);
+        return NULL;
+    }
+    for (i = 0; i < kcache->nrSolvers; i++) {
+        listInitHead(&kcache->dimKern[i]);
+    }
+    listInitHead(&kcache->lruKern);
+
+    kcache->sizeLimit = sizeLimit;
+    kcache->totalSize = 0;
+
+    kcache->mutex = mutexInit();
+    if (kcache->mutex == NULL) {
+        // roll back everything allocated so far
+        free(kcache->dimKern);
+        free(kcache);
+        return NULL;
+    }
+
+    return kcache;
+}
+
+void
+destroyKernelCache(struct KernelCache *kcache)
+{
+    cleanKernelCache(kcache);       // evict and put every cached kernel
+    free(kcache->dimKern);
+    mutexDestroy(kcache->mutex);
+    free(kcache);
+}
+
+int
+addKernelToCache(
+    struct KernelCache *kcache,
+    solver_id_t sid,
+    Kernel *kern,
+    const KernelKey *key,
+    KernelExtraCmpFn extraCmp)
+{
+    size_t ksize;
+    KernelNode *knode;
+    ListHead truncList;
+    // returns 0 if the kernel is added, -1 if it is rejected
+    knode = container_of(kern, kern, KernelNode);
+    assert(knode->magic == KNODE_MAGIC);
+
+    if ((unsigned)sid >= kcache->nrSolvers || key->nrDims > MAX_SUBDIMS) {
+        return -1;
+    }
+
+    listInitHead(&truncList);
+    ksize = fullKernelSize(kern);
+
+    KCACHE_LOCK(kcache);
+    // with a size limit: reject a too large kernel or evict old ones to fit
+    if (kcache->sizeLimit) {
+        if (ksize > kcache->sizeLimit) {
+            KCACHE_UNLOCK(kcache);
+            return -1;
+        }
+        else if (ksize > kcache->sizeLimit - kcache->totalSize) {
+            removeKernels(&truncList, kcache, ksize * TRUNC_AHEAD_FACTOR);
+        }
+    }
+    // fill in the node key; the node keeps its own context reference
+    knode->hash = kernHash(key->subdims, key->nrDims);
+    knode->extraCmp = extraCmp;
+
+    knode->key.device = key->device;
+    knode->key.context = key->context;
+    clRetainContext(knode->key.context);
+    knode->key.nrDims = key->nrDims;
+    memset(knode->key.subdims, 0, sizeof(knode->key.subdims));
+    memcpy(knode->key.subdims, key->subdims, sizeof(SubproblemDim) *
+           knode->key.nrDims);
+    // the new kernel becomes the most recently used one
+    listAddToTail(&kcache->dimKern[sid], &knode->dimNode);
+    listAddToHead(&kcache->lruKern, &knode->lruNode);
+    kcache->totalSize += ksize;
+
+    KCACHE_UNLOCK(kcache);
+    // the evicted kernels are put after the lock is released
+    if (!isListEmpty(&truncList)) {
+        putRemovedKernels(kcache, &truncList);
+    }
+
+    return 0;
+}
+
+Kernel
+*findKernel(
+    struct KernelCache *kcache,
+    solver_id_t sid,
+    const KernelKey *key,
+    const void *extraKey)
+{
+    Kernel *kern = NULL;
+    KcacheKey kkey;
+    KernelNode *knode;
+    ListNode *lnode;
+    // returns the kernel with a reference added for the caller, or NULL
+    if ((unsigned)sid >= kcache->nrSolvers || key->nrDims > MAX_SUBDIMS) {
+        return NULL;
+    }
+
+    kkey.hash = kernHash(key->subdims, key->nrDims);
+    kkey.extra = extraKey;
+
+    kkey.key.device = key->device;
+    kkey.key.context = key->context;
+    kkey.key.nrDims = key->nrDims;
+    memset(kkey.key.subdims, 0, sizeof(kkey.key.subdims));
+    memcpy(kkey.key.subdims, key->subdims, sizeof(SubproblemDim) * kkey.key.nrDims);
+    // the search and the LRU update are done under the cache lock
+    KCACHE_LOCK(kcache);
+    lnode = listNodeSearch(&kcache->dimKern[sid], &kkey, knodeCmp);
+    if (lnode) {
+        knode = container_of(lnode, dimNode, KernelNode);
+        knode->refcnt++;
+        kern = &knode->kern;
+
+        // move the kernel to the top of the LRU list
+        listDel(&knode->lruNode);
+        listAddToHead(&kcache->lruKern, &knode->lruNode);
+    }
+    KCACHE_UNLOCK(kcache);
+
+    return kern;
+}
+
+// free room left in the cache; the maximal size_t value if it is unlimited
+size_t
+availKernelCacheSize(struct KernelCache *kcache)
+{
+    size_t avail = ~(size_t)0;
+    KCACHE_LOCK(kcache);
+    if (kcache->sizeLimit) {
+        avail = kcache->sizeLimit - kcache->totalSize;
+    }
+    KCACHE_UNLOCK(kcache);
+    return avail;
+}
+
+void
+cleanKernelCache(struct KernelCache *kcache)
+{
+    ListHead truncList;
+
+    KCACHE_LOCK(kcache);
+    removeKernels(&truncList, kcache, kcache->totalSize);
+    KCACHE_UNLOCK(kcache);
+    // the lock is released first since putKernel() takes it on its own
+    putRemovedKernels(kcache, &truncList);
+}
+
+// estimate the kernel size: program binaries and source plus extra data
+size_t
+fullKernelSize(Kernel *kern)
+{
+    size_t allSizes[MAX_OPENCL_DEVICES], size = 0, i, retSize = 0;
+    cl_int err;
+    err = clGetProgramInfo(kern->program, CL_PROGRAM_BINARY_SIZES,
+                           sizeof(allSizes), &allSizes, &retSize);
+    retSize = (err == CL_SUCCESS) ? retSize / sizeof(size) : 0;
+    for (i = 0; i < retSize; i++) {
+        size += allSizes[i];
+    }
+    err = clGetProgramInfo(kern->program, CL_PROGRAM_SOURCE, 0, NULL, &retSize);
+    // a failed query adds nothing rather than an indeterminate length
+    return size + ((err == CL_SUCCESS) ? retSize : 0) + sizeof(Kernel) +
+           kern->extraSize;
+}
+
+#if defined(TRACE_MALLOC)
+
+#include <stdio.h>
+// print the total size of the cached kernels in whole MiB
+void
+printKernelCacheSize(struct KernelCache *kcache)
+{
+    printf("[KERNEL CACHE] My size is %lu MiB\n",
+           (unsigned long)(kcache->totalSize / 1048576));
+}
+#endif
diff --git a/src/library/common/kerngen_core.c b/src/library/common/kerngen_core.c
new file mode 100644
index 0000000..7db25b6
--- /dev/null
+++ b/src/library/common/kerngen_core.c
@@ -0,0 +1,623 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Implementation of common logic for kernel
+ * generators
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+
+#include <list.h>
+#include <kerngen.h>
+#include <mempat.h>
+
+/*
+ * TODO: Add checks for corruption for KgenContext and StatementBatch
+ */
+
+enum {
+    TAB_WIDTH = 4,      // number of blanks per indentation level
+};
+
+struct KgenContext {
+    char *buf;          // destination buffer; NULL to only count the length
+    size_t bufLen;      // size of the destination buffer
+    // name of the last declared function
+    char *lastFname;
+    size_t fnameLen;
+    // current length without trailing '\0'
+    size_t currLen;
+    bool err;           // set on a failure, blocks further appending
+    // current execution branch nesting
+    int nesting;
+    // number of tabs on the zero level of nesting
+    int nrTabs;
+    bool fmt;           // indent the added statements if set
+};
+
+struct StmtNode {
+    char *stmt;         // heap copy of the statement text
+    ListNode node;
+};
+
+struct StatementBatch {
+    ListHead statements[MAX_STATEMENT_PRIORITY + 1];    // a list per priority
+};
+
+#ifdef TRACE_MALLOC
+
+#define strdup(s)    strdupDebug(s)
+
+// strdup() replacement built on malloc(), used when TRACE_MALLOC is defined
+static char
+*strdupDebug(const char *s)
+{
+    // size_t, not int: the strlen() result must not be truncated
+    const size_t len = strlen(s);
+    char *dst = malloc(len + 1);
+
+    if (dst != NULL) {
+        // the terminating '\0' is copied along with the characters
+        memcpy(dst, s, len + 1);
+    }
+
+    return dst;
+}
+
+#else                           /* TRACE_MALLOC */
+#if defined(_MSC_VER)
+#define strdup _strdup
+#endif                          /* _MSC_VER */
+#endif                          /* !TRACE_MALLOC */
+
+// bring the context to its initial state; lastFname is not freed here
+static void
+resetCtx(struct KgenContext *ctx)
+{
+    if (ctx->buf != NULL) {
+        ctx->buf[0] = '\0';
+    }
+    ctx->currLen = ctx->fnameLen = 0;
+    ctx->lastFname = NULL;
+    ctx->nesting = 0;
+    ctx->err = false;
+}
+
+// find the first function name in 'source'; NULL, '*len' untouched, if none
+static char*
+searchFuncName(const char *source, size_t *len)
+{
+    const char *sep;
+    const char *name;
+
+    /*
+     * Search the opening parenthesis. The word before it is
+     * the function name
+     */
+    sep = strchr(source, '(');
+    if (sep == NULL) {
+        return NULL;
+    }
+    // walk back to the word start without stepping before the buffer
+    name = sep;
+    while ((name > source) && (name[-1] != ' ') && (name[-1] != '\n') &&
+           (name[-1] != '*')) {
+        name--;
+    }
+    *len = (size_t)(sep - name);
+    return (char*)name;
+}
+
+/*
+ * Immediately add a string to the source, checking the length.
+ *
+ * A zero 'slen' means the whole '\0' terminated string, otherwise exactly
+ * 'slen' characters are taken. Without a buffer only the length is counted.
+ * Returns 0 on success; -1 with the error flag set if the string with its
+ * terminator doesn't fit the buffer.
+ */
+static int
+checkAddStr(struct KgenContext *ctx, const char *str, size_t slen)
+{
+    const size_t room = ctx->bufLen - ctx->currLen;
+
+    if (!slen) {
+        slen = strlen(str);
+    }
+
+    if (ctx->buf == NULL) {
+        ctx->currLen += slen;
+        return 0;
+    }
+
+    // a byte is always kept for the terminator, partial copies included
+    if (slen >= room) {
+        // make further code appendings unallowed
+        ctx->err = true;
+        return -1;
+    }
+
+    strncpy(ctx->buf + ctx->currLen, str, slen);
+    ctx->currLen += slen;
+
+    // keep the buffer a valid C string after every single addition
+    ctx->buf[ctx->currLen] = '\0';
+
+    return 0;
+}
+
+
+// add string to source, consisting of a prefix, a statement and a suffix
+static int
+addStr(
+    struct KgenContext *ctx,
+    const char *pref,
+    const char *stmt,
+    const char *suff)
+{
+    int ret = 0;
+    char blank[MAX_NESTING * TAB_WIDTH];
+    int i;
+    char *sep = NULL;
+    size_t len = 0;
+    const int nblanks = (ctx->nesting + ctx->nrTabs) * TAB_WIDTH;
+    // NOTE(review): blank[] fits MAX_NESTING levels only, nblanks is unchecked
+    if (nblanks && ctx->fmt) {
+        for (i = 0; i < nblanks; i++) {
+            blank[i] = ' ';
+        }
+
+        /*
+         *  add formatting symbols if there is a prefix,
+         *  or the statement doesn't begin with the new line
+         *  symbol
+         */
+        if (pref || (stmt && (stmt[0] != '\n'))) {
+            ret = checkAddStr(ctx, blank, nblanks);
+        }
+    }
+
+    if (!ret && pref) {
+        ret = checkAddStr(ctx, pref, 0);
+    }
+
+    /*
+     * add the statement itself,
+     * format the multiline ones if it's needed.
+     */
+    while (!ret && stmt) {
+        if (ctx->fmt) {
+            /*
+             * do not add tabulation for lines consisting of
+             * the new line symbol only
+             */
+            if (*stmt != '\n') {
+                if (sep && nblanks) {
+                    ret = checkAddStr(ctx, blank, nblanks);
+                    if (ret) {
+                        break;
+                    }
+                }
+                sep = strchr(stmt, '\n');
+                // skip the new line symbol if it is at the end of the line
+                if (sep && (sep[1] == '\0')) {
+                    sep = NULL;
+                }
+                len = (sep) ? (sep - stmt + 1) : 0;
+            }
+            else {
+                /*
+                 * The line can start with the new line symbol
+                 * and not have any prefix. The assignment
+                 * ensures the tabulation for the case.
+                 */
+                sep = (sep) ? sep : ((char*)stmt);
+                len = (stmt[1] == '\0') ? 0 : 1;
+            }
+        }
+        ret = checkAddStr(ctx, stmt, len);
+        if (len) {
+            stmt += len;
+        }
+        else {
+            stmt = NULL;
+        }
+    }
+
+    if (!ret && suff) {
+        ret = checkAddStr(ctx, suff, 0);
+    }
+
+    return ret;
+}
+
+// create a context bound to the caller's buffer; NULL if out of memory
+struct KgenContext
+*createKgenContext(char *srcBuf, size_t srcBufLen, bool fmt)
+{
+    struct KgenContext *ctx = malloc(sizeof(struct KgenContext));
+
+    if (ctx == NULL) {
+        return NULL;
+    }
+    ctx->nrTabs = 0;
+    ctx->fmt = fmt;
+    ctx->bufLen = srcBufLen;
+    ctx->buf = srcBuf;
+    resetCtx(ctx);
+    return ctx;
+}
+
+static void
+flushDestroyStmtNode(ListNode *l, void *priv)
+{
+    struct StmtNode *snode = container_of(l, node, struct StmtNode);
+    // 'priv' is the context to emit the statement to, NULL to only free it
+    if (priv != NULL) {
+        addStr((struct KgenContext*)priv, NULL, snode->stmt, NULL);
+    }
+    free(snode->stmt);
+    free(snode);
+}
+
+// free the context; the source buffer is not owned by it and is kept
+void
+destroyKgenContext(struct KgenContext *ctx)
+{
+    // free(NULL) is a no-op, hence no check of the name pointer
+    free(ctx->lastFname);
+    free(ctx);
+}
+
+// forget the generated source and the name of the last function
+void
+resetKgenContext(struct KgenContext *ctx)
+{
+    // resetCtx() only clears the pointer, so the name is released first
+    free(ctx->lastFname);
+    resetCtx(ctx);
+}
+
+// set the nesting of srcCtx to the nesting of dstCtx plus nrTabs
+int
+kgenSyncFormatting(
+    struct KgenContext *srcCtx,
+    const struct KgenContext *dstCtx,
+    int nrTabs)
+{
+    if ((nrTabs < 0) || (nrTabs + dstCtx->nesting > MAX_TABS)) {
+        return -EINVAL;
+    }
+
+    srcCtx->nesting = nrTabs + dstCtx->nesting;
+
+    return 0;
+}
+
+/*
+ * Add a function declaration to the source and remember the function
+ * name. Allowed only on the zero nesting level. Returns 0 on success,
+ * a negative value otherwise; any failure sets the error state.
+ */
+int
+kgenDeclareFunction(struct KgenContext *ctx, const char *decl)
+{
+    int ret;
+    size_t len;
+    const char *fnName;
+
+    if (ctx->err || ctx->nesting) {
+        ctx->err = true;
+        return -1;
+    }
+
+    fnName = searchFuncName(decl, &len);
+    if (fnName == NULL) {
+        ctx->err = true;
+        return -1;
+    }
+
+    // grow the name buffer if needed; its size is recorded on success only
+    if ((ctx->lastFname == NULL) || (ctx->fnameLen < len + 1)) {
+        char *dbuf = (ctx->lastFname == NULL) ? malloc(len + 1) :
+                     realloc(ctx->lastFname, len + 1);
+
+        if (dbuf == NULL) {
+            // the old buffer, if any, is still owned by the context
+            ctx->err = true;
+            return -1;
+        }
+        ctx->lastFname = dbuf;
+        ctx->fnameLen = len + 1;
+    }
+    memcpy(ctx->lastFname, fnName, len);
+    ctx->lastFname[len] = '\0';
+
+    ret = addStr(ctx, NULL, decl, NULL);
+    if (ret) {
+        ctx->err = true;
+    }
+
+    return ret;
+}
+
+// open the body of a function; allowed on the zero nesting level only
+int
+kgenBeginFuncBody(struct KgenContext *ctx)
+{
+    int ret;
+
+    if (ctx->err || ctx->nesting) {
+        ctx->err = true;
+        return -1;
+    }
+
+    ret = addStr(ctx, NULL, NULL, "{\n");
+    if (!ret) {
+        // the body statements belong to the next nesting level
+        ctx->nesting++;
+    }
+    return ret;
+}
+
+/*
+ * Close the function body. The context must be exactly on the first
+ * nesting level, otherwise it is put to the error state.
+ */
+int
+kgenEndFuncBody(struct KgenContext *ctx)
+{
+    if (ctx->err || (ctx->nesting != 1)) {
+        ctx->err = true;
+        return -1;
+    }
+
+    ctx->nesting--;
+
+    return addStr(ctx, NULL, NULL, "}\n");
+}
+
+// copy the last declared function name to 'buf'; -1 if none or no room
+int
+kgenGetLastFuncName(
+    char *buf,
+    size_t buflen,
+    const struct KgenContext *ctx)
+{
+    size_t len;
+
+    if (ctx->lastFname == NULL) {
+        return -1;
+    }
+    len = strlen(ctx->lastFname);
+    if (buflen < len + 1) {
+        return -1;
+    }
+    // the name is copied together with its terminator
+    memcpy(buf, ctx->lastFname, len + 1);
+    return 0;
+}
+
+/*
+ * Open a new branch with the "<stmt> {" line, or with a bare "{" if
+ * 'stmt' is NULL, and go one nesting level deeper. Fails with the error
+ * state set if the context is already on the MAX_NESTING level.
+ */
+int
+kgenBeginBranch(struct KgenContext *ctx, const char *stmt)
+{
+    const char *head;
+    const char *brace;
+    int ret;
+
+    if (ctx->err || (ctx->nesting == MAX_NESTING)) {
+        ctx->err = true;
+        return -1;
+    }
+
+    // the empty statement is passed instead of NULL to addStr()
+    head = (stmt == NULL) ? "" : stmt;
+    brace = (stmt == NULL) ? "{\n" : " {\n";
+
+    ret = addStr(ctx, NULL, head, brace);
+    if (!ret) {
+        ctx->nesting++;
+    }
+
+    return ret;
+}
+
+
+/*
+ * Close the current branch and go one nesting level up. A non NULL 'stmt'
+ * follows the closing brace on the same line and gets a semicolon added,
+ * which gives lines like "} while (cond);".
+ */
+int
+kgenEndBranch(struct KgenContext *ctx, const char *stmt)
+{
+    int ret;
+
+    if (ctx->err || !ctx->nesting) {
+        ctx->err = true;
+        return -1;
+    }
+
+    ctx->nesting--;
+    if (stmt == NULL) {
+        ret = addStr(ctx, "}\n", NULL, NULL);
+    }
+    else {
+        ret = addStr(ctx, "} ", stmt, ";\n");
+    }
+    return ret;
+}
+
+// append a statement as is; a NULL statement is accepted and ignored
+int
+kgenAddStmt(struct KgenContext *ctx, const char *stmt)
+{
+    if (ctx->err) {
+        return -1;
+    }
+
+    if (stmt == NULL) {
+        return 0;
+    }
+
+    return addStr(ctx, NULL, stmt, NULL);
+}
+
+// printf() like kgenAddStmt(); -1 if the statement can't be formatted
+// within MAX_STATEMENT_LENGTH or the context is in the error state
+int
+kgenPrintf(struct KgenContext *ctx, const char *fmt,...)
+{
+    char buf[MAX_STATEMENT_LENGTH];
+    va_list ap;
+    int len;
+
+    if (ctx->err) {
+        return -1;
+    }
+    va_start(ap, fmt);
+    len = vsnprintf(buf, MAX_STATEMENT_LENGTH, fmt, ap);
+    va_end(ap);
+    // negative: formatting failed; too big: the statement was truncated
+    if ((len < 0) || (len >= MAX_STATEMENT_LENGTH)) {
+        return -1;
+    }
+    return addStr(ctx, NULL, buf, NULL);
+}
+
+// allocate a batch with all the priority lists empty; NULL on failure
+struct StatementBatch
+*createStmtBatch(void)
+{
+    int prio;
+    struct StatementBatch *batch = malloc(sizeof(struct StatementBatch));
+
+    if (batch == NULL) {
+        return NULL;
+    }
+    for (prio = 0; prio <= MAX_STATEMENT_PRIORITY; prio++) {
+        listInitHead(&batch->statements[prio]);
+    }
+
+    return batch;
+}
+
+// Queue a copy of the statement with a priority below MAX_STATEMENT_PRIORITY.
+// Returns -EINVAL for a wrong priority, -ENOMEM if out of memory.
+int
+kgenAddStmtToBatch(
+    struct StatementBatch *batch,
+    int priority,
+    const char *stmt)
+{
+    struct StmtNode *snode;
+
+    // the priority indexes an array, so it is bounded from both sides
+    if ((priority < 0) || (priority >= MAX_STATEMENT_PRIORITY)) {
+        return -EINVAL;
+    }
+
+    snode = malloc(sizeof(struct StmtNode));
+    if (snode == NULL) {
+        return -ENOMEM;
+    }
+    snode->stmt = strdup(stmt);
+    if (snode->stmt == NULL) {
+        free(snode);
+        return -ENOMEM;
+    }
+    listAddToTail(&batch->statements[priority], &snode->node);
+    return 0;
+}
+
+// printf() like kgenAddStmtToBatch(); -1 if the statement can't be
+// formatted within MAX_STATEMENT_LENGTH, else the result of the queuing
+int
+kgenBatchPrintf(
+    struct StatementBatch *batch,
+    int priority,
+    const char *fmt,...)
+{
+    char buf[MAX_STATEMENT_LENGTH];
+    va_list ap;
+    int len;
+
+    va_start(ap, fmt);
+    len = vsnprintf(buf, MAX_STATEMENT_LENGTH, fmt, ap);
+    va_end(ap);
+
+    if ((len < 0) || (len >= MAX_STATEMENT_LENGTH)) {
+        return -1;
+    }
+    // a statement that was not queued must not be reported as added
+    return kgenAddStmtToBatch(batch, priority, buf);
+}
+
+int
+flushStmtBatch(struct KgenContext *ctx, struct StatementBatch *batch)
+{
+    int i = 0;
+    // lists are flushed from the priority 0 up, each node is freed on the way
+    for (i = 0; i <= MAX_STATEMENT_PRIORITY; i++) {
+        listDoForEachPrivSafe(&batch->statements[i], flushDestroyStmtNode, ctx);
+        listInitHead(&batch->statements[i]);
+    }
+
+    return (ctx->err) ? -1 : 0;
+}
+
+void
+destroyStmtBatch(struct StatementBatch *batch)
+{
+    int i;
+    // NULL private data: pending statements are freed without being emitted
+    for (i = 0; i <= MAX_STATEMENT_PRIORITY; i++) {
+       listDoForEachPrivSafe(&batch->statements[i], flushDestroyStmtNode, NULL);
+    }
+    free(batch);
+}
+
+/*
+ * Add an empty line to the source. Nothing is added and -1 is returned
+ * if the context is in the error state; otherwise the result of
+ * the addition is returned.
+ */
+int
+kgenAddBlankLine(struct KgenContext *ctx)
+{
+    if (ctx->err) {
+        return -1;
+    }
+
+    return addStr(ctx, NULL, NULL, "\n");
+}
+
+size_t
+kgenSourceSize(struct KgenContext *ctx)
+{
+    return ctx->currLen;    // the terminating '\0' is not counted
+}
diff --git a/src/library/common/kgen_basic.c b/src/library/common/kgen_basic.c
new file mode 100644
index 0000000..54b6182
--- /dev/null
+++ b/src/library/common/kgen_basic.c
@@ -0,0 +1,427 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#include <kerngen.h>
+#include <mempat.h>
+
+const char *uptrsFullDeclaration =  /* fp64 pragma + GPtr/LPtr/PPtr unions */
+    "#ifdef cl_khr_fp64\n"
+    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+    "#else\n"
+    "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+    "#endif\n"
+    "\n"
+    "typedef union GPtr {\n"
+    "    __global float *f;\n"
+    "    __global double *d;\n"
+    "    __global float2 *f2v;\n"
+    "    __global double2 *d2v;\n"
+    "    __global float4 *f4v;\n"
+    "    __global double4 *d4v;\n"
+    "    __global float8 *f8v;\n"
+    "    __global double8 *d8v;\n"
+    "    __global float16 *f16v;\n"
+    "    __global double16 *d16v;\n"
+    "} GPtr;\n"
+    "\n"
+    "typedef union LPtr {\n"
+    "    __local float *f;\n"
+    "    __local double *d;\n"
+    "    __local float2 *f2v;\n"
+    "    __local double2 *d2v;\n"
+    "    __local float4 *f4v;\n"
+    "    __local double4 *d4v;\n"
+    "    __local float8 *f8v;\n"
+    "    __local double8 *d8v;\n"
+    "    __local float16 *f16v;\n"
+    "    __local double16 *d16v;\n"
+    "} LPtr;\n"
+    "\n"
+    "typedef union PPtr {\n"
+    "    float *f;\n"
+    "    double *d;\n"
+    "    float2 *f2v;\n"
+    "    double2 *d2v;\n"
+    "    float4 *f4v;\n"
+    "    double4 *d4v;\n"
+    "    float8 *f8v;\n"
+    "    double8 *d8v;\n"
+    "    float16 *f16v;\n"
+    "    double16 *d16v;\n"
+    "} PPtr;\n\n";
+
+const char *uptrsSingleDeclaration =    /* float-only GPtr/LPtr/PPtr unions */
+    "typedef union GPtr {\n"
+    "    __global float *f;\n"
+    "    __global float2 *f2v;\n"
+    "    __global float4 *f4v;\n"
+    "    __global float8 *f8v;\n"
+    "    __global float16 *f16v;\n"
+    "} GPtr;\n"
+    "\n"
+    "typedef union LPtr {\n"
+    "    __local float *f;\n"
+    "    __local float2 *f2v;\n"
+    "    __local float4 *f4v;\n"
+    "    __local float8 *f8v;\n"
+    "    __local float16 *f16v;\n"
+    "} LPtr;\n"
+    "\n"
+    "typedef union PPtr {\n"
+    "    float *f;\n"
+    "    float2 *f2v;\n"
+    "    float4 *f4v;\n"
+    "    float8 *f8v;\n"
+    "    float16 *f16v;\n"
+    "} PPtr;\n\n";
+
+/*
+ * Map a pointer-union kind to the name of the matching union type
+ * ("GPtr", "LPtr" or "PPtr"); any other value yields NULL.
+ */
+const char
+*uptrTypeName(UptrType type)
+{
+    switch (type) {
+    case UPTR_GLOBAL:
+        return "GPtr";
+    case UPTR_LOCAL:
+        return "LPtr";
+    case UPTR_PRIVATE:
+        return "PPtr";
+    }
+
+    /* not one of the three known kinds */
+    return NULL;
+}
+
+/*
+ * Translate a data type into its one-letter prefix.
+ *
+ *   TYPE_FLOAT          -> 'f'
+ *   TYPE_DOUBLE         -> 'd'
+ *   TYPE_COMPLEX_FLOAT  -> 'c'
+ *   TYPE_COMPLEX_DOUBLE -> 'z'
+ *   anything else       -> 0
+ */
+char
+dtypeToPrefix(DataType type)
+{
+    switch (type) {
+    case TYPE_FLOAT:
+        return 'f';
+    case TYPE_DOUBLE:
+        return 'd';
+    case TYPE_COMPLEX_FLOAT:
+        return 'c';
+    case TYPE_COMPLEX_DOUBLE:
+        return 'z';
+    default:
+        return 0;
+    }
+}
+
+/*
+ * Name of the OpenCL built-in type that represents a data type.
+ *
+ *   TYPE_FLOAT          -> "float"
+ *   TYPE_DOUBLE         -> "double"
+ *   TYPE_COMPLEX_FLOAT  -> "float2"
+ *   TYPE_COMPLEX_DOUBLE -> "double2"
+ *   anything else       -> NULL
+ */
+const char
+*dtypeBuiltinType(DataType dtype)
+{
+    switch (dtype) {
+    case TYPE_FLOAT:
+        return "float";
+    case TYPE_DOUBLE:
+        return "double";
+    case TYPE_COMPLEX_FLOAT:
+        return "float2";
+    case TYPE_COMPLEX_DOUBLE:
+        return "double2";
+    default:
+        return NULL;
+    }
+}
+
+/*
+ * Name of the GPtr/LPtr/PPtr union member that points to a data type.
+ *
+ *   TYPE_FLOAT          -> "f"
+ *   TYPE_DOUBLE         -> "d"
+ *   TYPE_COMPLEX_FLOAT  -> "f2v"
+ *   TYPE_COMPLEX_DOUBLE -> "d2v"
+ *   anything else       -> NULL
+ */
+const char
+*dtypeUPtrField(DataType dtype)
+{
+    switch (dtype) {
+    case TYPE_FLOAT:
+        return "f";
+    case TYPE_DOUBLE:
+        return "d";
+    case TYPE_COMPLEX_FLOAT:
+        return "f2v";
+    case TYPE_COMPLEX_DOUBLE:
+        return "d2v";
+    default:
+        return NULL;
+    }
+}
+
+/*
+ * Source text of the constant "one" for the given data type:
+ *   complex types -> "(double2)(1, 0)" or "(float2)(1, 0)"
+ *   other types   -> "1"
+ */
+const char
+*strOne(DataType dtype)
+{
+    if (!isComplexType(dtype)) {
+        return "1";
+    }
+
+    /* complex: the literal's vector type follows the base precision */
+    if (isDoubleBasedType(dtype)) {
+        return "(double2)(1, 0)";
+    }
+
+    return "(float2)(1, 0)";
+}
+
+/*
+ * Report the vector type whose byte size is vecLen * dtypeSize(dtype): its
+ * name (e.g. "float4") via typeName, the matching pointer-union member (e.g.
+ * "f4v") via typePtrName. NULL outputs are skipped; no match yields "".
+ */
+void
+getVectorTypeName(DataType dtype, unsigned int vecLen,
+                  const char **typeName, const char **typePtrName)
+{
+    const char *tn = "", *tpn = "";     /* literals: keep them const */
+    if (isDoubleBasedType(dtype)) {
+        switch (vecLen * dtypeSize(dtype)) {
+        case sizeof(cl_double):
+            tn = "double";
+            tpn = "d";
+            break;
+        case sizeof(cl_double2):
+            tn = "double2";
+            tpn = "d2v";
+            break;
+        case sizeof(cl_double4):
+            tn = "double4";
+            tpn = "d4v";
+            break;
+        case sizeof(cl_double8):
+            tn = "double8";
+            tpn = "d8v";
+            break;
+        case sizeof(cl_double16):
+            tn = "double16";
+            tpn = "d16v";
+            break;
+        }
+    }
+    else {
+        switch (vecLen * dtypeSize(dtype)) {
+        case sizeof(cl_float):
+            tn = "float";
+            tpn = "f";
+            break;
+        case sizeof(cl_float2):
+            tn = "float2";
+            tpn = "f2v";
+            break;
+        case sizeof(cl_float4):
+            tn = "float4";
+            tpn = "f4v";
+            break;
+        case sizeof(cl_float8):
+            tn = "float8";
+            tpn = "f8v";
+            break;
+        case sizeof(cl_float16):
+            tn = "float16";
+            tpn = "f16v";
+            break;
+        }
+    }
+    if (typeName != NULL) {
+        *typeName = tn;
+    }
+    if (typePtrName != NULL) {
+        *typePtrName = tpn;
+    }
+}
+
+/*
+ * Emit a barrier() call using CLK_LOCAL_MEM_FENCE when 'fence' equals it
+ * and CLK_GLOBAL_MEM_FENCE otherwise. Returns 0 or -EOVERFLOW.
+ */
+int
+kgenAddBarrier(struct KgenContext *ctx, CLMemFence fence)
+{
+    const char *stmt = "barrier(CLK_GLOBAL_MEM_FENCE);\n";
+
+    if (fence == CLK_LOCAL_MEM_FENCE) {
+        stmt = "barrier(CLK_LOCAL_MEM_FENCE);\n";
+    }
+
+    /* any failure of kgenAddStmt() is reported as -EOVERFLOW */
+    if (kgenAddStmt(ctx, stmt)) {
+        return -EOVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Emit a mem_fence() call using CLK_LOCAL_MEM_FENCE when 'fence' equals
+ * it and CLK_GLOBAL_MEM_FENCE otherwise. Returns 0 or -EOVERFLOW.
+ */
+int
+kgenAddMemFence(struct KgenContext *ctx, CLMemFence fence)
+{
+    const char *stmt = "mem_fence(CLK_GLOBAL_MEM_FENCE);\n";
+
+    if (fence == CLK_LOCAL_MEM_FENCE) {
+        stmt = "mem_fence(CLK_LOCAL_MEM_FENCE);\n";
+    }
+
+    /* any failure of kgenAddStmt() is reported as -EOVERFLOW */
+    if (kgenAddStmt(ctx, stmt)) {
+        return -EOVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Declare 'lidName' as the linear local id (get_local_id(0), plus
+ * get_local_id(1) * wgSize[0] unless wgDim == 1). Returns 0 or -EOVERFLOW.
+ */
+int
+kgenDeclareLocalID(struct KgenContext *ctx, const char *lidName,
+                   const PGranularity *pgran)
+{
+    char tmp[128];
+    int len;
+    if (pgran->wgDim == 1) {
+        len = snprintf(tmp, sizeof(tmp),
+            "const int %s = get_local_id(0);\n", lidName);
+    }
+    else {
+        len = snprintf(tmp, sizeof(tmp),
+            "const int %s = get_local_id(1) * %u + get_local_id(0);\n",
+            lidName, pgran->wgSize[0]);
+    }
+    return (len < 0 || (size_t)len >= sizeof(tmp) || kgenAddStmt(ctx, tmp))
+           ? -EOVERFLOW : 0;   /* includes text that does not fit tmp */
+}
+
+/*
+ * Declare 'gidName' as the linear work-group id, derived from the global
+ * ids and sizes divided by the work-group size. Returns 0 or -EOVERFLOW.
+ */
+int
+kgenDeclareGroupID(struct KgenContext *ctx, const char *gidName,
+                   const PGranularity *pgran)
+{
+    char tmp[128];
+    int len;
+
+    if (pgran->wgDim == 1) {
+        len = snprintf(tmp, sizeof(tmp),
+            "const int %s = get_global_id(0) / %u;\n",
+            gidName, pgran->wgSize[0]);
+    }
+    else {
+        len = snprintf(tmp, sizeof(tmp),
+            "const int %s = (get_global_id(1) / %u) * "
+            "(get_global_size(0) / %u) + get_global_id(0) / %u;\n",
+            gidName, pgran->wgSize[1], pgran->wgSize[0], pgran->wgSize[0]);
+    }
+    return (len < 0 || (size_t)len >= sizeof(tmp) || kgenAddStmt(ctx, tmp))
+           ? -EOVERFLOW : 0;   /* includes text that does not fit tmp */
+}
+
+/* Emit the pointer-union typedefs (with doubles if asked); 0 or -EOVERFLOW. */
+int
+kgenDeclareUptrs(struct KgenContext *ctx, bool withDouble)
+{
+    const char *decl = uptrsSingleDeclaration;
+
+    if (withDouble) {
+        decl = uptrsFullDeclaration;
+    }
+    return kgenAddStmt(ctx, decl) ? -EOVERFLOW : 0;
+}
+
+/* Copy 'str' into kstr->buf; over-long input asserts, else is truncated. */
+void
+kstrcpy(Kstring *kstr, const char *str)
+{
+    const size_t lastByte = sizeof(kstr->buf) - 1;
+    assert(strlen(str) <= lastByte);
+    strncpy(kstr->buf, str, lastByte);  /* may leave buf unterminated */
+    kstr->buf[lastByte] = '\0';         /* so terminate even when NDEBUG */
+}
+
+/* printf-style formatting into kstr->buf; asserts that the output fits. */
+void
+ksprintf(Kstring *kstr, const char *fmt,...)
+{
+    va_list args;
+    int written;
+
+    va_start(args, fmt);
+    written = vsnprintf(kstr->buf, sizeof(kstr->buf), fmt, args);
+    va_end(args);
+
+    /* 'written' is otherwise unused when assert() compiles away */
+#ifdef NDEBUG
+    (void)written;
+#endif
+    assert((size_t)written < sizeof(kstr->buf));
+}
+
+void
+kstrcatf(Kstring *kstr, const char *fmt,...)
+{
+    va_list ap;
+    int len, maxlen;
+
+    va_start(ap, fmt);
+    len = (int)strlen(kstr->buf);
+    maxlen = sizeof(kstr->buf) - len;
+    len = vsnprintf(kstr->buf + len, maxlen, fmt, ap);
+    va_end(ap);
+
+    assert(len < maxlen);
+}
+
+
diff --git a/src/library/common/kgen_guard.c b/src/library/common/kgen_guard.c
new file mode 100644
index 0000000..d785905
--- /dev/null
+++ b/src/library/common/kgen_guard.c
@@ -0,0 +1,159 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <kerngen.h>
+#include <list.h>
+#include <dis_warning.h>
+
+typedef struct FuncNode {
+    void *pattern;                      /* malloc'ed copy of the pattern */
+    char funcName[FUNC_NAME_MAXLEN];    /* name recorded for that pattern */
+    ListNode node;                      /* link in KgenGuard's funcs list */
+} FuncNode;
+
+typedef struct FuncNodeKey {
+    const void *pattern;    /* pattern being looked up */
+    size_t patSize;         /* number of bytes funcNodeCmp() compares */
+} FuncNodeKey;
+
+struct KgenGuard {
+    struct KgenContext *ctx;    /* context passed to genCallback */
+    int (*genCallback)(struct KgenContext*, const void*);   /* generator */
+    size_t patSize;             /* size in bytes of a pattern */
+    ListHead funcs;             /* list of FuncNode entries */
+};
+
+/* List comparator: 0 iff the node's pattern equals the key's (memcmp). */
+static int
+funcNodeCmp(const ListNode *n, const void *key)
+{
+    const FuncNodeKey *wanted = (FuncNodeKey*)key;
+    const FuncNode *have = container_of(n, node, FuncNode);
+    return memcmp(have->pattern, wanted->pattern, wanted->patSize);
+}
+
+/* List action: release a FuncNode together with its pattern copy. */
+static void
+destroyFuncNode(ListNode *node)
+{
+    FuncNode *victim = container_of(node, node, FuncNode);
+    free(victim->pattern);
+    free(victim);
+}
+
+/* Allocate an empty guard for 'ctx'; returns NULL when malloc() fails. */
+struct KgenGuard
+*createKgenGuard(
+    struct KgenContext *ctx,
+    int (*genCallback)(struct KgenContext *ctx, const void *pattern),
+    size_t patSize)
+{
+    struct KgenGuard *g = malloc(sizeof(struct KgenGuard));
+
+    if (g == NULL) {
+        return NULL;
+    }
+    g->ctx = ctx;
+    g->genCallback = genCallback;
+    g->patSize = patSize;
+    listInitHead(&g->funcs);
+    return g;
+}
+
+/* Drop every remembered function, then rebind the guard to new settings. */
+void
+reinitKgenGuard(
+    struct KgenGuard *guard, struct KgenContext *ctx,
+    int (*genCallback)(struct KgenContext *ctx, const void *pattern),
+    size_t patSize)
+{
+    listDoForEachSafe(&guard->funcs, destroyFuncNode);
+    listInitHead(&guard->funcs);
+    guard->patSize = patSize;
+    guard->genCallback = genCallback;
+    guard->ctx = ctx;
+}
+
+/*
+ * Store in 'name' (at most 'nameLen' bytes, NUL-terminated) the name of
+ * the function matching 'pattern'. When no function is recorded for the
+ * pattern yet, the guard's generator callback is invoked first and the
+ * name reported by kgenGetLastFuncName() is remembered for later calls.
+ *
+ * Returns 0 on success, -EOVERFLOW if the generator fails, -ENOMEM if
+ * the bookkeeping node cannot be allocated.
+ */
+int
+findGenerateFunction(
+    struct KgenGuard *guard,
+    const void *pattern,
+    char *name,
+    size_t nameLen)
+{
+    FuncNodeKey fkey = {pattern, guard->patSize};
+    FuncNode *fnode;
+    ListNode *n;
+
+    /* reuse the recorded function if this pattern was seen before */
+    n = listNodeSearch(&guard->funcs, &fkey, funcNodeCmp);
+    if (n != NULL) {
+        fnode = container_of(n, node, FuncNode);
+    }
+    else {
+        if (guard->genCallback(guard->ctx, pattern)) {
+            return -EOVERFLOW;
+        }
+
+        fnode = malloc(sizeof(FuncNode));
+        if (fnode == NULL) {
+            return -ENOMEM;
+        }
+        fnode->pattern = malloc(guard->patSize);
+        if (fnode->pattern == NULL) {
+            free(fnode);
+            return -ENOMEM;
+        }
+
+        /* record the pattern together with the freshly generated name */
+        memcpy(fnode->pattern, pattern, guard->patSize);
+        kgenGetLastFuncName(fnode->funcName,
+                            sizeof(fnode->funcName),
+                            guard->ctx);
+        fnode->funcName[FUNC_NAME_MAXLEN - 1] = '\0';
+        listAddToTail(&guard->funcs, &fnode->node);
+    }
+
+    /* nameLen == 0 leaves no room even for the terminator, and */
+    /* name[nameLen - 1] would then index far outside the buffer */
+    if (nameLen > 0) {
+        strncpy(name, fnode->funcName, nameLen);
+        name[nameLen - 1] = '\0';
+    }
+
+    return 0;
+}
+
+void
+destroyKgenGuard(struct KgenGuard *guard)
+{
+    listDoForEachSafe(&guard->funcs, destroyFuncNode);  /* free each node */
+    free(guard);                                        /* then the guard */
+}
+
diff --git a/src/library/common/kgen_loop_helper.c b/src/library/common/kgen_loop_helper.c
new file mode 100644
index 0000000..d48f5f4
--- /dev/null
+++ b/src/library/common/kgen_loop_helper.c
@@ -0,0 +1,105 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <kerngen.h>
+#include <mempat.h>
+
+
+/*
+ * Emit an unrolled loop body, wrapped in "for (oc = 0; oc < bound; oc++)"
+ * when loopCtl->ocName is set. Calls preUnroll, then genSingleVec per whole
+ * vector and genSingle per leftover element (genSingle for every element
+ * of complex double data or when genSingleVec is absent), then postUnroll.
+ * Returns 0, -EINVAL (bad dtype / no genSingle) or -EOVERFLOW (failure).
+ */
+int
+kgenLoopUnroll(
+    struct KgenContext *ctx,
+    LoopCtl *loopCtl,
+    DataType dtype,
+    const LoopUnrollers *unrollers,
+    void *priv)
+{
+    int ret = 0;
+    char tmp[1024];
+    unsigned long i, n;
+    unsigned int nfloats;
+    int vecLen, len;
+
+    if (!(dtype == TYPE_FLOAT || dtype == TYPE_DOUBLE ||
+          dtype == TYPE_COMPLEX_FLOAT || dtype == TYPE_COMPLEX_DOUBLE) ||
+        unrollers->genSingle == NULL) {
+        return -EINVAL;
+    }
+
+    nfloats = dtypeSize(dtype) / sizeof(cl_float);
+    vecLen = (unrollers->getVecLen == NULL)? FLOAT4_VECLEN
+                                           : unrollers->getVecLen(ctx, priv);
+
+    if (loopCtl->ocName) {
+        if (loopCtl->obConst) {
+            len = snprintf(tmp, sizeof(tmp), "for (%s = 0; %s < %lu; %s++)",
+                           loopCtl->ocName, loopCtl->ocName,
+                           loopCtl->outBound.val, loopCtl->ocName);
+        }
+        else {
+            len = snprintf(tmp, sizeof(tmp), "for (%s = 0; %s < %s; %s++)",
+                           loopCtl->ocName, loopCtl->ocName,
+                           loopCtl->outBound.name, loopCtl->ocName);
+        }
+        if (len < 0 || (size_t)len >= sizeof(tmp)) {
+            return -EOVERFLOW;
+        }
+        ret = kgenBeginBranch(ctx, tmp);
+    }
+
+    if (unrollers->preUnroll && !ret) {
+        ret = unrollers->preUnroll(ctx, priv);
+    }
+
+    /* whole vectors first, then the elements that do not fill a vector */
+    if ((dtype != TYPE_COMPLEX_DOUBLE) && unrollers->genSingleVec) {
+        n = loopCtl->inBound * nfloats / vecLen;
+        for (i = 0; (i < n) && !ret; i++) {
+            ret = unrollers->genSingleVec(ctx, priv);
+        }
+        n = loopCtl->inBound % (vecLen / nfloats);
+    }
+    else {
+        n = loopCtl->inBound;
+    }
+
+    for (i = 0; (i < n) && !ret; i++) {
+        ret = unrollers->genSingle(ctx, priv);
+    }
+
+    if (unrollers->postUnroll && !ret) {
+        ret = unrollers->postUnroll(ctx, priv);
+    }
+    if (loopCtl->ocName && !ret) {
+        ret = kgenEndBranch(ctx, NULL);
+    }
+
+    /* a nonzero 'ret' is a generator failure; it used to be returned as 0 */
+    return ret ? -EOVERFLOW : 0;
+}
+
diff --git a/src/library/common/list.c b/src/library/common/list.c
new file mode 100644
index 0000000..c381462
--- /dev/null
+++ b/src/library/common/list.c
@@ -0,0 +1,136 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stddef.h>
+#include <list.h>
+#include <assert.h>
+
+/* Splice 'node' into the list immediately after 'prev'. */
+static __inline
+void listAddAfter(ListNode *prev, ListNode *node)
+{
+    /* link the new node first, then repoint both of its neighbours */
+    node->prev = prev;
+    node->next = prev->next;
+    prev->next->prev = node;
+    prev->next = node;
+}
+
+void
+listAddToTail(ListHead *head, ListNode *node)
+{
+    listAddAfter(head->prev, node);     /* insert after the current tail */
+}
+
+void
+listAddToHead(ListHead *head, ListNode *node)
+{
+    listAddAfter(head, node);           /* node becomes head->next */
+}
+
+/* Unlink 'node' from its list; the node's own links are left untouched. */
+void
+listDel(ListNode *node)
+{
+#ifdef DEBUG
+    /* an empty list head links to itself; a sole element does not */
+    assert(node->next != node);
+#endif
+    node->prev->next = node->next;
+    node->next->prev = node->prev;
+}
+
+/* Unlink the last node (head->prev) and hand it back to the caller. */
+ListNode
+*listDelFromTail(ListHead *head)
+{
+    ListNode *tail = head->prev;
+
+    listDel(tail);
+    return tail;
+}
+
+/* Call act(node) for each node; node->next is read after the call. */
+void
+listDoForEach(ListHead *head, ListAction act)
+{
+    ListNode *cur;
+    for (cur = listNodeFirst(head); cur != head; cur = cur->next) {
+        act(cur);
+    }
+}
+
+/* Call act(node) for each node; 'act' may unlink or free its node. */
+void
+listDoForEachSafe(ListHead *head, ListAction act)
+{
+    ListNode *cur = listNodeFirst(head);
+    while (cur != head) {
+        ListNode *following = cur->next;    /* saved before act() runs */
+        act(cur);
+        cur = following;
+    }
+}
+
+/* Call act(node, actPriv) for each node; node->next is read afterwards. */
+void
+listDoForEachPriv(const ListHead *head, ListPrivAction act, void *actPriv)
+{
+    ListNode *cur;
+    for (cur = listNodeFirst(head); cur != head; cur = cur->next) {
+        act(cur, actPriv);
+    }
+}
+
+/* Call act(node, actPriv) per node; 'act' may unlink or free its node. */
+void
+listDoForEachPrivSafe(const ListHead *head, ListPrivAction act, void *actPriv)
+{
+    ListNode *cur = listNodeFirst(head);
+    while (cur != head) {
+        ListNode *following = cur->next;    /* saved before act() runs */
+        act(cur, actPriv);
+        cur = following;
+    }
+}
+
+/* First node for which cmp(node, key) returns 0, or NULL if there is none. */
+ListNode
+*listNodeSearch(const ListHead *head, const void *key, ListCmpFn cmp)
+{
+    ListNode *cur;
+
+    for (cur = listNodeFirst(head); cur != head; cur = cur->next) {
+        if (cmp(cur, key) == 0) {
+            return cur;
+        }
+    }
+    return NULL;
+}
+
+/* Number of nodes in the list, the head itself not counted. */
+size_t
+listLength(const ListHead *head)
+{
+    ListNode *cur = listNodeFirst(head);
+    size_t count = 0;
+    while (cur != head) {
+        count++;
+        cur = cur->next;
+    }
+    return count;
+}
diff --git a/src/library/common/misc.c b/src/library/common/misc.c
new file mode 100644
index 0000000..81f3e6b
--- /dev/null
+++ b/src/library/common/misc.c
@@ -0,0 +1,61 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <cltypes.h>
+
+/*
+ * Size in bytes of one element of a data type, as given by sizeof() of
+ * the matching OpenCL host type:
+ *
+ *   TYPE_FLOAT -> cl_float            TYPE_DOUBLE -> cl_double
+ *   TYPE_COMPLEX_FLOAT -> cl_float2   TYPE_COMPLEX_DOUBLE -> cl_double2
+ *   TYPE_UNSIGNED_INT -> cl_uint (for iAMAX)
+ *
+ * Any other value yields (unsigned int)-1.
+ */
+unsigned int
+dtypeSize(DataType type)
+{
+    switch (type) {
+    case TYPE_FLOAT:
+        return (unsigned int)sizeof(cl_float);
+    case TYPE_DOUBLE:
+        return (unsigned int)sizeof(cl_double);
+    case TYPE_COMPLEX_FLOAT:
+        return (unsigned int)sizeof(cl_float2);
+    case TYPE_COMPLEX_DOUBLE:
+        return (unsigned int)sizeof(cl_double2);
+    case TYPE_UNSIGNED_INT:
+        return (unsigned int)sizeof(cl_uint);
+    default:
+        return (unsigned int)(size_t)-1;
+    }
+}
+
+/* float4 units needed for 'width' elements of 'typeSize' bytes, rounded up */
+size_t
+fl4RowWidth(size_t width, size_t typeSize)
+{
+    const size_t perVec = sizeof(cl_float4) / typeSize;
+    size_t rows = width / perVec;
+
+    if (width % perVec != 0) {
+        rows++;
+    }
+    return rows;
+}
+
diff --git a/src/library/common/mutex.c b/src/library/common/mutex.c
new file mode 100644
index 0000000..ff87473
--- /dev/null
+++ b/src/library/common/mutex.c
@@ -0,0 +1,128 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <mutex.h>
+
+#if defined(_MSC_VER)
+
+#pragma warning(push,3)
+#include <windows.h>
+#pragma warning(pop)
+
+/* Create an unnamed, initially unowned Win32 mutex; NULL on failure. */
+mutex_t*
+mutexInit(void)
+{
+    HANDLE handle = CreateMutex(NULL, FALSE, NULL);
+
+    return (mutex_t*)handle;
+}
+
+/*
+ * Close the Win32 handle behind the mutex.
+ * Returns 0 on success, 1 if CloseHandle() fails (bad mutex, etc.).
+ */
+int
+mutexDestroy(mutex_t *_mutex)
+{
+    const BOOL closed = CloseHandle((HANDLE)_mutex);
+
+    return (closed == FALSE) ? 1 : 0;
+}
+
+/*
+ * Block until the mutex is acquired: WaitForSingleObjectEx() is called
+ * with an INFINITE timeout and bAlertable == FALSE.
+ * Returns 0 on WAIT_OBJECT_0, 1 for any other result (bad mutex, etc.).
+ */
+int
+mutexLock(mutex_t *_mutex)
+{
+    DWORD waited;
+
+    waited = WaitForSingleObjectEx((HANDLE)_mutex, INFINITE, FALSE);
+    return (waited == WAIT_OBJECT_0) ? 0 : 1;
+}
+
+/*
+ * Release ownership of the mutex.
+ * Returns 0 on success, 1 if ReleaseMutex() fails (bad mutex, etc.).
+ */
+int
+mutexUnlock(mutex_t *_mutex)
+{
+    const BOOL released = ReleaseMutex((HANDLE)_mutex);
+
+    return (released == FALSE) ? 1 : 0;
+}
+
+#else /* defined(_MSC_VER) */
+
+#include <stdlib.h>
+#include <pthread.h>
+
+/*
+ * Allocate and initialise a pthread mutex with default attributes.
+ * Returns NULL if either the allocation or pthread_mutex_init() fails.
+ */
+mutex_t*
+mutexInit(void)
+{
+    pthread_mutex_t *m = calloc(1, sizeof(pthread_mutex_t));
+
+    if (m != NULL && pthread_mutex_init(m, NULL) != 0) {
+        free(m);
+        m = NULL;
+    }
+    return (mutex_t*)m;
+}
+
+/*
+ * Destroy and free a mutex created by mutexInit().
+ * Returns 1, without freeing anything, if the mutex is NULL or if
+ * pthread_mutex_destroy() fails (busy or invalid); 0 on success.
+ */
+int
+mutexDestroy(mutex_t *_mutex)
+{
+    pthread_mutex_t *m = (pthread_mutex_t*)_mutex;
+
+    /* the || short-circuit keeps NULL away from pthread_mutex_destroy() */
+    if (m == NULL || pthread_mutex_destroy(m) != 0) {
+        return 1;
+    }
+    free(m);
+    return 0;
+}
+
+/* Lock the mutex; 0 on success, 1 if pthread_mutex_lock() fails. */
+int
+mutexLock(mutex_t *_mutex)
+{
+    const int rc = pthread_mutex_lock((pthread_mutex_t*)_mutex);
+    return (rc != 0) ? 1 : 0;
+}
+
+/* Unlock the mutex; 0 on success, 1 if pthread_mutex_unlock() fails. */
+int
+mutexUnlock(mutex_t *_mutex)
+{
+    const int rc = pthread_mutex_unlock((pthread_mutex_t*)_mutex);
+    return (rc != 0) ? 1 : 0;
+}
+
+#endif  /* defined (_MSC_VER) */
diff --git a/src/library/common/tests/CMakeLists.txt b/src/library/common/tests/CMakeLists.txt
new file mode 100644
index 0000000..213e0bc
--- /dev/null
+++ b/src/library/common/tests/CMakeLists.txt
@@ -0,0 +1,65 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+set(SRC_COMMON              # library sources shared by both test programs
+    ../list.c
+    ../clkern.c
+    ../kern_cache.c
+    ../kerngen_core.c
+    ../kgen_basic.c
+    ../kgen_loop_helper.c
+    ../kgen_guard.c
+    ../misc.c
+    ../gens/dblock_kgen.c
+    ../devinfo.c
+    ../devinfo-cache.c
+    ../mutex.c
+    ../trace_malloc.c
+)
+
+set(SRC_DBLOCK_KGEN         # sources of the t_dblock_kgen executable
+    ${SRC_COMMON}
+    t_dblock_kgen.c
+)
+
+set(SRC_GENS_CACHE          # sources of the t_gens_cache executable
+    ${SRC_COMMON}
+    t_gens_cache.c
+)
+
+include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include ${clBLAS_SOURCE_DIR}/src/blas/include)
+
+add_executable(t_dblock_kgen ${SRC_DBLOCK_KGEN})    # first test program
+target_link_libraries(t_dblock_kgen ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+
+add_executable(t_gens_cache ${SRC_GENS_CACHE})      # second test program
+target_link_libraries(t_gens_cache ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+
+if( TARGET_PLATFORM EQUAL 64 )
+	# CPack configuration; install both test executables under the *64 dirs
+	install( TARGETS t_dblock_kgen t_gens_cache
+			RUNTIME DESTINATION bin64
+			LIBRARY DESTINATION lib64
+			ARCHIVE DESTINATION lib64/import
+			)
+else()
+	# CPack configuration; install both test executables under the *32 dirs
+	install( TARGETS t_dblock_kgen t_gens_cache
+			RUNTIME DESTINATION bin32
+			LIBRARY DESTINATION lib32
+			ARCHIVE DESTINATION lib32/import
+			)
+endif()
diff --git a/src/library/common/tests/t_dblock_kgen.c b/src/library/common/tests/t_dblock_kgen.c
new file mode 100644
index 0000000..75987f9
--- /dev/null
+++ b/src/library/common/tests/t_dblock_kgen.c
@@ -0,0 +1,1389 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * data block processing function
+ * generators test
+ *
+ * NOTES:
+ *    1) The test can run incorrectly on devices with
+ *       a wavefront size of less than 64.
+ *    2) The test with the -n and/or -o option will not work
+ *       on CPU since unaligned access to vector data is
+ *       not allowed on it.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <string.h>
+#include <assert.h>
+
+#include <clkern.h>
+#include <dblock_kgen.h>
+
+/* Larger of two values; each argument may be evaluated twice. */
+#define MAX(a, b)           (((b) > (a)) ? (b) : (a))
+/* Number of elements of a true array (must not be used on a pointer). */
+#define ARRAY_LENGTH(ar)    (sizeof((ar)) / sizeof((ar)[0]))
+
+/*
+ * Read both components of the element at 'ptr', viewed as the vector
+ * type 'type' (anything with an s[] member), into the lvalues 're' and 'img'.
+ */
+#define EXTRACT_COMPLEX_DOUBLE(ptr, type, re, img)                  \
+do {                                                                \
+    type *ptr1 = (type*)(ptr);                                      \
+                                                                    \
+    (re) = ptr1->s[0];                                              \
+    (img) = ptr1->s[1];                                             \
+} while (0)
+
+/*
+ * Complex multiplication: *mul1 = *mul1 * *mul2, both viewed as 'type';
+ * s[0] holds the real part and s[1] the imaginary part.
+ * The original value of *mul1 is saved first so that both result
+ * components are computed from the unmodified operand.
+ */
+#define MUL_COMPLEX(mul1, mul2, type)                               \
+do {                                                                \
+    type *mul11 = (type*)(mul1);                                    \
+    type *mul21 = (type*)(mul2);                                    \
+    type tmp = *mul11;                                              \
+                                                                    \
+    mul11->s[0] = tmp.s[0] * mul21->s[0] - tmp.s[1] * mul21->s[1];  \
+    mul11->s[1] = tmp.s[0] * mul21->s[1] + tmp.s[1] * mul21->s[0];  \
+} while (0)
+
+/*
+ * Buffer capacities used by the test, 2^20 each.
+ * NOTE(review): presumably byte sizes of the generated-source and debug
+ * buffers; their users are outside this part of the file.
+ */
+enum {
+    SOURCE_BUFLEN = 1 << 20,
+    DEBUG_BUFLEN = 1 << 20
+};
+
+typedef enum TransposeType {    // selects which copy stage transposes the block (the -t option)
+    TRANSPOSE_LOCAL,        // transpose while copying into local memory
+    TRANSPOSE_GLOBAL,       // transpose while copying back into global memory
+    TRANSPOSE_BOTH          // transpose during both copies
+} TransposeType;
+
+typedef struct TestDesc {    // parameters of one block copy test case
+    cl_uint widthA;      // row length of matrix A in elements; passed to the kernel as lda
+    cl_uint heightA;     // number of rows in matrix A
+    cl_uint widthB;      // row length of matrix B in elements; passed to the kernel as ldb
+    cl_uint heightB;     // number of rows in matrix B
+    cl_uint srowA;       // start row of the block in matrix A
+    cl_uint scolA;       // start column of the block in matrix A
+    cl_uint srowB;       // start row of the block in matrix B
+    cl_uint scolB;       // start column of the block in matrix B
+    SubproblemDim dim;   // block size: dim.y rows of dim.x elements
+    PGranularity pgran;  // wgSize[0] is used as the work-group size at launch
+    bool transpose;      // the block is copied transposed
+    bool generic;        // use the generic generated functions (sizes passed at run time)
+    bool packedImages;   // several block rows may be packed into one image row
+    TransposeType transpType;   // copy stage doing the transposition; used when 'transpose' is set
+    // element data type
+    DataType type;
+} TestDesc;
+
+typedef struct FuncTable {    // per data type element operations; see funcTable[]
+    // store a random value into the matrix element 'a'
+    void (*fillRandom)(void *a);
+    // store the special bound marker value into the matrix element 'a'
+    void (*fillMarker)(void *a);
+    // compare two elements; returns 0 if they are equal, nonzero otherwise
+    int (*compare)(const void *a, const void *b);
+    // multiply the element 'a' by the element 'b' and store the product in 'a'
+    void (*mul)(void *a, const void *b);
+} FuncTable;
+
+typedef int                     // test entry point type; testMatrBlockRW() has this signature
+(*TestFn)(
+    struct KgenContext *ctx,    // kernel generator context the test emits code into
+    void *srcBuf,               // buffer holding the generated source; handed to the program build
+    TestDesc *tdesc,            // test case parameters
+    cl_device_id devID,         // device to build and run on
+    cl_context clCtx,           // OpenCL context used for memory objects and the program
+    cl_command_queue queue);    // queue the kernel is launched on
+
+extern char *optarg;    // getopt() option argument; read by the parse*() helpers
+
+const float boundMarker = 5.0;    // value the *FillMarker() functions store to mark untouched elements
+
+const char *usage =    // command line help text
+    "Usage: t_dblock_kgen -f <proc> [-c] [-t type] -d <type> [-n] [-o] [-g];\n"
+    "-c -- launch the CL code on CPU\n"
+    "-t -- transposed version: if option argument is 'local', transpose at copying\n"
+    "      to the local memory, if it is 'global', then transpose at copying to the\n"
+    "      global memory, if 'both' transpose at both the copying\n"
+    "-d -- data type: float, double, complex_float, complex_double\n"
+    "-n -- matrix width is not float4 aligned\n"
+    "-o -- start offset is not zero\n"
+    "-g -- generic (slow) version\n"
+    "-b -- several rows can be packed to one image row;\n";
+
+const char *rwBlockKernelDecl =    // printf format of the test kernel declaration; both %s take the element type name
+    "__kernel void\n"
+    "rwMatrBlockTest(\n"
+    "   __global %s *matrA,\n"
+    "   unsigned int lda,\n"
+    "   __global %s *matrB,\n"
+    "   unsigned int ldb,\n"
+    "   unsigned int srowA,\n"
+    "   unsigned int scolA,\n"
+    "   unsigned int srowB,\n"
+    "   unsigned int scolB)\n";
+
+const char *rwBlockKernelImgDecl =    // same declaration with two extra write-only image arguments
+    "__kernel void\n"
+    "rwMatrBlockTest(\n"
+    "   __global %s *matrA,\n"
+    "   unsigned int lda,\n"
+    "   __global %s *matrB,\n"
+    "   unsigned int ldb,\n"
+    "   unsigned int srowA,\n"
+    "   unsigned int scolA,\n"
+    "   unsigned int srowB,\n"
+    "   unsigned int scolB,\n"
+    "   __write_only image2d_t image1,\n"
+    "   __write_only image2d_t image2)\n";
+
+// type specific functions
+
+// for the  float type
+// float: store a pseudo-random whole number from 0 to 999 into the element 'a'
+static void
+fFillRandom(void *a)
+{
+    cl_float *elem = a;
+
+    *elem = random() % 1000;
+}
+
+// float: store the bound marker value into the element 'a'
+static void
+fFillMarker(void *a)
+{
+    cl_float *elem = a;
+
+    *elem = boundMarker;
+}
+
+// float: return 0 if the elements 'a' and 'b' are equal, 1 otherwise
+static int
+fCompare(const void *a, const void *b)
+{
+    const cl_float lhs = *(const cl_float*)a;
+    const cl_float rhs = *(const cl_float*)b;
+
+    return (lhs != rhs);
+}
+
+// float: multiply the element 'a' by the element 'b' in place
+static void
+fmul(void *a, const void *b)
+{
+    cl_float *dst = a;
+    const cl_float *factor = b;
+
+    *dst = *dst * *factor;
+}
+
+// for the double type
+
+// double: store a pseudo-random whole number from 0 to 999 into the element 'a'
+static void
+dFillRandom(void *a)
+{
+    cl_double *elem = a;
+
+    *elem = random() % 1000;
+}
+
+// double: store the bound marker value into the element 'a'
+static void
+dFillMarker(void *a)
+{
+    cl_double *elem = a;
+
+    *elem = boundMarker;
+}
+
+// double: return 0 if the elements 'a' and 'b' are equal, 1 otherwise
+static int
+dCompare(const void *a, const void *b)
+{
+    const cl_double lhs = *(const cl_double*)a;
+    const cl_double rhs = *(const cl_double*)b;
+
+    return (lhs != rhs);
+}
+
+// double: multiply the element 'a' by the element 'b' in place
+static void
+dmul(void *a, const void *b)
+{
+    cl_double *dst = a;
+    const cl_double *factor = b;
+
+    *dst = *dst * *factor;
+}
+
+// for the complex float type
+
+// complex float: store pseudo-random whole numbers from 0 to 999 into both
+// components of the element 'a', component 0 first
+static void
+cFillRandom(void *a)
+{
+    cl_float2 *elem = a;
+    int k;
+
+    for (k = 0; k < 2; k++) {
+        elem->s[k] = random() % 1000;
+    }
+}
+
+// complex float: store the bound marker into both components of the element 'a'
+static void
+cFillMarker(void *a)
+{
+    cl_float2 *elem = a;
+    int k;
+
+    for (k = 0; k < 2; k++) {
+        elem->s[k] = boundMarker;
+    }
+}
+
+// complex float: return 0 if both components of 'a' and 'b' match, 1 otherwise
+static int
+cCompare(const void *a, const void *b)
+{
+    const cl_float2 *lhs = a;
+    const cl_float2 *rhs = b;
+
+    return ((lhs->s[0] != rhs->s[0]) || (lhs->s[1] != rhs->s[1]));
+}
+
+// complex float: multiply the element 'a' by the element 'b' in place;
+// s[0] is the real part, s[1] the imaginary part
+static void
+cmul(void *a, const void *b)
+{
+    cl_float2 *dst = (cl_float2*)a;
+    cl_float2 *factor = (cl_float2*)b;
+    // keep the original value: both result components depend on it
+    cl_float2 saved = *dst;
+
+    dst->s[0] = saved.s[0] * factor->s[0] - saved.s[1] * factor->s[1];
+    dst->s[1] = saved.s[0] * factor->s[1] + saved.s[1] * factor->s[0];
+}
+
+// for the complex double type
+
+// complex double: store pseudo-random whole numbers from 0 to 999 into both
+// components of the element 'a'.
+// File-local like the other type helpers; it is only reached through funcTable.
+static void
+zFillRandom(void *a)
+{
+    cl_double2 *a1 = (cl_double2*)a;
+
+    a1->s[0] = random() % 1000;
+    a1->s[1] = random() % 1000;
+}
+
+// complex double: store the bound marker into both components of the element 'a'.
+// File-local like the other type helpers; it is only reached through funcTable.
+static void
+zFillMarker(void *a)
+{
+    cl_double2 *a1 = (cl_double2*)a;
+
+    a1->s[0] = boundMarker;
+    a1->s[1] = boundMarker;
+}
+
+// complex double: return 0 if both components of 'a' and 'b' match, 1 otherwise.
+// File-local like the other type helpers; it is only reached through funcTable.
+static int
+zCompare(const void *a, const void *b)
+{
+    const cl_double2 *a1 = (const cl_double2*)a;
+    const cl_double2 *b1 = (const cl_double2*)b;
+
+    return !((a1->s[0] == b1->s[0]) && (a1->s[1] == b1->s[1]));
+}
+
+// complex double: multiply the element 'a' by the element 'b' in place;
+// s[0] is the real part, s[1] the imaginary part
+static void
+zmul(void *a, const void *b)
+{
+    cl_double2 *dst = (cl_double2*)a;
+    cl_double2 *factor = (cl_double2*)b;
+    // keep the original value: both result components depend on it
+    cl_double2 saved = *dst;
+
+    dst->s[0] = saved.s[0] * factor->s[0] - saved.s[1] * factor->s[1];
+    dst->s[1] = saved.s[0] * factor->s[1] + saved.s[1] * factor->s[0];
+}
+
+/*
+ * Element operations, indexed by DataType value.
+ * NOTE(review): the row order (float, double, complex float, complex double)
+ * is assumed to follow the DataType enumerator order - confirm against its
+ * declaration.
+ */
+static FuncTable funcTable[TYPE_COMPLEX_DOUBLE + 1] = {
+    { .fillRandom = fFillRandom, .fillMarker = fFillMarker,
+      .compare = fCompare, .mul = fmul },
+    { .fillRandom = dFillRandom, .fillMarker = dFillMarker,
+      .compare = dCompare, .mul = dmul },
+    { .fillRandom = cFillRandom, .fillMarker = cFillMarker,
+      .compare = cCompare, .mul = cmul },
+    { .fillRandom = zFillRandom, .fillMarker = zFillMarker,
+      .compare = zCompare, .mul = zmul }
+};
+
+/*
+ * Fill a matrix of 'height' rows and 'width' columns, stored with a row
+ * stride of 'ld' elements of type 'dtype'. Every element gets the bound
+ * marker if 'marker' is true and a random value otherwise. Elements are
+ * visited row by row.
+ */
+static void
+fillMatrix(
+    cl_float *matr,
+    size_t height,
+    size_t width,
+    size_t ld,
+    DataType dtype,
+    bool marker)
+{
+    // number of cl_float slots occupied by one element
+    const unsigned int floatsPerElem = dtypeSize(dtype) / sizeof(cl_float);
+    void (*fillElem)(void*);
+    size_t row, col;
+
+    if (marker) {
+        fillElem = funcTable[dtype].fillMarker;
+    }
+    else {
+        fillElem = funcTable[dtype].fillRandom;
+    }
+
+    for (row = 0; row < height; row++) {
+        for (col = 0; col < width; col++) {
+            fillElem(matr + (row * ld + col) * floatsPerElem);
+        }
+    }
+}
+
+/*
+ * Compare the dim.y x dim.x block of matrix A starting at (srowA, scolA)
+ * with the block of matrix B starting at (srowB, scolB). Row strides are
+ * widthA and widthB elements.
+ *
+ * If tdesc->transpose is set and the transpose type is not TRANSPOSE_BOTH,
+ * element (i, j) of the A block is matched against element (j, i) of the
+ * B block; otherwise the blocks are compared element by element.
+ *
+ * The first mismatch is reported on stdout. Returns 0 if the blocks match
+ * and the nonzero comparator result otherwise.
+ */
+static int
+compareMatrices(void *matrA, void *matrB, const TestDesc *tdesc)
+{
+    size_t i, j;
+    unsigned int nfloats;
+    void *p1, *p2;
+    int ret = 0;
+    double a1, b1, a2, b2;
+
+    // elements are addressed in cl_float units
+    nfloats = dtypeSize(tdesc->type) / sizeof(cl_float);
+    for (i = 0; (i < tdesc->dim.y) && !ret; i++) {
+        for (j = 0; j < tdesc->dim.x; j++) {
+            p1 = (cl_float*)matrA + ((tdesc->srowA + i) * tdesc->widthA +
+                            tdesc->scolA + j) * nfloats;
+            if (tdesc->transpose && (tdesc->transpType != TRANSPOSE_BOTH)) {
+                p2 = (cl_float*)matrB + ((tdesc->srowB + j) * tdesc->widthB +
+                                tdesc->scolB + i) * nfloats;
+            }
+            else {
+                p2 = (cl_float*)matrB + ((tdesc->srowB + i) * tdesc->widthB +
+                               tdesc->scolB + j) * nfloats;
+            }
+            ret = funcTable[tdesc->type].compare(p1, p2);
+            if (ret) {
+                // the coordinates are size_t values, hence %zu
+                printf("The first error occurred at row %zu, column %zu "
+                       "of the block: ", i + tdesc->srowA, j + tdesc->scolA);
+                if ((tdesc->type == TYPE_FLOAT) ||
+                    (tdesc->type == TYPE_DOUBLE)) {
+
+                    if (tdesc->type == TYPE_FLOAT) {
+                        a1 = *(cl_float*)p1;
+                        b1 = *(cl_float*)p2;
+                    }
+                    else {
+                        a1 = *(cl_double*)p1;
+                        b1 = *(cl_double*)p2;
+                    }
+                    printf("value is %.5E but must be %.5E\n", b1, a1);
+                }
+                else {
+                    if (tdesc->type == TYPE_COMPLEX_FLOAT) {
+                        EXTRACT_COMPLEX_DOUBLE(p1, cl_float2, a1, a2);
+                        EXTRACT_COMPLEX_DOUBLE(p2, cl_float2, b1, b2);
+                    }
+                    else {
+                        EXTRACT_COMPLEX_DOUBLE(p1, cl_double2, a1, a2);
+                        EXTRACT_COMPLEX_DOUBLE(p2, cl_double2, b1, b2);
+                    }
+                    printf("value is (%.5E, %.5E) but must be (%.5E, %.5E)\n",
+                           b1, b2, a1, a2);
+                }
+                // leaves the inner loop; the outer one stops on !ret
+                break;
+            }
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Check that a rectangular region of a matrix still holds the bound marker.
+ *
+ * matr   - matrix base address
+ * dtype  - element data type
+ * srow   - first row of the region
+ * scol   - first column of the region
+ * nrRows - number of rows in the region
+ * nrCols - number of columns in the region
+ * rwidth - row stride of the matrix, in elements
+ *
+ * The first damaged element is reported on stdout. Returns 0 if the whole
+ * region is intact and the nonzero comparator result otherwise.
+ */
+static int
+checkBound(
+    void *matr,
+    DataType dtype,
+    size_t srow,
+    size_t scol,
+    size_t nrRows,
+    size_t nrCols,
+    size_t rwidth)
+{
+    size_t i, j;
+    unsigned int nfloats;
+    void *p;
+    int ret = 0;
+    double a1, a2;
+    // Reference marker element. A union gives storage that is large enough
+    // and correctly aligned for every supported element type.
+    union {
+        cl_float f;
+        cl_double d;
+        cl_float2 c;
+        cl_double2 z;
+    } marker;
+
+    nfloats = dtypeSize(dtype) / sizeof(cl_float);
+    funcTable[dtype].fillMarker(&marker);
+
+    for (i = 0; (i < nrRows) && !ret; i++) {
+        for (j = 0; j < nrCols; j++) {
+            p = (cl_float*)matr + ((srow + i) * rwidth +
+                           scol + j) * nfloats;
+            ret = funcTable[dtype].compare(p, &marker);
+            if (ret) {
+                // the coordinates are size_t values, hence %zu
+                printf("The bound marker first damaged at row %zu, column %zu "
+                       "of the block: ", i + srow, j + scol);
+                if ((dtype == TYPE_FLOAT) ||
+                    (dtype == TYPE_DOUBLE)) {
+
+                    if (dtype == TYPE_FLOAT) {
+                        a1 = *(cl_float*)p;
+                    }
+                    else {
+                        a1 = *(cl_double*)p;
+                    }
+                    printf("actual value is %.5E\n", a1);
+                }
+                else {
+                    if (dtype == TYPE_COMPLEX_FLOAT) {
+                        EXTRACT_COMPLEX_DOUBLE(p, cl_float2, a1, a2);
+                    }
+                    else {
+                        EXTRACT_COMPLEX_DOUBLE(p, cl_double2, a1, a2);
+                    }
+                    printf("actual value is (%.5E, %.5E)\n",
+                           a1, a2);
+                }
+                // leaves the inner loop; the outer one stops on !ret
+                break;
+            }
+        }
+    }
+
+    return ret;
+}
+
+// Check that nothing was written to matrix B outside the destination block:
+// the rows above it, the columns to its left and right, and the rows below
+// it must still hold the bound marker. Returns 0 if all of them are intact,
+// otherwise the result of the first failed check.
+static int
+checkMatrixBound(void *matrB, const TestDesc *tdesc)
+{
+    const bool swapped = tdesc->transpose &&
+                         (tdesc->transpType != TRANSPOSE_BOTH);
+    // extents of the block as selected by the transposition mode
+    const size_t blockCols = swapped ? tdesc->dim.y : tdesc->dim.x;
+    const size_t blockRows = swapped ? tdesc->dim.x : tdesc->dim.y;
+    int ret;
+
+    // rows above the block
+    if (tdesc->srowB) {
+        ret = checkBound(matrB, tdesc->type, 0, 0, tdesc->srowB,
+                         tdesc->widthB, tdesc->widthB);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    // columns to the left of the block
+    if (tdesc->scolB) {
+        ret = checkBound(matrB, tdesc->type, tdesc->srowB, 0, blockRows,
+                         tdesc->scolB, tdesc->widthB);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    // columns to the right of the block
+    if (tdesc->scolB + blockCols < tdesc->widthB) {
+        ret = checkBound(matrB, tdesc->type, tdesc->srowB,
+                         tdesc->scolB + blockCols, blockRows,
+                         tdesc->widthB - tdesc->scolB - blockCols,
+                         tdesc->widthB);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    // rows below the block
+    if (tdesc->srowB + blockRows < tdesc->heightB) {
+        return checkBound(matrB, tdesc->type,
+                          tdesc->srowB + blockRows, 0,
+                          tdesc->heightB - tdesc->srowB - blockRows,
+                          tdesc->widthB, tdesc->widthB);
+    }
+
+    return 0;
+}
+
+// Check that nothing was written to the image outside the packed block data.
+// Several block rows can be packed into a single image line, so the block
+// occupies whole packed lines plus possibly a partial last one. Checked are:
+// the part of every line to the right of the packed area, the tail of the
+// last partially filled line, and all lines below the data.
+// Returns 0 if all of them are intact, otherwise the first failed result.
+static int
+checkImageBound(void *imgB, const TestDesc *tdesc)
+{
+    const size_t rowsInLine = (tdesc->widthB / tdesc->dim.x);
+    // Size of packed line of rows, in tdesc->type's
+    const size_t pLine = rowsInLine * tdesc->dim.x;
+    // number of block elements, and how many spill into a partial line
+    size_t total, tail;
+    int startRow;
+    int ret;
+
+    // right
+    ret = checkBound(imgB, tdesc->type, 0, pLine, tdesc->heightB,
+                     tdesc->widthB - pLine, tdesc->widthB);
+    if (ret) {
+        return ret;
+    }
+
+    total = tdesc->dim.x * tdesc->dim.y;
+    tail = total % pLine;
+
+    // last image line tail
+    if (tail != 0) {
+        ret = checkBound(imgB, tdesc->type, total / pLine, tail, 1,
+                         (pLine - tail) % pLine, tdesc->widthB);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    // bottom: everything below the last line that holds data
+    startRow = total / pLine;
+    if (tail != 0) {
+        startRow++;
+    }
+
+    return checkBound(imgB, tdesc->type, startRow, 0,
+                      tdesc->heightB - startRow,
+                      tdesc->widthB, tdesc->widthB);
+}
+
+// Compare the dim.y x dim.x block of matrix A starting at (srowA, scolA)
+// with image data read back into 'imgB'. Several block rows can be packed
+// into a single image line: block element number 'index' (row-major) lives
+// in image line index / pLine at position index % pLine, where pLine is the
+// number of elements of a line used for packing.
+// The first mismatch is reported on stdout. Returns 0 if everything matches
+// and the nonzero comparator result otherwise.
+static int
+compareImage(void *matrA, void *imgB, const TestDesc *tdesc)
+{
+    size_t i, j;
+    unsigned int nfloats;
+    void *p1, *p2;
+    int ret = 0;
+    double a1, b1, a2, b2;
+
+    nfloats = dtypeSize(tdesc->type) / sizeof(cl_float);
+
+    for (i = 0; (i < tdesc->dim.y) && !ret; i++) {
+        for (j = 0; j < tdesc->dim.x; j++) {
+            // Size of packed line of rows, in tdesc->type's.
+            // size_t keeps the offset arithmetic below from being narrowed.
+            size_t pLine;
+            // absolute index of element in image
+            size_t index;
+            p1 = (cl_float*)matrA + ((tdesc->srowA + i) * tdesc->widthA +
+                    tdesc->scolA + j) * nfloats;
+            pLine = (tdesc->widthB / tdesc->dim.x) * tdesc->dim.x;
+            index = i * tdesc->dim.x + j;
+
+            p2 = (cl_float*)imgB + ((index / pLine) * tdesc->widthB +
+                  index % pLine) * nfloats;
+            ret = funcTable[tdesc->type].compare(p1, p2);
+
+            if (ret) {
+                // the coordinates are size_t values, hence %zu
+                printf("The first error occurred at row %zu, column %zu "
+                        "of the block: ", i + tdesc->srowA, j + tdesc->scolA);
+                if ((tdesc->type == TYPE_FLOAT) ||
+                        (tdesc->type == TYPE_DOUBLE)) {
+
+                    if (tdesc->type == TYPE_FLOAT) {
+                        a1 = *(cl_float*)p1;
+                        b1 = *(cl_float*)p2;
+                    }
+                    else {
+                        a1 = *(cl_double*)p1;
+                        b1 = *(cl_double*)p2;
+                    }
+                    printf("value is %.5E but must be %.5E\n", b1, a1);
+                }
+                else {
+                    if (tdesc->type == TYPE_COMPLEX_FLOAT) {
+                        EXTRACT_COMPLEX_DOUBLE(p1, cl_float2, a1, a2);
+                        EXTRACT_COMPLEX_DOUBLE(p2, cl_float2, b1, b2);
+                    }
+                    else {
+                        EXTRACT_COMPLEX_DOUBLE(p1, cl_double2, a1, a2);
+                        EXTRACT_COMPLEX_DOUBLE(p2, cl_double2, b1, b2);
+                    }
+                    printf("value is (%.5E, %.5E) but must be (%.5E, %.5E)\n",
+                            b1, b2, a1, a2);
+                }
+                // leaves the inner loop; the outer one stops on !ret
+                break;
+            }
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Get one device of the given type from the first available platform.
+ *
+ * id   - receives the device ID
+ * type - device type, passed on to clGetDeviceIDs()
+ *
+ * Returns 0 (CL_SUCCESS) on success and a nonzero value on failure: the
+ * status of the first failing OpenCL call, or CL_DEVICE_NOT_FOUND if no
+ * platform is reported. The function stops at the first failure so that an
+ * unset platform handle is never used.
+ */
+static cl_uint
+get_cl_device(cl_device_id *id, int type)
+{
+    cl_int status;
+    cl_uint numEnt = 0;
+    cl_platform_id platform;
+
+    status = clGetPlatformIDs(0, NULL, &numEnt);
+    if (status != CL_SUCCESS) {
+        return (cl_uint)status;
+    }
+    if (numEnt == 0) {
+        return (cl_uint)CL_DEVICE_NOT_FOUND;
+    }
+
+    status = clGetPlatformIDs(1, &platform, NULL);
+    if (status != CL_SUCCESS) {
+        return (cl_uint)status;
+    }
+
+    status = clGetDeviceIDs(platform, (cl_device_type)type, 1, id, &numEnt);
+
+    return (cl_uint)status;
+}
+
+// Create the buffer objects needed for a test case, on top of the host
+// memory 'matrA' / 'matrB' (CL_MEM_USE_HOST_PTR).
+//
+// aobj may be NULL, in which case only the B buffer is created; bobj must
+// not be NULL. asize and bsize are the buffer sizes in bytes.
+//
+// Returns CL_SUCCESS or the status of the failed creation. If creating the
+// B buffer fails, an already created A buffer is released and *aobj is
+// reset to NULL, so nothing is left to clean up on failure.
+static cl_int
+createBufferObjs(
+    void *matrA,
+    void *matrB,
+    cl_mem *aobj,
+    cl_mem *bobj,
+    cl_context ctx,
+    size_t asize,
+    size_t bsize)
+{
+    cl_int status;
+
+    if (aobj != NULL) {
+        *aobj = clCreateBuffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR),
+                               asize, matrA, &status);
+        if (*aobj == NULL) {
+            // sizes are size_t values, hence %zu
+            printf("Memory object creation for A matrix failed, status = %d, "
+                   "asize = %zu\n", status, asize);
+            return status;
+        }
+    }
+
+    *bobj = clCreateBuffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR),
+                           bsize, matrB, &status);
+    if (*bobj == NULL) {
+        printf("Memory object creation for B matrix failed, status = %d, "
+               "bsize = %zu\n", status, bsize);
+        if (aobj) {
+            clReleaseMemObject(*aobj);
+            *aobj = NULL;
+        }
+    }
+
+    return status;
+}
+
+// Create the image objects needed for a test case: write-only 2D images in
+// CL_RGBA / CL_FLOAT format on top of the host memory 'img1' / 'img2'
+// (CL_MEM_USE_HOST_PTR). The image size is given in pixels.
+//
+// Either of img1obj / img2obj may be NULL; that image is then skipped.
+//
+// Returns CL_SUCCESS (also when both images are skipped) or the status of
+// the failed creation. If the second image fails, an already created first
+// image is released and its handle is reset to NULL.
+static cl_int
+createImageObjs(
+    void *img1,
+    void *img2,
+    cl_mem *img1obj,
+    cl_mem *img2obj,
+    cl_context ctx,
+    size_t pixels_width,
+    size_t pixels_height)
+{
+    cl_mem *objs[2] = {img1obj, img2obj};
+    void *bufs[2] = {img1, img2};
+    const char *names[2]={"first", "second"};
+    const cl_image_format format = { CL_RGBA, CL_FLOAT };
+    // stays CL_SUCCESS if there is nothing to create
+    cl_int status = CL_SUCCESS;
+    int i;
+
+    for (i=0; i<2; i++) {
+        if (objs[i] == NULL) {
+            continue;
+        }
+        *objs[i] = clCreateImage2D(ctx,
+                (CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR), &format,
+                pixels_width, pixels_height, 0, bufs[i], &status);
+        if (status != CL_SUCCESS) {
+            // sizes are size_t values, hence %zu
+            printf("Memory object creation for %s image failed, status = %d, "
+                           "width = %zupx, height = %zupx\n", names[i], status,
+                           pixels_width, pixels_height);
+            if ((i == 1) && (objs[0] != NULL)) {
+                // the first image was created successfully, release it and
+                // do not leave a dangling handle with the caller
+                clReleaseMemObject(*objs[0]);
+                *objs[0] = NULL;
+            }
+            break;
+        }
+    }
+
+    return status;
+}
+
+// Build the program from the source in 'srcBuf' and create the kernel named
+// 'kernName' from it.
+//
+// On success the kernel is returned and *program holds the built program;
+// the caller releases both. On failure NULL is returned, *program is NULL
+// and a diagnostic (the build log or the failing source) is printed.
+static cl_kernel
+createKernel(
+    const char *kernName,
+    char *srcBuf,
+    cl_context ctx,
+    cl_device_id devID,
+    cl_program *program)
+{
+    char log[65536];
+    cl_int status = CL_SUCCESS;
+    cl_kernel krn = NULL;
+
+    // make sure the log is a valid string even if the build step fails
+    // before writing anything into it
+    log[0] = '\0';
+
+    *program = buildClProgram(srcBuf, NULL, ctx, devID, log,
+                              sizeof(log), &status);
+    if (*program == NULL) {
+        printf("Program building failed, status = %d, log info:\n%s\n",
+               status, log);
+    }
+    else {
+        krn = clCreateKernel(*program, kernName, &status);
+        if (krn == NULL) {
+            printf("Kernel creation failed, status = %d\n", status);
+            clReleaseProgram(*program);
+            *program = NULL;
+            printf("failed program code: \"%s\"\n", srcBuf);
+            fflush(stdout);
+        }
+    }
+
+    return krn;
+}
+
+// Release a pair of memory objects. Either handle may be NULL; NULL handles
+// are skipped so that a partially created pair can be passed safely.
+static void
+releaseBufferObjs(
+    cl_mem aobj,
+    cl_mem bobj)
+{
+    if (aobj != NULL) {
+        clReleaseMemObject(aobj);
+    }
+    if (bobj != NULL) {
+        clReleaseMemObject(bobj);
+    }
+}
+
+/*
+ * Test the generated data block copying functions.
+ *
+ * The test generates the read (global to local) and write (local to global)
+ * functions and, when images are used, the global-to-image and
+ * local-to-image ones. It then wraps them into the "rwMatrBlockTest" kernel,
+ * which copies a block of matrix A through local memory into matrix B and
+ * optionally into two images. After the launch, the destination data is
+ * compared with the source block and everything around the destination is
+ * checked to still hold the bound marker.
+ *
+ * Images are used only on a GPU device, for non transposed copying and when
+ * the block row size is a multiple of float4.
+ *
+ * ctx    - generator context; reset at the start
+ * srcBuf - buffer holding the generated source, passed to the program build
+ * tdesc  - test case description
+ * devID, clCtx, queue - OpenCL device, context and queue to run on
+ *
+ * Returns 0 on success and nonzero on any failure. All host buffers and
+ * OpenCL objects created here are released on every path.
+ */
+static int
+testMatrBlockRW(
+    struct KgenContext *ctx,
+    void *srcBuf,
+    TestDesc *tdesc,
+    cl_device_id devID,
+    cl_context clCtx,
+    cl_command_queue queue)
+{
+    cl_float *matrA;
+    cl_float *matrB;
+    cl_float *img1;
+    cl_float *img2;
+    TestDesc tdescImage;
+    cl_mem aobj = NULL, bobj = NULL;
+    cl_mem img1obj = NULL, img2obj = NULL;
+    unsigned int tsize;
+    char tmp[1024];
+    KernelDesc kdesc;
+    const char *s, *s1;
+    int ret;
+    // read, write, global to image, local to image functions names
+    char rname[128], wname[128], giname[128], liname[128];
+    size_t size, asize, bsize;
+    // width and height in pixels, size in bytes
+    size_t imageWidth, imageHeight, imgSize;
+    cl_program program = NULL;
+    cl_device_type devType;
+    KernelArg *karg;
+    KernelErrorInfo errInfo;
+    cl_event event;
+    cl_int status;
+    SubproblemDim dim, *pdim;
+    // local memory block leading dimension for generic read and write back
+    size_t ld;
+    bool testImages;
+    DBlockCopyFlags flags = 0;
+    unsigned int nfloats;
+    bool b;
+
+    memset(&kdesc, 0, sizeof(kdesc));
+    rname[0] = wname[0] = giname[0] = liname[0] = '\0';
+
+    clGetDeviceInfo(devID, CL_DEVICE_TYPE, sizeof(devType), &devType, NULL);
+
+    tsize = dtypeSize(tdesc->type);
+    // element size in cl_float units
+    nfloats = tsize / sizeof(cl_float);
+    if (((tdesc->dim.x * tsize) % sizeof(cl_float4) == 0) &&
+        (devType == CL_DEVICE_TYPE_GPU) && !tdesc->transpose) {
+
+        testImages = true;
+    }
+    else {
+        printf("Size of row is not float4 aligned, or the target device is CPU,"
+               "or copying should be transposed, images are not used.\n");
+        testImages = false;
+    }
+    resetKgenContext(ctx);
+    asize = tdesc->heightA * tdesc->widthA * tsize;
+    bsize = tdesc->heightB * tdesc->widthB * tsize;
+
+    // Size of images in pixels. Each pixel is float4.
+    if (tdesc->packedImages) {
+        // wide enough for several block rows per image row
+        imageWidth = fl4RowWidth(tdesc->dim.x * 3.5, tsize);
+        imageHeight = tdesc->dim.y;
+    }
+    else {
+        imageWidth = fl4RowWidth(tdesc->dim.x, tsize);
+        imageHeight = tdesc->dim.y;
+    }
+
+    imgSize = imageHeight * imageWidth * sizeof(cl_float4);
+
+    matrA = malloc(asize);
+    matrB = malloc(bsize);
+    img1 = malloc(imgSize);
+    img2 = malloc(imgSize);
+    if (!matrA || !matrB || !img1 || !img2) {
+        printf("Memory allocation failed\n");
+        // some of the allocations may have succeeded, free them
+        ret = -1;
+        goto free_host;
+    }
+    // A gets random data, all the destinations are filled with the marker
+    fillMatrix(matrA, tdesc->heightA, tdesc->widthA, tdesc->widthA,
+               tdesc->type, false);
+    fillMatrix(matrB, tdesc->heightB, tdesc->widthB, tdesc->widthB,
+               tdesc->type, true);
+    fillMatrix(img1, imageHeight, imageWidth * FLOAT4_VECLEN / nfloats,
+               imageWidth * FLOAT4_VECLEN / nfloats, tdesc->type, true);
+    fillMatrix(img2, imageHeight, imageWidth * FLOAT4_VECLEN / nfloats,
+               imageWidth * FLOAT4_VECLEN / nfloats, tdesc->type, true);
+
+    if (createBufferObjs(matrA, matrB, &aobj, &bobj, clCtx, asize, bsize)
+            !=  CL_SUCCESS) {
+        // createBufferObjs() leaves no memory object behind on failure
+        ret = -1;
+        goto free_host;
+    }
+    if (testImages) {
+        // function gets width in float4's
+        if (createImageObjs(img1, img2, &img1obj, &img2obj, clCtx,
+                imageWidth, imageHeight)
+                != CL_SUCCESS) {
+            releaseBufferObjs(aobj, bobj);
+            ret = -1;
+            goto free_host;
+        }
+    }
+
+    b = isDoubleBasedType(tdesc->type);
+    kgenDeclareUptrs(ctx, b);
+    kgenAddBlankLine(ctx);
+
+    s = dtypeBuiltinType(tdesc->type);
+    s1 = dtypeUPtrField(tdesc->type);
+
+    // the generic functions take the block size at run time
+    pdim = (tdesc->generic) ? NULL : &dim;
+
+    // generate the functions
+    dim = tdesc->dim;
+    if (tdesc->transpose && (tdesc->transpType != TRANSPOSE_GLOBAL)) {
+        flags = DBLOCK_COPY_TRANSPOSE;
+    }
+
+    if ((devType == CL_DEVICE_TYPE_CPU) &&
+        (tdesc->widthA % sizeof(cl_float4) || tdesc->srowA)) {
+        flags |= DBLOCK_COPY_NOT_VECTORIZE;
+    }
+
+    copyDataBlockGen(ctx, pdim, &tdesc->pgran, tdesc->type,
+            DBLOCK_GLOBAL_TO_LOCAL, flags);
+    kgenGetLastFuncName(rname, sizeof(rname), ctx);
+    kgenAddBlankLine(ctx);
+
+    // row length of the block in local memory, in elements
+    if (tdesc->transpose && (tdesc->transpType != TRANSPOSE_GLOBAL)) {
+        ld = fl4RowWidth(tdesc->dim.y, tsize) * FLOAT4_VECLEN / nfloats;
+    }
+    else {
+        ld = fl4RowWidth(tdesc->dim.x, tsize) * FLOAT4_VECLEN / nfloats;
+    }
+
+    if (tdesc->transpose) {
+        flags = (tdesc->transpType == TRANSPOSE_LOCAL) ? 0 :
+                                        DBLOCK_COPY_TRANSPOSE;
+        if (tdesc->transpType != TRANSPOSE_GLOBAL) {
+            // the block in local memory is already transposed
+            dim.x = tdesc->dim.y;
+            dim.y = tdesc->dim.x;
+        }
+    }
+    else {
+        flags = 0;
+    }
+
+    if ((devType == CL_DEVICE_TYPE_CPU) &&
+        (tdesc->widthA % sizeof(cl_float4) || tdesc->srowA)) {
+        flags |= DBLOCK_COPY_NOT_VECTORIZE;
+    }
+
+    copyDataBlockGen(ctx, pdim, &tdesc->pgran, tdesc->type,
+                     DBLOCK_LOCAL_TO_GLOBAL, flags);
+    kgenGetLastFuncName(wname, sizeof(wname), ctx);
+    kgenAddBlankLine(ctx);
+
+    if (testImages) {
+        if (tdesc->packedImages) {
+            flags |= DBLOCK_COPY_PACKED_IMAGE;
+        }
+        copyDataBlockGen(ctx, pdim, &tdesc->pgran, tdesc->type,
+                         DBLOCK_GLOBAL_TO_IMAGE, flags);
+        kgenGetLastFuncName(giname, sizeof(giname), ctx);
+        kgenAddBlankLine(ctx);
+
+        copyDataBlockGen(ctx, pdim, &tdesc->pgran, tdesc->type,
+                         DBLOCK_LOCAL_TO_IMAGE, flags);
+        kgenGetLastFuncName(liname, sizeof(liname), ctx);
+        kgenAddBlankLine(ctx);
+    }
+
+    // kernel declaration, with or without the image arguments
+    if (testImages) {
+        sprintf(tmp, rwBlockKernelImgDecl, s, s);
+    }
+    else {
+        sprintf(tmp, rwBlockKernelDecl, s, s);
+    }
+    kgenDeclareFunction(ctx, tmp);
+    kgenBeginFuncBody(ctx);
+
+    // the local buffer must fit the block in either orientation
+    size = fl4RowWidth(tdesc->dim.x, tsize) * tdesc->dim.y * FLOAT4_VECLEN;
+    if (size < fl4RowWidth(tdesc->dim.y, tsize) * tdesc->dim.x * FLOAT4_VECLEN) {
+        size = fl4RowWidth(tdesc->dim.y, tsize) * tdesc->dim.x * FLOAT4_VECLEN;
+    }
+
+    // declare and initialize local variables
+    sprintf(tmp, "__local float tmpBuf[%lu];\n"
+                 "LPtr tmp;\n"
+                 "GPtr src, dst;\n"
+                 "\n"
+                 "tmp.f = tmpBuf;\n"
+                 "src.%s = matrA;\n"
+                 "dst.%s = matrB;\n\n",
+            size, s1, s1);
+    kgenAddStmt(ctx, tmp);
+    // read block call
+    if (tdesc->generic) {
+        sprintf(tmp, "%s(tmp, src, srowA, scolA, %lu, %lu, %lu, lda);\n",
+                rname, tdesc->dim.y, tdesc->dim.x, ld);
+    }
+    else {
+        sprintf(tmp, "%s(tmp, src, srowA, scolA, lda);\n", rname);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    kgenAddStmt(ctx, "barrier(CLK_LOCAL_MEM_FENCE);\n");
+
+    // write block call
+    if (tdesc->generic) {
+        sprintf(tmp, "%s(dst, tmp, srowB, scolB, %lu, %lu, ldb, %lu);\n",
+                wname, dim.y, dim.x, ld);
+    }
+    else {
+        sprintf(tmp,  "%s(dst, tmp, srowB, scolB, ldb);\n", wname);
+    }
+    kgenAddStmt(ctx, tmp);
+
+    if (testImages) {
+        // global memory to image write function call
+        if (tdesc->generic) {
+            sprintf(tmp, "%s(image1, 0, 0, src, srowA, scolA, %lu, %lu, lda);\n",
+                    giname, dim.y, dim.x);
+        }
+        else {
+            sprintf(tmp,  "%s(image1, 0, 0, src, srowA, scolA, lda);\n", giname);
+        }
+        kgenAddStmt(ctx, tmp);
+
+        // local memory to image write function call
+        if (tdesc->generic) {
+            sprintf(tmp, "%s(image2, 0, 0, tmp, %lu, %lu, %lu);\n",
+                    liname, dim.y, dim.x, ld);
+        }
+        else {
+            sprintf(tmp,  "%s(image2, 0, 0, tmp);\n", liname);
+        }
+        kgenAddStmt(ctx, tmp);
+    }
+
+    ret = kgenEndFuncBody(ctx);
+
+    // now compile and launch the kernel
+    if (!ret) {
+        kdesc.kernel = createKernel("rwMatrBlockTest", srcBuf, clCtx,
+                                    devID, &program);
+        if (kdesc.kernel == NULL) {
+            ret = -1;
+        }
+    }
+
+    // kernel arguments, in the order of the kernel declaration
+    karg = kdesc.args;
+    initMemobjKarg(&karg[0], aobj, matrA, asize, MEMOBJ_WRITE);
+    INIT_KARG(&karg[1], tdesc->widthA);
+    initMemobjKarg(&karg[2], bobj, matrB, bsize, MEMOBJ_READ);
+    INIT_KARG(&karg[3], tdesc->widthB);
+    INIT_KARG(&karg[4], tdesc->srowA);
+    INIT_KARG(&karg[5], tdesc->scolA);
+    INIT_KARG(&karg[6], tdesc->srowB);
+    INIT_KARG(&karg[7], tdesc->scolB);
+    if (testImages) {
+        INIT_KARG(&karg[8], img1obj);
+        INIT_KARG(&karg[9], img2obj);
+    }
+
+    // a single work group does the whole job
+    kdesc.globalThreads[0] = tdesc->pgran.wgSize[0];
+    kdesc.localThreads[0] = tdesc->pgran.wgSize[0];
+    kdesc.workDim = 1;
+    kdesc.needExecTime = 1;
+    kdesc.event = &event;
+
+    if (!ret) {
+        status = launchClKernel(&kdesc, queue, &errInfo);
+        if (status != CL_SUCCESS) {
+            printf("Kernel launching failed: status = %d, phase = %d, "
+                   "wrong arg = %d\n", status, errInfo.phase, errInfo.wrongArg);
+            ret = -1;
+        }
+    }
+    // read both images back with blocking reads
+    if (testImages) {
+        if (!ret) {
+            ret = clEnqueueReadImage(queue, img1obj, CL_TRUE,
+                                     (size_t[3]){0, 0, 0},
+                                     (size_t[3]){imageWidth, imageHeight, 1},
+                                     0, 0, img1, 0, NULL, NULL);
+            if (ret) {
+                printf ("image read failed, code %d\n", ret);
+            }
+        }
+        if (!ret) {
+            ret = clEnqueueReadImage(queue, img2obj, CL_TRUE,
+                                     (size_t[3]){0, 0, 0},
+                                     (size_t[3]){imageWidth, imageHeight, 1},
+                                     0, 0, img2, 0, NULL, NULL);
+            if (ret) {
+                printf ("image read failed, code %d\n", ret);
+            }
+        }
+    }
+
+    // describe an image as a destination matrix starting at its origin
+    memcpy(&tdescImage, tdesc, sizeof(tdescImage));
+    // width in tdesc->types
+    tdescImage.widthB = (imageWidth * FLOAT4_VECLEN) / nfloats;
+    tdescImage.heightB = imageHeight;
+    tdescImage.scolB = 0;
+    tdescImage.srowB = 0;
+    // check the result
+    if (!ret) {
+        ret = compareMatrices(matrA, matrB, tdesc);
+        // check the data wasn't written outside the square
+        if (!ret) {
+            ret = checkMatrixBound(matrB, tdesc);
+        }
+    }
+    if (testImages) {
+        if (tdesc->packedImages) {
+            // compare matrix with packed image data
+            if (!ret) {
+                ret = compareImage(matrA, img1, &tdescImage);
+                if (!ret) {
+                    ret = checkImageBound(img1, &tdescImage);
+                }
+            }
+            if (!ret) {
+                ret = compareImage(matrA, img2, &tdescImage);
+                if (!ret) {
+                    ret = checkImageBound(img2, &tdescImage);
+                }
+            }
+        }
+        else {
+            if (!ret) {
+                ret = compareMatrices(matrA, img1, &tdescImage);
+                if (!ret) {
+                    ret = checkMatrixBound(img1, &tdescImage);
+                }
+            }
+            if (!ret) {
+                ret = compareMatrices(matrA, img2, &tdescImage);
+                if (!ret) {
+                    ret = checkMatrixBound(img2, &tdescImage);
+                }
+            }
+        }
+    }
+    releaseBufferObjs(aobj, bobj);
+    if (testImages) {
+        releaseBufferObjs(img1obj, img2obj);
+    }
+
+    if (kdesc.kernel) {
+        clReleaseKernel(kdesc.kernel);
+        clReleaseProgram(program);
+    }
+
+free_host:
+    // free(NULL) is a no-op, so this is safe after a partial allocation
+    free(matrA);
+    free(matrB);
+    free(img1);
+    free(img2);
+
+    return ret;
+}
+
+/*
+ * Translate the current getopt() argument (the global optarg) into a
+ * DataType value.
+ *
+ * On success the parsed type is stored in *dtype and 0 is returned. For an
+ * unrecognized name a diagnostic is printed, *dtype is left untouched and
+ * -1 is returned.
+ */
+static int
+parseDataType(DataType *dtype)
+{
+    int ret = 0;
+
+    if (!strcmp(optarg, "float")) {
+        *dtype = TYPE_FLOAT;
+    }
+    else if (!strcmp(optarg, "double")) {
+        *dtype = TYPE_DOUBLE;
+    }
+    else if (!strcmp(optarg, "complex_float")) {
+        *dtype = TYPE_COMPLEX_FLOAT;
+    }
+    else if (!strcmp(optarg, "complex_double")) {
+        *dtype = TYPE_COMPLEX_DOUBLE;
+    }
+    else {
+        printf("An unsupported data type is specified: %s\n", optarg);
+        ret = -1;
+    }
+
+    return ret;
+}
+
+/*
+ * Map the current getopt() argument (the global optarg) onto a
+ * TransposeType.
+ *
+ * Returns 0 and fills *ttype for "local", "global" or "both"; for anything
+ * else prints a diagnostic, leaves *ttype unchanged and returns -1.
+ */
+static int
+parseTransposeType(TransposeType *ttype)
+{
+    if (strcmp(optarg, "local") == 0) {
+        *ttype = TRANSPOSE_LOCAL;
+        return 0;
+    }
+    if (strcmp(optarg, "global") == 0) {
+        *ttype = TRANSPOSE_GLOBAL;
+        return 0;
+    }
+    if (strcmp(optarg, "both") == 0) {
+        *ttype = TRANSPOSE_BOTH;
+        return 0;
+    }
+
+    printf("An unsupported transpose type is specified: %s\n",
+           optarg);
+    return -1;
+}
+
+/*
+ * Run the fixed series of block read/write cases through fn.
+ *
+ * The series is executed once with a row length of 64 elements and, unless
+ * the data type is TYPE_COMPLEX_DOUBLE, a second time with a row length of
+ * 65 elements. Each case adjusts tdesc->dim and/or tdesc->pgran in place
+ * and relies on the values left behind by the previous case, so the order
+ * of the cases matters.
+ *
+ * Returns 0 if every case passed, otherwise the non-zero result of the
+ * first failing case; the remaining cases are then skipped.
+ */
+static int
+runTestCases(
+    struct KgenContext *ctx,
+    char *srcBuf,
+    TestDesc *tdesc,
+    cl_device_id devID,
+    cl_context clCtx,
+    cl_command_queue queue,
+    TestFn fn)
+{
+    int i, i1;
+    int ret = 0;
+    unsigned int nfloats;
+
+    // complex double gets only the first (64 element rows) pass
+    i1 = (tdesc->type == TYPE_COMPLEX_DOUBLE) ? 1 : 2;
+    // element size in cl_float units
+    // (assumes dtypeSize() returns a size in bytes - TODO confirm)
+    nfloats = dtypeSize(tdesc->type) / sizeof(cl_float);
+    tdesc->pgran.wgDim = 1;
+    tdesc->pgran.wgSize[1] = 1;
+    tdesc->pgran.wfSize = 64;
+
+    for (i = 0; i < i1; i++) {
+        // also resets dim.x, which the last case of the previous pass
+        // left at 1
+        if (!i) {
+            printf("Tests with float4 aligned rows:\n\n");
+            tdesc->dim.x = 64;
+        }
+        else {
+            printf("Tests with not float4 aligned rows:\n\n");
+            tdesc->dim.x = 65;
+        }
+
+        printf("Number of block rows is equal to the work group size\n");
+        tdesc->dim.y = 64 / nfloats;
+        tdesc->pgran.wgSize[0] = 64 / nfloats;
+        ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue);
+        if (ret) {
+            printf("FAIL\n\n");
+            break;
+        }
+        printf("PASS\n\n");
+
+        printf("Number of block rows is greater than the work group size, "
+               "the rows number is divided on the work group size\n");
+        // dim.y is kept from the previous case; only the group shrinks
+        tdesc->pgran.wgSize[0] = 32 / nfloats;
+        ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue);
+        if (ret) {
+            printf("FAIL\n\n");
+            break;
+        }
+        // restore the work group size used by all the following cases
+        tdesc->pgran.wgSize[0] = 64 / nfloats;
+        printf("PASS\n\n");
+
+        printf("Number of block rows is greater than the work group size, "
+               "the rows number is not divided on the work group size\n");
+        tdesc->dim.y = 99 / nfloats;
+        ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue);
+        if (ret) {
+            printf("FAIL\n\n");
+            break;
+        }
+        printf("PASS\n\n");
+
+        printf("Number of block rows is less than the work group size\n"
+               "The work group size is divided on the number of rows\n");
+        tdesc->dim.y = 32 / nfloats;
+        ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue);
+        if (ret) {
+            printf("FAIL\n\n");
+            break;
+        }
+        printf("PASS\n\n");
+
+        printf("Number of block rows is less than the work group size\n"
+               "The work group size is not divided on the number of rows\n");
+        // 17 / nfloats rounded up
+        tdesc->dim.y = (17 + nfloats - 1) / nfloats;
+        ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue);
+        if (ret) {
+            printf("FAIL\n\n");
+            break;
+        }
+        printf("PASS\n\n");
+
+        printf("Number of block rows is less than the work group size\n"
+               "The work group size is not divided on the number of rows\n"
+               "Each row consists of 1 elements\n");
+        // same dim.y as the previous case, but single element rows
+        tdesc->dim.x = 1;
+        ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue);
+        if (ret) {
+            printf("FAIL\n\n");
+            break;
+        }
+        printf("PASS\n\n");
+    }
+
+    return ret;
+}
+
+/*
+ * Entry point of the block read/write test program.
+ *
+ * Command line options (see the getopt() string below):
+ *   -c          run on a CPU device instead of a GPU
+ *   -t <type>   enable transposition: local, global or both
+ *   -d <type>   data type (required): float, double, complex_float or
+ *               complex_double
+ *   -n          use matrix widths that are not float4 aligned
+ *   -o          use non zero starting offsets
+ *   -g          set the "generic" flag of the test descriptor
+ *   -b          set the "packedImages" flag of the test descriptor
+ *
+ * Returns 0 when all test cases pass and 1 on a usage error, on an OpenCL
+ * or allocation failure, or when a test case fails.
+ */
+int
+main(int argc, char *argv[])
+{
+    struct KgenContext *ctx;
+    char *buf;
+    TestDesc tdesc;
+    cl_context clCtx = NULL;
+    cl_command_queue queue = NULL;
+    cl_device_id devID;
+    int devType = CL_DEVICE_TYPE_GPU;
+    cl_int status;
+    int err = 0;
+    int failed;
+    int opt;
+    TestFn func;
+    // test with non zero offset
+    bool off = false;
+    // test with non float4 aligned width
+    bool v4na = false;
+    char dataType[64] = "";
+    // s2..s5 and s7 are optional fragments of the banner sentence,
+    // s6 is the name of the device kind the test runs on
+    const char *s2 = "", *s3 = "", *s4 = "", *s5 = "", *s7 = "";
+    const char *s6 = "GPU";
+
+    memset(&tdesc, 0, sizeof(tdesc));
+    tdesc.transpose = false;
+    // marker for "no -d option seen", checked after parsing
+    tdesc.type = -1;
+
+    // parse command line arguments
+    while (!err) {
+        opt = getopt(argc, argv,  "ct:d:nogb");
+        if (opt == -1) {
+            break;
+        }
+        switch (opt) {
+        case 'c':
+            devType = CL_DEVICE_TYPE_CPU;
+            // the device name belongs in s6; s5 holds the generic note
+            s6 = "CPU";
+            break;
+        case 't':
+            tdesc.transpose = true;
+            err = parseTransposeType(&tdesc.transpType);
+            break;
+        case 'd':
+            err = parseDataType(&tdesc.type);
+            if (!err) {
+                snprintf(dataType, sizeof(dataType), "%s", optarg);
+            }
+            break;
+        case 'g':
+            tdesc.generic = true;
+            s5 = ", generic (slow) version";
+            break;
+        case 'n':
+            v4na = true;
+            break;
+        case 'o':
+            off = true;
+            break;
+        case 'b':
+            tdesc.packedImages = true;
+            s7 = ", several rows can be packed to one image row";
+            break;
+        default:
+            printf("Wrong option %c\n", opt);
+            err = 1;
+            break;
+        }
+    }
+
+    if ((signed)tdesc.type == -1) {
+        printf("Data type is not specified\n");
+        err = -1;
+    }
+
+    if (err) {
+        printf("%s", usage);
+        return 1;
+    }
+
+    status = get_cl_device(&devID, devType);
+    if (status) {
+        printf("Device opening failed, status = %d\n", status);
+        return 1;
+    }
+
+    // nothing below can work without a context and a queue, so bail out
+    // instead of running the tests with NULL handles
+    clCtx = clCreateContext((const cl_context_properties*)NULL, 1, &devID,
+                             NULL, NULL, &status);
+    if (clCtx == NULL) {
+        printf("Context creation failed, status = %d\n", status);
+        return 1;
+    }
+    queue = clCreateCommandQueue(clCtx, devID,
+                                 CL_QUEUE_PROFILING_ENABLE,
+                                 &status);
+    if (queue == NULL) {
+        printf("Command queue creation failed, status = %d\n", status);
+        clReleaseContext(clCtx);
+        return 1;
+    }
+
+    buf = malloc(SOURCE_BUFLEN);
+    if (buf == NULL) {
+        printf("Source buffer allocation failed\n");
+        clReleaseCommandQueue(queue);
+        clReleaseContext(clCtx);
+        return 1;
+    }
+    // keep the buffer printable even if nothing gets generated
+    buf[0] = '\0';
+    ctx = createKgenContext(buf, SOURCE_BUFLEN, true);
+    if (ctx == NULL) {
+        printf("Generator context creation failed\n");
+        free(buf);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(clCtx);
+        return 1;
+    }
+    func = testMatrBlockRW;
+
+    if (v4na) {
+        tdesc.widthA = 2055;
+        tdesc.widthB = 2777;
+        s2 = ", matrix rows are not aligned to float4 boundary";
+    }
+    else {
+        tdesc.widthA = 2048;
+        tdesc.widthB = 2560;
+        s2 = ", matrix rows are aligned to float4 boundary";
+    }
+
+    tdesc.heightA = 2048;
+    tdesc.heightB = 2048;
+
+    if (off) {
+        s3 = ", starting offsets are not zero";
+        tdesc.srowA = 17;
+        tdesc.scolA = 27;
+        tdesc.srowB = 55;
+        tdesc.scolB = 86;
+    }
+    else {
+        s3 = ", starting offsets are zero";
+    }
+
+    if (tdesc.transpose) {
+        switch (tdesc.transpType) {
+        case TRANSPOSE_LOCAL:
+            s4 = ", transpose at reading";
+            break;
+        case TRANSPOSE_GLOBAL:
+            s4 = ", transpose at writing back";
+            break;
+        case TRANSPOSE_BOTH:
+            s4 = ", transpose at both reading and writing back";
+            break;
+        }
+    }
+
+    printf("Test read/write block function with %s data type%s%s%s%s%s.\n"
+           "Run the test on %s...\n\n",
+           dataType, s2, s3, s4, s5, s7, s6);
+    failed = runTestCases(ctx, buf, &tdesc, devID, clCtx, queue, func);
+    if (failed) {
+        // dump the generated kernel source to help diagnosing the failure
+        printf("Source: \n%s\n", buf);
+    }
+
+    // release the generator context and its source buffer
+    destroyKgenContext(ctx);
+    free(buf);
+
+    // release OpenCL objects
+    clReleaseCommandQueue(queue);
+    clReleaseContext(clCtx);
+
+    return failed ? 1 : 0;
+}
+
diff --git a/src/library/common/tests/t_gens_cache.c b/src/library/common/tests/t_gens_cache.c
new file mode 100644
index 0000000..177a25b
--- /dev/null
+++ b/src/library/common/tests/t_gens_cache.c
@@ -0,0 +1,381 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * test generator and cache infrastructure
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <CL/cl.h>
+
+#include <kerngen.h>
+#include <kern_cache.h>
+
+/* Sizing constants of the kernel cache test (see testCache()). */
+enum {
+    /* number of solver patterns; kernels of pattern i are cached with ID i */
+    NR_TEST_PATTERNS = 5,
+    /* number of kernels inserted for every pattern */
+    KERNELS_PER_PATTERN = 10,
+    /* second argument passed to createKernelCache() */
+    KCACHE_SIZE_LIMIT = 1048576
+};
+
+/*
+ * Fixed source text added with kgenAddStmt() in testGen(); the test only
+ * compares the size reported by kgenSourceSize() against its length.
+ */
+const char *strcpyImpl =
+    "char\n"
+    "*strcpy(char *dst, char *src)\n"
+    "{\n"
+    "   do {\n"
+    "       *dst++ = *src++;\n"
+    "   } while (*(dst - 1) != 0);\n"
+    "}";
+
+/*
+ * Emit a small strcpy() implementation into ctx through the kgen API.
+ *
+ * Only the status of the closing kgenEndFuncBody() call is reported; the
+ * results of the intermediate kgen calls are not examined.
+ */
+static int
+testGenFunc(struct KgenContext *ctx)
+{
+    const char *prototype = "char\n*strcpy(char *dst, char *src)\n";
+    const char *copyStep = "*dst = *src;\nsrc++;\ndst++;\n";
+    int status;
+
+    kgenDeclareFunction(ctx, prototype);
+    kgenBeginFuncBody(ctx);
+
+    kgenAddStmt(ctx, "char *ret = dst;\n\n");
+
+    // do { ... } while (...) loop copying up to and including the terminator
+    kgenBeginBranch(ctx, "do");
+    kgenAddStmt(ctx, copyStep);
+    kgenEndBranch(ctx, "while (*(dst - 1) != 0)");
+
+    kgenAddBlankLine(ctx);
+    kgenAddStmt(ctx, "return ret;\n");
+
+    status = kgenEndFuncBody(ctx);
+    return status;
+}
+
+/*
+ * Comparison callback for the kernel "extra" data used by this test: both
+ * arguments point to an unsigned long. Returns 0 when the two values are
+ * equal and 1 otherwise.
+ */
+static int
+kernExtraCmp(const void *extra, const void *extraKey)
+{
+    const unsigned long *stored = extra;
+    const unsigned long *wanted = extraKey;
+
+    return (*stored != *wanted);
+}
+
+
+/*
+ * Exercise the kernel generator core:
+ *   1. generation of a function into a sufficiently large buffer;
+ *   2. extraction of the name of the last generated function;
+ *   3. source size calculation without a backing buffer;
+ *   4. generation into a too small buffer, which is expected to fail
+ *      (run only if step 3 passed).
+ *
+ * Returns 0 if all executed steps passed and -1 if any of them failed or
+ * a generator context could not be created.
+ */
+static int
+testGen(void)
+{
+    char buf[4096];
+    char name[64];
+    int r;
+    // accumulated verdict; r alone is overwritten by every step
+    int ret = 0;
+    struct KgenContext *ctx;
+    size_t s;
+
+    ctx = createKgenContext(buf, sizeof(buf), true);
+    if (ctx == NULL) {
+        printf("Context creation failed\n");
+        printf("FAIL\n\n");
+        return -1;
+    }
+
+    printf("Test normal kernel generation\n");
+    if (!testGenFunc(ctx)) {
+        printf("Generated code:\n\n");
+        printf("%s", buf);
+        printf("\n\nPASS\n\n");
+    }
+    else {
+        printf("FAIL\n\n");
+        ret = -1;
+    }
+
+    printf("Test function name extracting from the generated code\n");
+    r = kgenGetLastFuncName(name, sizeof(name), ctx);
+    if (r) {
+        printf("FAIL\n");
+        ret = -1;
+    }
+    else {
+        if (strcmp((const char*)name, "strcpy")) {
+            printf("Extracted names is %s must be strcpy\n", name);
+            printf("FAIL\n\n");
+            ret = -1;
+        }
+        else {
+            printf("PASS\n\n");
+        }
+    }
+
+    destroyKgenContext(ctx);
+
+    printf("Test source size calculating without actual source "
+           "adding to any buffer\n");
+    ctx = createKgenContext(NULL, 0, true);
+    if (ctx == NULL) {
+        printf("Context creation failed\n");
+        printf("FAIL\n\n");
+        return -1;
+    }
+    r = kgenAddStmt(ctx, strcpyImpl);
+    if (!r) {
+        s = kgenSourceSize(ctx);
+        if (s != strlen(strcpyImpl)) {
+            r = -1;
+        }
+    }
+    if (r) {
+        printf("FAIL\n\n");
+        ret = -1;
+    }
+    else {
+        printf("PASS\n\n");
+    }
+    destroyKgenContext(ctx);
+
+    if (!r) {
+        printf("Test generation with insufficient buffer\n");
+        // 5 bytes cannot hold the generated function
+        ctx = createKgenContext(buf, 5, true);
+        if (ctx == NULL) {
+            printf("Context creation failed\n");
+            printf("FAIL\n");
+            return -1;
+        }
+        // the generation is expected to report an error here
+        if (testGenFunc(ctx)) {
+            printf("PASS\n");
+        }
+        else {
+            printf("FAIL\n");
+            ret = -1;
+        }
+        destroyKgenContext(ctx);
+    }
+
+    return ret;
+}
+
+/*
+ * Single negative test case for the kernel cache.
+ *
+ * Builds a key from device, context, nrDims and at most MAX_SUBDIMS
+ * entries of dims, prints msg and then:
+ *   - if kern is NULL, looks the key up and expects nothing to be found;
+ *   - otherwise tries to insert kern and expects the insertion to be
+ *     rejected (non zero result).
+ *
+ * Prints PASS and returns 0 when the cache behaved as expected, prints
+ * FAIL and returns -1 otherwise.
+ */
+static int
+errorCacheTestCase(
+    const char *msg,
+    struct KernelCache *kcache,
+    solver_id_t sid,
+    SubproblemDim *dims,
+    unsigned int nrDims,
+    cl_context context,
+    cl_device_id device,
+    unsigned long extra,
+    Kernel *kern)
+{
+    KernelKey key;
+    bool fail;
+    // never copy more dimensions than the key can hold
+    int ncopy = (nrDims > MAX_SUBDIMS) ? MAX_SUBDIMS : nrDims;
+
+    key.device = device;
+    key.context = context;
+    key.nrDims = nrDims;
+    memset(key.subdims, 0, sizeof(key.subdims));
+    memcpy(key.subdims, dims, sizeof(SubproblemDim) * ncopy);
+
+    printf("%s", msg);
+    if (kern != NULL) {
+        int added = addKernelToCache(kcache, sid, kern, &key, kernExtraCmp);
+
+        fail = (added == 0);
+    }
+    else {
+        Kernel *found = findKernel(kcache, sid, &key, &extra);
+
+        fail = (found != NULL);
+    }
+
+    if (!fail) {
+        printf("PASS\n");
+        return 0;
+    }
+
+    printf("FAIL\n");
+    return -1;
+}
+
+/*
+ * Exercise the kernel cache.
+ *
+ * First NR_TEST_PATTERNS * KERNELS_PER_PATTERN kernels with random
+ * subproblem dimensions are inserted (pattern index used as solver ID)
+ * and each of them is looked up again. Then a series of negative cases is
+ * run through errorCacheTestCase(): lookups and insertions that the cache
+ * is expected to reject.
+ *
+ * Returns 0 if everything passed, non zero at the first failure.
+ *
+ * NOTE(review): neither the cache nor the kernels are released here; the
+ * corresponding release functions are not visible in this file.
+ */
+static int
+testCache(cl_context context, cl_device_id device)
+{
+    int r = 0;
+    int i, j;
+    // position of the first failure; the loop counters themselves are
+    // advanced once more before the loops terminate
+    int badI = -1, badJ = -1;
+    unsigned int k;
+    const solver_id_t wrongSID = 15;
+    struct KernelCache *kcache;
+    KernelKey key;
+    Kernel *kern[NR_TEST_PATTERNS][KERNELS_PER_PATTERN], *krn1 = NULL;
+    SubproblemDim dims[NR_TEST_PATTERNS][KERNELS_PER_PATTERN][MAX_SUBDIMS];
+    unsigned int nrDims[NR_TEST_PATTERNS] = {1, 3, 2, 2, 1};
+    unsigned long extra = 7, extra1;
+
+    // only the first nrDims[i] entries are filled below, but the negative
+    // cases copy up to MAX_SUBDIMS of them
+    memset(dims, 0, sizeof(dims));
+
+    printf("Testing inserting and normal searching of kernels\n");
+    kcache = createKernelCache(10, KCACHE_SIZE_LIMIT);
+    if (kcache == NULL) {
+        printf("Kernel cache creation failed\n");
+        printf("FAIL\n");
+        return -1;
+    }
+
+    key.device = device;
+    key.context = context;
+
+    for (i = 0; (i < NR_TEST_PATTERNS) && !r; i++) {
+        for (j = 0; (j < KERNELS_PER_PATTERN) && !r; j++) {
+            for (k = 0; k < nrDims[i]; k++) {
+                dims[i][j][k].x = random() % 1000;
+                if (k == 2) {
+                    dims[i][j][k].y = SUBDIM_UNUSED;
+                    dims[i][j][k].itemX = SUBDIM_UNUSED;
+                }
+                else {
+                    dims[i][j][k].y = random() % 1000;
+                    dims[i][j][k].itemX = random() % 1000;
+                }
+                dims[i][j][k].bwidth = random() % 1000;
+                dims[i][j][k].itemY = random() % 1000;
+            }
+
+            kern[i][j] = allocKernel();
+            if (kern[i][j] == NULL) {
+                r = -1;
+            }
+            else {
+                kern[i][j]->extra = &extra;
+                kern[i][j]->extraSize = sizeof(extra);
+                key.nrDims = nrDims[i];
+                memset(key.subdims, 0, sizeof(key.subdims));
+                memcpy(key.subdims, dims[i][j],
+                       sizeof(SubproblemDim) * key.nrDims);
+                r = addKernelToCache(kcache, i, kern[i][j], &key,
+                                     kernExtraCmp);
+            }
+            if (r) {
+                badI = i;
+                badJ = j;
+            }
+        }
+    }
+
+    if (r) {
+        printf("Error at addition to the cache, i = %d, j = %d\n",
+               badI, badJ);
+        printf("FAIL\n");
+    }
+    else {
+        // Now try to find each cached kernel
+        extra1 = extra;
+        for (i = 0; (i < NR_TEST_PATTERNS) && !r; i++) {
+            for (j = 0; j < KERNELS_PER_PATTERN; j++) {
+                key.nrDims = nrDims[i];
+                memset(key.subdims, 0, sizeof(key.subdims));
+                memcpy(key.subdims, dims[i][j],
+                       sizeof(SubproblemDim) * key.nrDims);
+                krn1 = findKernel(kcache, i, &key, &extra1);
+                if (krn1 != kern[i][j]) {
+                    badI = i;
+                    badJ = j;
+                    r = -1;
+                    break;
+                }
+            }
+        }
+        if (r) {
+            printf("First error occurred at pattern %d, kernel %d: ",
+                   badI, badJ);
+            if (krn1 == NULL) {
+                printf("the kernel is not found\n");
+            }
+            else {
+                printf("the kernel mismatch\n");
+            }
+        }
+        else {
+            printf("PASS\n");
+        }
+    }
+
+    // cases for search error functionality
+    // 1001 is outside the 0..999 range used for the cached kernels
+    dims[0][0][0].x = 1001;
+
+    if (!r) {
+        r = errorCacheTestCase("Try to search a kernel not being in "
+                               "the cache\n",
+                               kcache, 0, dims[0][0],
+                               nrDims[0], context, device, extra, NULL);
+    }
+
+    if (!r) {
+        r = errorCacheTestCase("Try To search a kernel with a wrong extra "
+                               "information\n", kcache, 0, dims[0][1],
+                               nrDims[0], context, device, extra - 2, NULL);
+    }
+
+    if (!r) {
+        r = errorCacheTestCase("Try to search a kernel with a wrong solver "
+                               "ID\n", kcache, wrongSID,
+                               dims[0][1], nrDims[0], context, device,
+                               extra, NULL);
+    }
+
+    if (!r) {
+        r = errorCacheTestCase("Try to search a kernel with a wrong number "
+                               "of subproblem dimensions\n",
+                               kcache, 0, dims[0][1], 500, context, device,
+                               extra, NULL);
+    }
+    // NOTE(review): the two cases below also pass 500 dimensions, so the
+    // lookup may already be rejected because of that rather than because
+    // of the bad context/device - confirm whether nrDims[0] was intended
+    if (!r) {
+        r = errorCacheTestCase("Try to search a kernel with bad OpenCL context\n",
+                               kcache, 0, dims[0][1], 500, (cl_context)-1, device,
+                               extra, NULL);
+    }
+    if (!r) {
+        r = errorCacheTestCase("Try to search a kernel with bad OpenCL device\n",
+                               kcache, 0, dims[0][1], 500, context,
+                               (cl_device_id)-1, extra, NULL);
+    }
+
+    // error test cases for inserting to cache
+    if (!r) {
+        krn1 = allocKernel();
+        if (krn1 == NULL) {
+            printf("Kernel allocation failed\n");
+            printf("FAIL\n");
+            r = -1;
+        }
+        else {
+            krn1->extra = &extra;
+            krn1->extraSize = sizeof(extra);
+        }
+    }
+
+    if (!r) {
+        r = errorCacheTestCase("Try to insert a kernel with a wrong solver "
+                               "ID\n", kcache, wrongSID,
+                               dims[0][0], nrDims[0], context, device,
+                               extra, krn1);
+    }
+
+    if (!r) {
+        r = errorCacheTestCase("Try to insert a kernel with a wrong number "
+                               "of subproblem dimensions\n",
+                               kcache, 0, dims[0][0],
+                               500, context, device, extra, krn1);
+    }
+
+    return r;
+}
+
+
+/*
+ * Entry point of the generator/cache test program.
+ *
+ * Creates an OpenCL context on the first GPU device of the first
+ * platform, runs the generator tests and, if they pass, the kernel cache
+ * tests.
+ *
+ * Returns 0 if all tests passed and 1 on an OpenCL setup error or a test
+ * failure.
+ */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform;
+    cl_device_id device;
+    cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context context;
+    // stays set unless both test groups complete successfully
+    int failed = 1;
+
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "clGetPlatformIDs() failed with %d\n", err);
+        return 1;
+    }
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "clGetDeviceIDs() failed with %d\n", err);
+        return 1;
+    }
+    props[1] = (cl_context_properties)platform;
+    context = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "clCreateContext() failed with %d\n", err);
+        return 1;
+    }
+
+    printf("Launch tests for kernel generators\n");
+    printf("-----------------------------------------\n");
+    if (!testGen()) {
+        printf("-----------------------------------------\n\n");
+        printf("Launch tests for kernel cache\n");
+        printf("-----------------------------------------\n");
+        failed = (testCache(context, device) != 0);
+    }
+
+    clReleaseContext(context);
+    return failed ? 1 : 0;
+}
+
diff --git a/src/library/common/trace_malloc.c b/src/library/common/trace_malloc.c
new file mode 100644
index 0000000..b99b7e0
--- /dev/null
+++ b/src/library/common/trace_malloc.c
@@ -0,0 +1,278 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <list.h>
+#include <assert.h>
+
+#include <trace_malloc.h>
+#include <mutex.h>
+
+#if defined(TRACE_MALLOC)
+
+#if _MSC_VER
+#include <msvc.h>
+#endif
+
+// use standard malloc/free though
+#undef malloc
+#undef calloc
+#undef realloc
+#undef free
+
+/*
+ * Marker stored in every MtraceNode by debugMalloc() and verified by the
+ * asserts in debugRealloc() and debugFree().
+ */
+enum {
+    MTRACE_NODE_MAGIC = 0x5A20286D
+};
+
+/* serialize access to the trace list and the size counters */
+#define MTRACE_LOCK()     mutexLock(mutex)
+#define MTRACE_UNLOCK()   mutexUnlock(mutex)
+/* unit sizes in bytes; parenthesized so they are safe in any expression */
+#define KIB 1024
+#define MIB (KIB * 1024)
+
+/* Bookkeeping record kept for every block handed out by debugMalloc(). */
+typedef struct MtraceNode {
+    unsigned long magic;    /* MTRACE_NODE_MAGIC, checked by asserts */
+    char *file;             /* strdup()'ed name of the allocating file */
+    int line;               /* line number of the allocation */
+    void *ptr;              /* the block returned to the caller */
+    size_t size;            /* requested size of the block in bytes */
+    ListNode node;          /* linkage in traceList */
+} MtraceNode;
+
+/* lock taken by MTRACE_LOCK()/MTRACE_UNLOCK(); created in initMallocTrace() */
+static mutex_t *mutex;
+/* sum of the requested sizes of all currently traced blocks */
+static size_t tracedSize;
+/* tracedSize plus the bookkeeping overhead, see rawTracedSize() */
+static size_t rawSize;
+/* list of MtraceNode records of all currently traced blocks */
+ListHead traceList;
+
+/*
+ * listNodeSearch() callback: key is a traced block address.
+ * Returns 0 when the record behind node describes that block, 1 otherwise.
+ */
+static
+int cmpNode(const ListNode *node, const void *key)
+{
+    const MtraceNode *entry = container_of(node, node, MtraceNode);
+
+    if (entry->ptr == key) {
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Total number of bytes consumed by one traced allocation: the block
+ * itself, its MtraceNode record and the copied file name with its
+ * terminator.
+ */
+static __inline size_t
+rawTracedSize(MtraceNode *mtnode)
+{
+    size_t nameBytes = strlen(mtnode->file) + 1;
+
+    return sizeof(MtraceNode) + nameBytes + mtnode->size;
+}
+
+/*
+ * Find the record describing the traced block ptr.
+ * The list is searched under the trace lock; the lock is released before
+ * returning. Returns NULL if ptr is not traced.
+ */
+static MtraceNode
+*searchMtraceNode(void *ptr)
+{
+    ListNode *found;
+
+    MTRACE_LOCK();
+    found = listNodeSearch(&traceList, ptr, cmpNode);
+    MTRACE_UNLOCK();
+
+    if (found == NULL) {
+        return NULL;
+    }
+    return container_of(found, node, MtraceNode);
+}
+
+/*
+ * Release a trace record together with the file name copy and the traced
+ * block it owns. Unset (NULL) members are fine: free(NULL) does nothing.
+ */
+static void
+freeNode(ListNode *node)
+{
+    MtraceNode *entry = container_of(node, node, MtraceNode);
+
+    free(entry->file);
+    free(entry->ptr);
+    free(entry);
+}
+
+/*
+ * Format size in a human readable form into str: in bytes below 10 KiB,
+ * in KiB below 10 MiB and in MiB otherwise (the value is truncated, not
+ * rounded).
+ *
+ * str must be large enough for the result; the callers in this file pass
+ * 1024 byte buffers.
+ */
+static void
+sprintfTracedSize(char *str, size_t size)
+{
+    const char *suffix;
+
+    if (size < KIB * 10) {
+        suffix = "bytes";
+    }
+    else if (size < MIB * 10) {
+        suffix = "KiB";
+        size /= KIB;
+    }
+    else {
+        suffix = "MIB";
+        size /= MIB;
+    }
+
+    // size_t is not necessarily unsigned long (e.g. 64-bit Windows), so
+    // convert explicitly to the type "%lu" expects
+    sprintf(str, "%lu %s", (unsigned long)size, suffix);
+}
+
+/*
+ * listDoForEach() callback: print the size and the allocation site (file
+ * and line) of one traced block.
+ */
+static void
+printNodeInfo(ListNode *node)
+{
+    char sizeText[1024];
+    MtraceNode *entry = container_of(node, node, MtraceNode);
+
+    sprintfTracedSize(sizeText, entry->size);
+    printf("%s at %s line %d\n", sizeText, entry->file, entry->line);
+}
+
+/*
+ * Prepare the allocation tracer: empty trace list, zeroed counters and a
+ * freshly created lock.
+ */
+void
+initMallocTrace(void)
+{
+    rawSize = 0;
+    tracedSize = 0;
+    listInitHead(&traceList);
+    mutex = mutexInit();
+}
+
+/*
+ * Traced replacement for malloc().
+ *
+ * Allocates size bytes and records the block together with the
+ * allocation site (file, line) in the trace list. Returns the block, or
+ * NULL if the block, the record or the file name copy could not be
+ * allocated; nothing is recorded in that case.
+ */
+void
+*debugMalloc(size_t size, const char *file, int line)
+{
+    void *block = NULL;
+    MtraceNode *entry = calloc(1, sizeof(MtraceNode));
+
+    if (entry == NULL) {
+        return NULL;
+    }
+
+    entry->magic = MTRACE_NODE_MAGIC;
+    entry->file = strdup(file);
+    if (entry->file != NULL) {
+        block = malloc(size);
+        entry->ptr = block;
+    }
+
+    if (block == NULL) {
+        // releases whatever has been allocated so far
+        freeNode(&entry->node);
+        return NULL;
+    }
+
+    entry->line = line;
+    entry->size = size;
+
+    MTRACE_LOCK();
+    tracedSize += size;
+    rawSize += rawTracedSize(entry);
+    listAddToTail(&traceList, &entry->node);
+    MTRACE_UNLOCK();
+
+    return block;
+}
+
+/*
+ * Traced allocation of a zero filled block of size bytes.
+ * Returns NULL if debugMalloc() fails.
+ */
+void
+*debugCalloc(size_t size, const char *file, int line)
+{
+    void *block = debugMalloc(size, file, line);
+
+    if (block == NULL) {
+        return NULL;
+    }
+
+    memset(block, 0, size);
+    return block;
+}
+
+/*
+ * Traced replacement for realloc().
+ *
+ *   - ptr == NULL: behaves like debugMalloc(size, file, line).
+ *   - size == 0:   the block is released through debugFree() and NULL is
+ *                  returned. It is not handed to realloc(), which may
+ *                  free it itself and would lead to a double free.
+ *   - otherwise:   the block is resized and its trace record updated; the
+ *                  recorded allocation site is left unchanged.
+ *
+ * As with realloc(), when resizing fails NULL is returned and the
+ * original block stays valid and traced; the caller still owns it.
+ * ptr must have been obtained from the traced allocation functions.
+ */
+void
+*debugRealloc(void *ptr, size_t size, const char *file, int line)
+{
+    MtraceNode *mtnode;
+    void *ret;
+
+    if (ptr == NULL) {
+        return debugMalloc(size, file, line);
+    }
+    if (size == 0) {
+        debugFree(ptr);
+        return NULL;
+    }
+
+    mtnode = searchMtraceNode(ptr);
+    assert((mtnode != NULL) && (mtnode->magic == MTRACE_NODE_MAGIC));
+
+    ret = realloc(ptr, size);
+    if (ret != NULL) {
+        // update the record under the lock: other threads compare
+        // mtnode->ptr while searching the list
+        MTRACE_LOCK();
+        // unsigned arithmetic, so subtract the old size before adding the
+        // new one; the counters always include the old size
+        tracedSize = tracedSize - mtnode->size + size;
+        rawSize = rawSize - mtnode->size + size;
+        mtnode->ptr = ret;
+        mtnode->size = size;
+        MTRACE_UNLOCK();
+    }
+
+    return ret;
+}
+
+/*
+ * Traced replacement for free().
+ *
+ * Removes the record of ptr from the trace list, adjusts the counters and
+ * releases the block together with its record. A NULL ptr is ignored;
+ * any other pointer must be a currently traced block (asserted).
+ */
+void
+debugFree(void *ptr)
+{
+    MtraceNode *entry;
+
+    if (ptr != NULL) {
+        entry = searchMtraceNode(ptr);
+        assert((entry != NULL) && (entry->magic == MTRACE_NODE_MAGIC));
+
+        MTRACE_LOCK();
+        listDel(&entry->node);
+        rawSize -= rawTracedSize(entry);
+        tracedSize -= entry->size;
+        MTRACE_UNLOCK();
+
+        freeNode(&entry->node);
+    }
+}
+
+/* Print the total size of the currently traced allocations. */
+void
+printMallocStatistics(void)
+{
+    char total[1024];
+
+    sprintfTracedSize(total, tracedSize);
+    printf("[MALLOC TRACE] Totally %s is allocated\n", total);
+}
+
+/*
+ * Print a leak report: either a note that nothing is left allocated, or
+ * the lost and raw sizes followed by one line per still traced block.
+ */
+void
+printMemLeaksInfo(void)
+{
+    char lost[1024], raw[1024];
+
+    puts("\n");
+    if (tracedSize == 0) {
+        puts("[MALLOC TRACE] Hurray! There are not memory leaks!");
+        return;
+    }
+
+    sprintfTracedSize(lost, tracedSize);
+    sprintfTracedSize(raw, rawSize);
+    printf("[MALLOC TRACE] Totally %s is lost, raw traced size is %s\n",
+           lost, raw);
+    puts("Detailed report:\n"
+         "------------------------------------------------------------");
+    // a non zero traced size implies at least one record in the list
+    assert(!isListEmpty(&traceList));
+    listDoForEach(&traceList, printNodeInfo);
+}
+
+/*
+ * Shut the tracer down: every record still in the list is passed to
+ * freeNode(), which also frees the traced block, then the lock is
+ * destroyed.
+ */
+void
+releaseMallocTrace(void)
+{
+    listDoForEachSafe(&traceList, freeNode);
+
+    mutexDestroy(mutex);
+}
+
+#endif       /* TRACE_MALLOC */
+
diff --git a/src/library/tools/ktest/CMakeLists.txt b/src/library/tools/ktest/CMakeLists.txt
new file mode 100644
index 0000000..34828f0
--- /dev/null
+++ b/src/library/tools/ktest/CMakeLists.txt
@@ -0,0 +1,158 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+
+# Sources located in this directory (paths are relative to it)
+set(KTEST_SRC
+    config.cpp
+    config-cmdline.cpp
+    ktest.cpp
+    main.cpp
+    step.cpp
+    step-dump.cpp
+    var.cpp
+    steps/gemv.cpp
+    steps/symv.cpp
+    steps/gemm.cpp
+    steps/syrk.cpp
+    steps/syr2k.cpp
+    steps/trmm.cpp
+    steps/trsm.cpp
+)
+
+# Sources from other parts of the tree (common, blas and the tune tool)
+# that are compiled directly into the make-ktest executable
+set(KTEST_EXTERNAL_SRC
+    ../../common/kgen_basic.c
+    ../../common/clkern.c
+    ../../common/devinfo.c
+    ../../common/kern_cache.c
+    ../../common/kerngen_core.c
+    ../../common/kgen_guard.c
+    ../../common/kgen_loop_helper.c
+    ../../common/list.c
+    ../../common/misc.c
+    ../../common/mutex.c
+    ../../common/trace_malloc.c
+    ../../common/gens/dblock_kgen.c
+    ../../blas/impl.c
+    ../../blas/scimage.c
+    ../../blas/generic/blas_funcs.c
+    ../../blas/generic/common.c
+    ../../blas/generic/events.c
+    ../../blas/generic/kernel_extra.c
+    ../../blas/generic/matrix_dims.c
+    ../../blas/generic/matrix_props.c
+    ../../blas/generic/problem_iter.c
+    ../../blas/generic/solution_seq_make.c
+    ../../blas/generic/solution_seq.c
+    ../../blas/generic/kdump.c
+    ../../blas/gens/tile.c
+    ../../blas/gens/tile_iter.c
+    ../../blas/gens/blas_subgroup.c
+    ../../blas/gens/decomposition.c
+    ../../blas/gens/blas_kgen.c
+    ../../blas/gens/fetch.c
+    ../../blas/gens/gemm.c
+    ../../blas/gens/gemv.c
+    ../../blas/gens/gen_helper.c
+    ../../blas/gens/gen_init.c
+    ../../blas/gens/symv.c
+    ../../blas/gens/syrxk.c
+    ../../blas/gens/tilemul.c
+    ../../blas/gens/trmm.c
+    ../../blas/gens/trsm.c
+    ../../blas/gens/trsm_kgen.c
+    ../../blas/gens/trxm_common.c
+    ../../blas/gens/xxmv_common.c
+    ../../blas/gens/legacy/blkmul.c
+    ../../blas/gens/legacy/gemm_img.c
+    ../../blas/gens/legacy/gemm_lds.c
+    ../../blas/gens/legacy/trmm_img.c
+    ../../blas/gens/legacy/trmm_lds.c
+    ../../blas/gens/legacy/trsm_img.c
+    ../../blas/gens/legacy/trsm_lds.c
+    ../../blas/gens/legacy/trsm_cached_lds.c
+    ../../blas/gens/legacy/blas_kgen_legacy.c
+    ../../blas/gens/legacy/gen_helper_legacy.c
+    ../../blas/gens/legacy/trxm_common_legacy.c
+    ../../blas/gens/legacy/trsm_kgen_legacy.c
+    ../../blas/init.c
+    ../tune/fileio.c
+    ../tune/toolslib.c
+    ../tune/storage_init.c
+    ../tune/storage_data.c
+    ../tune/storage_io.c
+    ../tune/dimension.c
+    
+	../../blas/gens/trmv_reg.cpp
+	../../blas/gens/ger_lds.cpp
+	../../blas/gens/trsv_trtri.cpp
+	../../blas/gens/trsv_gemv.cpp
+	../../blas/gens/kprintf.cpp
+	../../blas/gens/syr_lds.cpp
+	../../blas/gens/symm_cached.cpp
+	../../blas/gens/gemm_cached.cpp
+	../../blas/gens/gemm_tail_cached.cpp
+	../../blas/gens/syr2_lds.cpp
+	../../blas/gens/her_lds.cpp
+	../../blas/gens/her2_lds.cpp
+	../../blas/gens/gbmv.cpp
+	../../blas/gens/tuned_numbers.c
+	../../blas/gens/swap_reg.cpp
+    ../../blas/gens/scal_reg.cpp
+    ../../blas/gens/copy_reg.cpp
+    ../../blas/gens/axpy_reg.cpp
+    ../../blas/gens/dot.cpp
+    ../../blas/gens/reduction.cpp
+    ../../blas/gens/rotg_reg.cpp
+    ../../blas/gens/rotmg_reg.cpp
+    ../../blas/gens/rotm_reg.cpp
+    ../../blas/gens/iamax.cpp
+    ../../blas/gens/nrm2.cpp
+    ../../blas/gens/asum.cpp
+)
+
+# Header search paths: OpenCL, Boost, the clBLAS source tree and the
+# include directory of the build tree
+include_directories(
+    ${OPENCL_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+    ${clBLAS_SOURCE_DIR}
+    ${clBLAS_SOURCE_DIR}/include
+    ${clBLAS_SOURCE_DIR}/library/blas/include
+    ${clBLAS_SOURCE_DIR}/library/blas/gens
+    ${clBLAS_SOURCE_DIR}/library/tools/tune
+    ${PROJECT_BINARY_DIR}/include
+)
+
+# Set up the Visual Studio source group for the local sources
+source_group(\\ FILES ${KTEST_SRC})
+
+# The make-ktest executable is built from the local and the external
+# sources; it depends on the GENERATE_CLT target
+add_executable(make-ktest ${KTEST_SRC} ${KTEST_EXTERNAL_SRC})
+add_dependencies(make-ktest GENERATE_CLT)
+target_link_libraries(make-ktest ${OPENCL_LIBRARIES} ${Boost_LIBRARIES} ${MATH_LIBRARY})
+
+# Install into the 64-bit directories when TARGET_PLATFORM is 64 and into
+# the 32-bit ones for any other value
+if( TARGET_PLATFORM EQUAL 64 )
+	# CPack configuration; include the executable into the package
+	install( TARGETS make-ktest
+			RUNTIME DESTINATION bin64
+			LIBRARY DESTINATION lib64
+			ARCHIVE DESTINATION lib64/import
+			)
+else()
+	# CPack configuration; include the executable into the package
+	install( TARGETS make-ktest
+			RUNTIME DESTINATION bin32
+			LIBRARY DESTINATION lib32
+			ARCHIVE DESTINATION lib32/import
+			)
+endif()
diff --git a/src/library/tools/ktest/config-cmdline.cpp b/src/library/tools/ktest/config-cmdline.cpp
new file mode 100644
index 0000000..a8f18ec
--- /dev/null
+++ b/src/library/tools/ktest/config-cmdline.cpp
@@ -0,0 +1,690 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <boost/tokenizer.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "config.h"
+
+using namespace clMath;
+namespace po = boost::program_options;
+
+/*
+ * Check that every mandatory setting has been provided.
+ *
+ * Currently the only mandatory setting is the BLAS function; when it is
+ * missing, a diagnostic is written to stderr and false is returned.
+ */
+bool
+Config::isSane()
+{
+    if (hasFuncID_) {
+        return true;
+    }
+
+    std::cerr << "Missing required options 'function'" << std::endl;
+    return false;
+}
+
+/*
+ * Append the description of all supported options to 'opts'.
+ *
+ * The options are organized in four groups: generator, OpenCL, BLAS and
+ * decomposition arguments.
+ *
+ * opts         Options description to be extended.
+ * useDefaults  If true, options that have a default are registered together
+ *              with that default value. If false, no defaults are
+ *              registered, so that only explicitly specified options end up
+ *              in the variables map.
+ */
+void
+Config::setOptDesc(
+    po::options_description& opts,
+    bool useDefaults)
+{
+    po::options_description genOpts("Generator Arguments");
+    genOpts.add_options()
+        ("cpp",
+            (useDefaults ? po::value<std::string>()->default_value(cpp())
+                         : po::value<std::string>()),
+            "Output file name for C++ generated source")
+        ("cl", po::value<std::string>(),
+            "Output file name for OpenCL generated source")
+        ("data",
+            (useDefaults ? po::value<std::string>()->default_value("random")
+                         : po::value<std::string>()),
+            "Data generation pattern\n"
+            "Format: {random | unit | sawtooth}")
+        ( "skip-accuracy",
+          "Don't generate code for accuracy check. Applicable if the program "
+          "is needed only for performance measurement")
+    ;
+
+    po::options_description openclOpts("OpenCL Arguments");
+    openclOpts.add_options()
+        ("platform",
+            (useDefaults ? po::value<std::string>()->default_value(platform())
+                         : po::value<std::string>()),
+            "Platform name")
+        ("device",
+            (useDefaults ? po::value<std::string>()->default_value(device())
+                         : po::value<std::string>()),
+            "Device name")
+        ("build-options", po::value<std::string>(),
+            "Build options")
+    ;
+
+    po::options_description kargsOpts("BLAS Arguments");
+    kargsOpts.add_options()
+        ("function,f", po::value<std::string>(),
+            "Function name, mandatory\n"
+            "Format: {s | d | c | z}{BLAS function}")
+        ("order",
+            (useDefaults ? po::value<clblasOrder>()->default_value(clblasRowMajor)
+                         : po::value<clblasOrder>()),
+            "Data ordering\n"
+            "Format: {column | row}")
+        ("side",
+            (useDefaults ? po::value<clblasSide>()->default_value(clblasLeft)
+                         : po::value<clblasSide>()),
+            "The side matrix A is located relative to matrix B\n"
+            "Format: {left | right}")
+        ("uplo",
+            (useDefaults ? po::value<clblasUplo>()->default_value(clblasUpper)
+                         : po::value<clblasUplo>()),
+            "Upper or lower triangle of matrix is being referenced\n"
+            "Format: {upper | lower}")
+        ("transA",
+            (useDefaults ? po::value<clblasTranspose>()->default_value(clblasNoTrans)
+                         : po::value<clblasTranspose>()),
+            "Matrix A transposition operation\n"
+            "Format: {n | t | c}")
+        ("transB",
+            (useDefaults ? po::value<clblasTranspose>()->default_value(clblasNoTrans)
+                         : po::value<clblasTranspose>()),
+            "Matrix B transposition operation\n"
+            "Format: {n | t | c}")
+        ("diag",
+            (useDefaults ? po::value<clblasDiag>()->default_value(clblasNonUnit)
+                         : po::value<clblasDiag>()),
+            "Whether the matrix is unit triangular\n"
+            "Format: {unit | nonunit}")
+        // M, N and K follow the same rule as every other option: a default
+        // is registered only when useDefaults is set.
+        ("M,M",
+            (useDefaults ? po::value<size_t>()->default_value(256)
+                         : po::value<size_t>())
+        )
+        ("N,N",
+            (useDefaults ? po::value<size_t>()->default_value(256)
+                         : po::value<size_t>())
+        )
+        ("K,K",
+            (useDefaults ? po::value<size_t>()->default_value(256)
+                         : po::value<size_t>())
+        )
+        ("alpha",
+            (useDefaults ? po::value<std::string>()->default_value("1")
+                         : po::value<std::string>()),
+            "Alpha multiplier\n"
+            "Format: real[,imag]")
+        ("beta",
+            (useDefaults ? po::value<std::string>()->default_value("1")
+                         : po::value<std::string>()),
+            "Beta multiplier\n"
+            "Format: real[,imag]")
+        ("lda", po::value<size_t>(),
+            "Leading dimension of the matrix A")
+        ("ldb", po::value<size_t>(),
+            "Leading dimension of the matrix B")
+        ("ldc", po::value<size_t>(),
+            "Leading dimension of the matrix C")
+        ("offA",
+            (useDefaults ? po::value<size_t>()->default_value(0)
+                         : po::value<size_t>()),
+            "Start offset in buffer of matrix A")
+        ("offBX",
+            (useDefaults ? po::value<size_t>()->default_value(0)
+                         : po::value<size_t>()),
+            "Start offset in buffer of matrix B or vector X")
+        ("offCY",
+            (useDefaults ? po::value<size_t>()->default_value(0)
+                         : po::value<size_t>()),
+            "Start offset in buffer of matrix C or vector Y")
+        ("incx",
+            (useDefaults ? po::value<int>()->default_value(1)
+                         : po::value<int>()),
+            "Increment in the array X")
+        ("incy",
+            (useDefaults ? po::value<int>()->default_value(1)
+                         : po::value<int>()),
+            "Increment in the array Y")
+    ;
+
+    po::options_description decompositionOpts("Decomposition Options");
+    decompositionOpts.add_options()
+        ("decomposition,d", po::value<std::string>(),
+            "SubproblemDim\n"
+            "Format: {subdims[0].x},{subdims[0].y},\n"
+            "        {subdims[0].bwidth},\n"
+            "        {subdims[1].x},{subdims[1].y},\n"
+            "        {subdims[1].bwidth}")
+        ("multikernel", useDefaults ? po::value<bool>()->default_value(false)
+                                    : po::value<bool>(),
+            "Allow division of one BLAS function between several kernels")
+    ;
+
+    opts.add(genOpts).add(openclOpts).add(kargsOpts).add(decompositionOpts);
+}
+
+/*
+ * Read options from a configuration file and apply them.
+ *
+ * filename  Path of the configuration file.
+ *
+ * Returns false if 'filename' is NULL or empty, or if parsing the file
+ * raised a program_options error other than an unknown option. Otherwise
+ * returns the result of applying the collected options without stopping at
+ * the first invalid value.
+ */
+bool
+Config::loadConfig(const char* filename)
+{
+    if ((filename == NULL) || (*filename == '\0')) {
+        return false;
+    }
+
+    po::options_description cfgOpts;
+    setOptDesc(cfgOpts, false);
+
+    try {
+        std::ifstream in(filename);
+        po::store(po::parse_config_file<char>(in, cfgOpts), vm);
+        po::notify(vm);
+    }
+    catch (const po::invalid_command_line_syntax &err) {
+#if BOOST_VERSION >= 104200
+        switch (err.kind()) {
+        case po::invalid_syntax::missing_parameter:
+            std::cerr << "Missing argument for option `" << err.tokens()
+                << "'" << std::endl;
+            break;
+        default:
+            std::cerr << "Syntax error, kind " << int(err.kind())
+                << std::endl;
+            break;
+        }
+#else
+        std::cerr << err.msg;
+#endif
+        return false;
+    }
+    catch (const po::validation_error &err) {
+        std::cerr << err.what() << std::endl;
+        return false;
+    }
+#if BOOST_VERSION >= 104200
+    catch (const po::reading_file &err) {
+        std::cerr << err.what() << std::endl;
+        return false;
+    }
+#endif
+    catch (const po::unknown_option &err) {
+        // Reported, but not treated as fatal: the options stored so far
+        // are still applied below.
+        std::cerr << err.what() << std::endl;
+    }
+    catch (const po::error &err) {
+        // Any other program_options failure, e.g. a malformed line in the
+        // configuration file, which is not reported as
+        // invalid_command_line_syntax. Without this handler the exception
+        // would propagate out of the function.
+        std::cerr << err.what() << std::endl;
+        return false;
+    }
+
+    return applyOptions(vm, false);
+}
+
+/*
+ * Parse the command line and apply the resulting options.
+ *
+ * Besides the options registered by setOptDesc(), "config" (configuration
+ * file name, defaulting to the default configuration) and "help" are
+ * accepted. If a configuration file is named, it is loaded before the
+ * options are applied.
+ *
+ * Returns false if parsing failed with an error other than an unknown
+ * option, if help was requested (the usage text is printed to stdout), or
+ * if applying the options failed; true otherwise.
+ */
+bool
+Config::parseCommandLine(int argc, char *argv[])
+{
+
+    po::options_description helpOpts("Application Arguments");
+    helpOpts.add_options()
+        ("config", po::value<std::string>()->default_value(defaultConfig_),
+            "Configuration file")
+        ("help,h", "Show this help message");
+    po::options_description visibleOpts;
+    visibleOpts.add(helpOpts);
+    setOptDesc(visibleOpts, true);
+
+    try {
+        po::store(po::parse_command_line(argc, argv, visibleOpts), vm);
+        po::notify(vm);
+    }
+    catch (const po::invalid_command_line_syntax &err) {
+#if BOOST_VERSION >= 104200
+        switch (err.kind()) {
+        case po::invalid_syntax::missing_parameter:
+            std::cerr << "Missing argument for option `" << err.tokens()
+                << "'" << std::endl;
+            break;
+        default:
+            std::cerr << "Syntax error, kind " << int(err.kind())
+                << std::endl;
+            break;
+        }
+#else
+        std::cerr << err.msg;
+#endif
+        return false;
+    }
+    catch (const po::validation_error &err) {
+        std::cerr << err.what() << std::endl;
+        return false;
+    }
+    catch (const po::unknown_option &err) {
+        // Reported, but not treated as fatal: processing continues with
+        // whatever is already stored in the variables map.
+        std::cerr << err.what() << std::endl;
+    }
+    catch (const po::error &err) {
+        // Remaining program_options failures (for instance an option given
+        // more than once); without this handler they would propagate out
+        // of the function.
+        std::cerr << err.what() << std::endl;
+        return false;
+    }
+
+    if (vm.count("help")) {
+        std::cout << visibleOpts << std::endl;
+        return false;
+    }
+    if (vm.count("config")) {
+        // The result is intentionally ignored here; the options are
+        // applied once more below with stopOnError enabled.
+        loadConfig(vm["config"].as<std::string>().c_str());
+    }
+
+    return applyOptions(vm);
+}
+
+/*
+ * Transfer the options found in a variables map into this configuration.
+ *
+ * vm           Parsed options.
+ * stopOnError  If true, return at the first option holding an invalid
+ *              value. If false, report the problem, skip that option and
+ *              keep processing the remaining ones.
+ *
+ * An invalid function name always terminates processing, regardless of
+ * 'stopOnError'. The function is handled first because the way the alpha
+ * and beta values are parsed depends on the data type it selects.
+ *
+ * Returns true if every present option was applied successfully.
+ */
+bool
+Config::applyOptions(
+    const po::variables_map& vm,
+    bool stopOnError)
+{
+    bool rc;
+    ArgMultiplier v;
+
+    rc = true;
+
+    if (vm.count("function")) {
+        if (!setFunction(vm["function"].as<std::string>())) {
+            std::cerr << "Invalid function name: " <<
+                vm["function"].as<std::string>() << std::endl;
+            return false;
+        }
+    }
+
+    if (vm.count("cpp")) {
+        setCpp(vm["cpp"].as<std::string>());
+    }
+    if (vm.count("cl")) {
+        setCl(vm["cl"].as<std::string>());
+    }
+    if (vm.count("data")) {
+        if (!setDataPattern(vm["data"].as<std::string>())) {
+            std::cerr << "Invalid data pattern name" << std::endl;
+            rc = false;
+            if (stopOnError) {
+                return false;
+            }
+        }
+    }
+    if (vm.count("skip-accuracy")) {
+        setSkipAccuracy();
+    }
+
+    if (vm.count("platform")) {
+        if (!setPlatform(vm["platform"].as<std::string>())) {
+            std::cerr << "Invalid platform name" << std::endl;
+            rc = false;
+            if (stopOnError) {
+                return false;
+            }
+        }
+    }
+    if (vm.count("device")) {
+        if (!setDevice(vm["device"].as<std::string>())) {
+            std::cerr << "Invalid device name" << std::endl;
+            rc = false;
+            if (stopOnError) {
+                return false;
+            }
+        }
+    }
+    if (vm.count("build-options")) {
+        setBuildOptions(vm["build-options"].as<std::string>());
+    }
+
+    if (vm.count("order")) {
+        setOrder(vm["order"].as<clblasOrder>());
+    }
+    if (vm.count("side")) {
+        setSide(vm["side"].as<clblasSide>());
+    }
+    if (vm.count("uplo")) {
+        setUplo(vm["uplo"].as<clblasUplo>());
+    }
+    if (vm.count("transA")) {
+        setTransA(vm["transA"].as<clblasTranspose>());
+    }
+    if (vm.count("transB")) {
+        setTransB(vm["transB"].as<clblasTranspose>());
+    }
+    if (vm.count("diag")) {
+        setDiag(vm["diag"].as<clblasDiag>());
+    }
+    if (vm.count("M")) {
+        setM(vm["M"].as<size_t>());
+    }
+    if (vm.count("N")) {
+        setN(vm["N"].as<size_t>());
+    }
+    if (vm.count("K")) {
+        setK(vm["K"].as<size_t>());
+    }
+    if (vm.count("alpha")) {
+        // Store the multiplier only if it was parsed successfully;
+        // otherwise 'v' holds no meaningful value.
+        if (parseArgMultiplier(vm["alpha"].as<std::string>(), v)) {
+            setAlpha(v);
+        }
+        else {
+            std::cerr << "in option 'alpha': invalid option value" << std::endl;
+            rc = false;
+            if (stopOnError) {
+                return false;
+            }
+        }
+    }
+    if (vm.count("beta")) {
+        // Same rule as for alpha.
+        if (parseArgMultiplier(vm["beta"].as<std::string>(), v)) {
+            setBeta(v);
+        }
+        else {
+            std::cerr << "in option 'beta': invalid option value" << std::endl;
+            rc = false;
+            if (stopOnError) {
+                return false;
+            }
+        }
+    }
+    if (vm.count("lda")) {
+        setLDA(vm["lda"].as<size_t>());
+    }
+    if (vm.count("ldb")) {
+        setLDB(vm["ldb"].as<size_t>());
+    }
+    if (vm.count("ldc")) {
+        setLDC(vm["ldc"].as<size_t>());
+    }
+    if (vm.count("offA")) {
+        setOffA(vm["offA"].as<size_t>());
+    }
+    if (vm.count("offBX")) {
+        setOffBX(vm["offBX"].as<size_t>());
+    }
+    if (vm.count("offCY")) {
+        setOffCY(vm["offCY"].as<size_t>());
+    }
+    if (vm.count("incx")) {
+        setIncX(vm["incx"].as<int>());
+    }
+    if (vm.count("incy")) {
+        setIncY(vm["incy"].as<int>());
+    }
+
+    if (vm.count("decomposition")) {
+        if (!parseDecompositionOpt(vm["decomposition"].as<std::string>())) {
+            std::cerr << "in option 'decomposition': invalid option value" << std::endl;
+            rc = false;
+            if (stopOnError) {
+                return false;
+            }
+        }
+    }
+
+    if (vm.count("multikernel")) {
+        setMultiKernel(vm["multikernel"].as<bool>());
+    }
+
+    return rc;
+}
+
+/*
+ * Read a data ordering from a stream. Accepted words are "row" and
+ * "column"; anything else raises po::validation_error and leaves 'order'
+ * untouched.
+ */
+std::istream& operator>>(std::istream& in, clblasOrder& order)
+{
+    std::string word;
+    in >> word;
+
+    const bool isRow = (word == "row");
+    if (!isRow && (word != "column")) {
+#if BOOST_VERSION >= 104200
+        throw po::validation_error(po::validation_error::invalid_option_value);
+#else
+        throw po::validation_error("invalid option value");
+#endif
+    }
+
+    order = isRow ? clblasRowMajor : clblasColumnMajor;
+    return in;
+}
+
+/*
+ * Write a data ordering as "row" or "column". Nothing is written for any
+ * other value.
+ */
+std::ostream& operator<<(std::ostream& out, const clblasOrder& order)
+{
+    if (order == clblasRowMajor) {
+        out << "row";
+    }
+    else if (order == clblasColumnMajor) {
+        out << "column";
+    }
+
+    return out;
+}
+
+/*
+ * Read a matrix side from a stream. Accepted words are "left" and "right";
+ * anything else raises po::validation_error and leaves 'side' untouched.
+ */
+std::istream& operator>>(std::istream& in, clblasSide& side)
+{
+    std::string word;
+    in >> word;
+
+    const bool isLeft = (word == "left");
+    if (!isLeft && (word != "right")) {
+#if BOOST_VERSION >= 104200
+        throw po::validation_error(po::validation_error::invalid_option_value);
+#else
+        throw po::validation_error("invalid option value");
+#endif
+    }
+
+    side = isLeft ? clblasLeft : clblasRight;
+    return in;
+}
+
+/*
+ * Write a matrix side as "left" or "right". Nothing is written for any
+ * other value.
+ */
+std::ostream& operator<<(std::ostream& out, const clblasSide& side)
+{
+    if (side == clblasLeft) {
+        out << "left";
+    }
+    else if (side == clblasRight) {
+        out << "right";
+    }
+
+    return out;
+}
+
+/*
+ * Read a triangle selector from a stream. Accepted words are "upper" and
+ * "lower"; anything else raises po::validation_error and leaves 'uplo'
+ * untouched.
+ */
+std::istream& operator>>(std::istream& in, clblasUplo& uplo)
+{
+    std::string word;
+    in >> word;
+
+    const bool isUpper = (word == "upper");
+    if (!isUpper && (word != "lower")) {
+#if BOOST_VERSION >= 104200
+        throw po::validation_error(po::validation_error::invalid_option_value);
+#else
+        throw po::validation_error("invalid option value");
+#endif
+    }
+
+    uplo = isUpper ? clblasUpper : clblasLower;
+    return in;
+}
+
+/*
+ * Write a triangle selector as "upper" or "lower". Nothing is written for
+ * any other value.
+ */
+std::ostream& operator<<(std::ostream& out, const clblasUplo& uplo)
+{
+    if (uplo == clblasUpper) {
+        out << "upper";
+    }
+    else if (uplo == clblasLower) {
+        out << "lower";
+    }
+
+    return out;
+}
+
+/*
+ * Read a transposition mode from a stream. Accepted words are "n", "t" and
+ * "c"; anything else raises po::validation_error and leaves 'trans'
+ * untouched.
+ */
+std::istream& operator>>(std::istream& in, clblasTranspose& trans)
+{
+    std::string word;
+    in >> word;
+
+    if (word == "n") {
+        trans = clblasNoTrans;
+        return in;
+    }
+    if (word == "t") {
+        trans = clblasTrans;
+        return in;
+    }
+    if (word == "c") {
+        trans = clblasConjTrans;
+        return in;
+    }
+
+#if BOOST_VERSION >= 104200
+    throw po::validation_error(po::validation_error::invalid_option_value);
+#else
+    throw po::validation_error("invalid option value");
+#endif
+}
+
+/*
+ * Write a transposition mode as "n", "t" or "c". Nothing is written for
+ * any other value.
+ */
+std::ostream& operator<<(std::ostream& out, const clblasTranspose& trans)
+{
+    if (trans == clblasNoTrans) {
+        out << "n";
+    }
+    else if (trans == clblasTrans) {
+        out << "t";
+    }
+    else if (trans == clblasConjTrans) {
+        out << "c";
+    }
+
+    return out;
+}
+
+/*
+ * Read a diagonal kind from a stream. Accepted words are "unit" and
+ * "nonunit"; anything else raises po::validation_error and leaves 'diag'
+ * untouched.
+ */
+std::istream& operator>>(std::istream& in, clblasDiag& diag)
+{
+    std::string word;
+    in >> word;
+
+    const bool isUnit = (word == "unit");
+    if (!isUnit && (word != "nonunit")) {
+#if BOOST_VERSION >= 104200
+        throw po::validation_error(po::validation_error::invalid_option_value);
+#else
+        throw po::validation_error("invalid option value");
+#endif
+    }
+
+    diag = isUnit ? clblasUnit : clblasNonUnit;
+    return in;
+}
+
+/*
+ * Write a diagonal kind as "unit" or "nonunit". Nothing is written for any
+ * other value.
+ */
+std::ostream& operator<<(std::ostream& out, const clblasDiag& diag)
+{
+    if (diag == clblasUnit) {
+        out << "unit";
+    }
+    else if (diag == clblasNonUnit) {
+        out << "nonunit";
+    }
+
+    return out;
+}
+
+/*
+ * Parse the value of the "decomposition" option and store the result.
+ *
+ * The value must consist of exactly six unsigned integers, in the order
+ * x0, y0, bwidth0, x1, y1, bwidth1. Returns false, without changing the
+ * stored decomposition, if there are fewer or more tokens or if any token
+ * is not convertible to size_t.
+ */
+bool
+Config::parseDecompositionOpt(const std::string& opt)
+{
+    size_t dims[6];    // x0, y0, bwidth0, x1, y1, bwidth1
+
+    boost::tokenizer<> tokens(opt);
+    boost::tokenizer<>::iterator cur = tokens.begin();
+    int filled = 0;
+
+    try {
+        while (filled < 6) {
+            if (cur == tokens.end()) {
+                // too few values
+                return false;
+            }
+            dims[filled] = boost::lexical_cast<size_t>(*cur);
+            ++cur;
+            ++filled;
+        }
+    }
+    catch (boost::bad_lexical_cast&) {
+        return false;
+    }
+
+    if (cur != tokens.end()) {
+        // too many values
+        return false;
+    }
+
+    setDecomposition(dims[0], dims[1], dims[2], dims[3], dims[4], dims[5]);
+    return true;
+}
+
+/*
+ * Parse a scalar multiplier given as "real[,imag]".
+ *
+ * The currently selected data type (kargs_.dtype) decides which member of
+ * 'v' is filled. Real types take exactly one value. Complex types take one
+ * or two values; a missing imaginary part is set to 0.
+ *
+ * opt  Option value to parse.
+ * v    Receives the parsed multiplier; its content is unspecified when the
+ *      function returns false.
+ *
+ * Returns true only if the value is well formed and has no extra tokens.
+ */
+bool
+Config::parseArgMultiplier(
+    const std::string& opt,
+    ArgMultiplier& v)
+{
+    boost::char_separator<char> sep(",");
+    boost::tokenizer< boost::char_separator<char> > tok(opt, sep);
+    boost::tokenizer< boost::char_separator<char> >::iterator it = tok.begin();
+
+    // An empty value (or one made of separators only) yields no tokens;
+    // the iterator must not be dereferenced in that case.
+    if (it == tok.end()) {
+        return false;
+    }
+
+    try {
+        switch (kargs_.dtype) {
+        case TYPE_FLOAT:
+            v.argFloat = boost::lexical_cast<float>(*it);
+            ++it;
+            break;
+        case TYPE_DOUBLE:
+            v.argDouble = boost::lexical_cast<double>(*it);
+            ++it;
+            break;
+        case TYPE_COMPLEX_FLOAT:
+            v.argFloatComplex.s[0] = boost::lexical_cast<float>(*it);
+            ++it;
+            if (it == tok.end()) {
+                v.argFloatComplex.s[1] = 0;
+            }
+            else {
+                v.argFloatComplex.s[1] = boost::lexical_cast<float>(*it);
+                ++it;
+            }
+            break;
+        case TYPE_COMPLEX_DOUBLE:
+            v.argDoubleComplex.s[0] = boost::lexical_cast<double>(*it);
+            ++it;
+            if (it == tok.end()) {
+                v.argDoubleComplex.s[1] = 0;
+            }
+            else {
+                v.argDoubleComplex.s[1] = boost::lexical_cast<double>(*it);
+                ++it;
+            }
+            break;
+        default:
+            // Unknown data type: nothing can be parsed.
+            return false;
+        }
+    }
+    catch (boost::bad_lexical_cast&) {
+        return false;
+    }
+
+    // Trailing tokens make the value invalid.
+    return (it == tok.end());
+}
diff --git a/src/library/tools/ktest/config.cpp b/src/library/tools/ktest/config.cpp
new file mode 100644
index 0000000..3098e36
--- /dev/null
+++ b/src/library/tools/ktest/config.cpp
@@ -0,0 +1,548 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+
+#include <blas_funcs.h>
+#include <solution_seq.h>
+
+#include "config.h"
+
+using namespace clMath;
+
+static const char DEFAULT_PLATFORM_NAME[] = "AMD Accelerated Parallel Processing";
+
+/*
+ * Build a configuration holding the default settings: default platform
+ * with a device selected from it, single precision GEMM, row major order,
+ * zeroed problem sizes, multipliers and decomposition.
+ */
+Config::Config() :
+    defaultConfig_(""),
+    cpp_("ktest.cpp"),
+    dataPattern_(RANDOM_MATRIX),
+    // The handles must have a defined value: platform/device lookup below
+    // can fail without assigning them, and setDevice() tests platform_.
+    platform_(NULL),
+    device_(NULL),
+    buildOptions_(""),
+    funcID_(CLBLAS_GEMM),
+    hasFuncID_(false), hasSubdims_(false),
+    multiKernel_(false),
+    skipAccuracy_(false)
+{
+    setPlatform(DEFAULT_PLATFORM_NAME);
+    setDevice("");
+
+    memset(&kargs_, 0, sizeof(kargs_));
+    kargs_.kernType = CLBLAS_COMPUTING_KERNEL;
+    kargs_.A = kargs_.B = kargs_.C = NULL;
+    kargs_.offsetM = kargs_.offsetN = 0;
+    kargs_.scimage[0] = kargs_.scimage[1] = NULL;
+    kargs_.addrBits = 0;
+
+    kargs_.dtype = TYPE_FLOAT;
+    kargs_.order = clblasRowMajor;
+    kargs_.side = clblasLeft;
+    kargs_.uplo = clblasUpper;
+    kargs_.transA = clblasNoTrans;
+    kargs_.transB = clblasNoTrans;
+    kargs_.diag = clblasNonUnit;
+    kargs_.M = kargs_.N = kargs_.K = 0;
+    kargs_.lda.matrix = kargs_.ldb.matrix = kargs_.ldc.matrix = 0;
+    kargs_.offA = kargs_.offBX = kargs_.offCY = 0;
+
+    memset(&kargs_.alpha, 0, sizeof(kargs_.alpha));
+    memset(&kargs_.beta, 0, sizeof(kargs_.beta));
+
+    memset(subdims_, 0, sizeof(subdims_));
+
+    // Names of the functions that can be selected with setFunction().
+    names_[CLBLAS_GEMV] = "gemv";
+    names_[CLBLAS_SYMV] = "symv";
+    names_[CLBLAS_GEMM] = "gemm";
+    names_[CLBLAS_TRMM] = "trmm";
+    names_[CLBLAS_TRSM] = "trsm";
+    names_[CLBLAS_SYRK] = "syrk";
+    names_[CLBLAS_SYR2K] = "syr2k";
+
+    // Default OpenCL output file name is derived from the default function.
+    cl_ = names_[funcID_] + ".cl";
+}
+
+// Empties the function name table on destruction.
+Config::~Config()
+{
+    names_.clear();
+}
+
+// Returns the output file name for the generated C++ source.
+const std::string&
+Config::cpp() const
+{
+    return cpp_;
+}
+
+// Returns the output file name for the generated OpenCL source.
+const std::string&
+Config::cl() const
+{
+    return cl_;
+}
+
+// Returns the selected data generation pattern.
+clMath::KTestMatrixGenerator
+Config::dataPattern() const
+{
+    return dataPattern_;
+}
+
+/*
+ * Query the name of the selected OpenCL platform.
+ *
+ * Returns an empty string if either OpenCL query fails.
+ */
+std::string
+Config::platform() const
+{
+    size_t len;
+
+    // First call only asks for the size of the name.
+    if (clGetPlatformInfo(platform_, CL_PLATFORM_NAME, 0, NULL, &len) != CL_SUCCESS) {
+        return "";
+    }
+
+    std::string result;
+    char *buf = new char[len + 1];
+    if (clGetPlatformInfo(platform_, CL_PLATFORM_NAME, len, buf, NULL) == CL_SUCCESS) {
+        result = buf;
+    }
+    delete[] buf;
+
+    return result;
+}
+
+/*
+ * Query the name of the selected OpenCL device.
+ *
+ * Returns an empty string if either OpenCL query fails.
+ */
+std::string
+Config::device() const
+{
+    size_t len;
+
+    // First call only asks for the size of the name.
+    if (clGetDeviceInfo(device_, CL_DEVICE_NAME, 0, NULL, &len) != CL_SUCCESS) {
+        return "";
+    }
+
+    std::string result;
+    char *buf = new char[len + 1];
+    if (clGetDeviceInfo(device_, CL_DEVICE_NAME, len, buf, NULL) == CL_SUCCESS) {
+        result = buf;
+    }
+    delete[] buf;
+
+    return result;
+}
+
+// Returns the stored build options string.
+const std::string&
+Config::buildOptions() const
+{
+    return buildOptions_;
+}
+
+/*
+ * Copy the stored kernel arguments into '*kargs' and fill in the address
+ * width of the selected device. The status reported by the address width
+ * query is not examined.
+ */
+void
+Config::kargs(CLBlasKargs *kargs) const
+{
+    cl_int status;
+
+    *kargs = kargs_;
+
+    kargs->addrBits = deviceAddressBits(device_, &status);
+}
+
+// Returns the value of the "multikernel" setting.
+bool
+Config::permitMultiKernels() const
+{
+    return multiKernel_;
+}
+
+// Returns true unless skipping of the accuracy check was requested.
+bool Config::withAccuracy() const
+{
+    return !skipAccuracy_;
+}
+
+/*
+ * Retrieve the user supplied decomposition.
+ *
+ * subdims  Receives a copy of the stored subproblem dimensions; in the
+ *          first two entries itemX/itemY are set equal to x/y.
+ *
+ * Returns false, leaving 'subdims' untouched, if no decomposition has been
+ * set.
+ */
+bool
+Config::decomposition(SubproblemDim subdims[MAX_SUBDIMS]) const
+{
+    if (hasSubdims_) {
+        int level = 0;
+        while (level < MAX_SUBDIMS) {
+            subdims[level] = subdims_[level];
+            ++level;
+        }
+
+        subdims[0].itemX = subdims[0].x;
+        subdims[0].itemY = subdims[0].y;
+        subdims[1].itemX = subdims[1].x;
+        subdims[1].itemY = subdims[1].y;
+
+        return true;
+    }
+
+    return false;
+}
+
+// Returns the identifier of the selected BLAS function.
+BlasFunctionID
+Config::blasFunctionID() const
+{
+    return funcID_;
+}
+
+// Sets the file name used as default value of the "config" option.
+void
+Config::setDefaultConfig(const std::string& filename)
+{
+    defaultConfig_ = filename;
+}
+
+// Sets the output file name for the generated C++ source.
+void
+Config::setCpp(const std::string& name)
+{
+    cpp_ = name;
+}
+
+// Sets the output file name for the generated OpenCL source.
+void
+Config::setCl(const std::string& name)
+{
+    cl_ = name;
+}
+
+/*
+ * Select the data generation pattern by name.
+ *
+ * Recognized names are "random", "unit" and "sawtooth". Returns false and
+ * keeps the current pattern if the name is not recognized.
+ */
+bool
+Config::setDataPattern(const std::string& name)
+{
+    static const struct {
+        const char *label;
+        clMath::KTestMatrixGenerator generator;
+    } patterns[] = {
+        { "random",   clMath::RANDOM_MATRIX },
+        { "unit",     clMath::UNIT_MATRIX },
+        { "sawtooth", clMath::SAWTOOTH_MATRIX }
+    };
+    const size_t nrPatterns = sizeof(patterns) / sizeof(patterns[0]);
+
+    for (size_t i = 0; i < nrPatterns; i++) {
+        if (strcmp(name.c_str(), patterns[i].label) == 0) {
+            dataPattern_ = patterns[i].generator;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/*
+ * Select the OpenCL platform with the given name.
+ *
+ * name  Exact platform name; an empty string selects the default platform
+ *       name.
+ *
+ * Platforms whose name cannot be queried are skipped. Returns false,
+ * leaving the current platform unchanged, if the platform list cannot be
+ * obtained or no platform matches.
+ */
+bool
+Config::setPlatform(const std::string& name)
+{
+    cl_uint count;
+    cl_int status = clGetPlatformIDs(0, NULL, &count);
+    if ((status != CL_SUCCESS) || (count == 0)) {
+        return false;
+    }
+
+    cl_platform_id *ids = new cl_platform_id[count];
+    status = clGetPlatformIDs(count, ids, NULL);
+    if (status != CL_SUCCESS) {
+        delete[] ids;
+        return false;
+    }
+
+    // Name to look for; empty request means the default platform.
+    const char *wanted = name.empty() ? DEFAULT_PLATFORM_NAME : name.c_str();
+
+    bool matched = false;
+    for (cl_uint idx = 0; (idx < count) && !matched; idx++) {
+        size_t len;
+        status = clGetPlatformInfo(ids[idx], CL_PLATFORM_NAME, 0, NULL, &len);
+        if (status != CL_SUCCESS) {
+            continue;
+        }
+
+        char *label = new char[len + 1];
+        status = clGetPlatformInfo(ids[idx], CL_PLATFORM_NAME, len, label, NULL);
+        if (status == CL_SUCCESS) {
+            matched = (strcmp(label, wanted) == 0);
+        }
+        delete[] label;
+
+        if (matched) {
+            platform_ = ids[idx];
+        }
+    }
+
+    delete[] ids;
+    return matched;
+}
+
+/*
+ * Select an OpenCL device of the current platform by name.
+ *
+ * name  Exact device name; an empty string selects the first device whose
+ *       name can be queried.
+ *
+ * If no platform is selected yet, the default one is selected first.
+ * Returns false, leaving the current device unchanged, if no platform is
+ * available, the device list cannot be obtained or no device matches.
+ */
+bool
+Config::setDevice(const std::string& name)
+{
+    if (platform_ == NULL) {
+        if (!setPlatform("")) {
+            return false;
+        }
+    }
+
+    cl_uint count;
+    cl_int status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &count);
+    if ((status != CL_SUCCESS) || (count == 0)) {
+        return false;
+    }
+
+    cl_device_id *ids = new cl_device_id[count];
+    status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, count, ids, NULL);
+    if (status != CL_SUCCESS) {
+        delete[] ids;
+        return false;
+    }
+
+    const bool anyDevice = name.empty();
+
+    bool matched = false;
+    for (cl_uint idx = 0; (idx < count) && !matched; idx++) {
+        size_t len;
+        status = clGetDeviceInfo(ids[idx], CL_DEVICE_NAME, 0, NULL, &len);
+        if (status != CL_SUCCESS) {
+            continue;
+        }
+
+        char *label = new char[len + 1];
+        status = clGetDeviceInfo(ids[idx], CL_DEVICE_NAME, len, label, NULL);
+        if (status == CL_SUCCESS) {
+            matched = anyDevice || (strcmp(label, name.c_str()) == 0);
+        }
+        delete[] label;
+
+        if (matched) {
+            device_ = ids[idx];
+        }
+    }
+
+    delete[] ids;
+    return matched;
+}
+
+// Stores the build options string.
+void
+Config::setBuildOptions(const std::string& options)
+{
+    buildOptions_ = options;
+}
+
+/*
+ * Select the BLAS function from a name of the form
+ * {s | d | c | z}{function}.
+ *
+ * The first character (either case) selects the data type; the rest must
+ * match one of the registered function names exactly. On success the
+ * function identifier is stored and the OpenCL output file name is reset
+ * to "<function>.cl".
+ *
+ * Returns false for an empty name, an unknown type prefix or an unknown
+ * function. Note that the data type is updated as soon as the prefix is
+ * recognized, even if the function name turns out to be unknown.
+ */
+bool
+Config::setFunction(const std::string& name)
+{
+    if (name.empty()) {
+        return false;
+    }
+
+    const char prefix = name.c_str()[0];
+    if ((prefix == 's') || (prefix == 'S')) {
+        kargs_.dtype = TYPE_FLOAT;
+    }
+    else if ((prefix == 'd') || (prefix == 'D')) {
+        kargs_.dtype = TYPE_DOUBLE;
+    }
+    else if ((prefix == 'c') || (prefix == 'C')) {
+        kargs_.dtype = TYPE_COMPLEX_FLOAT;
+    }
+    else if ((prefix == 'z') || (prefix == 'Z')) {
+        kargs_.dtype = TYPE_COMPLEX_DOUBLE;
+    }
+    else {
+        return false;
+    }
+
+    // Function name without the type prefix
+    const std::string funcName = name.substr(1);
+
+    NameMap::iterator entry = names_.begin();
+    while (entry != names_.end()) {
+        if (strcmp(funcName.c_str(), entry->second.c_str()) == 0) {
+            funcID_ = entry->first;
+            setCl(entry->second + ".cl");
+            hasFuncID_ = true;
+            return true;
+        }
+        ++entry;
+    }
+
+    return false;
+}
+
+// Sets the data ordering kernel argument.
+void
+Config::setOrder(clblasOrder order)
+{
+    kargs_.order = order;
+}
+
+// Sets the matrix side kernel argument.
+void
+Config::setSide(clblasSide side)
+{
+    kargs_.side = side;
+}
+
+// Sets the upper/lower triangle kernel argument.
+void
+Config::setUplo(clblasUplo uplo)
+{
+    kargs_.uplo = uplo;
+}
+
+// Sets the transposition mode of matrix A.
+void
+Config::setTransA(clblasTranspose transA)
+{
+    kargs_.transA = transA;
+}
+
+// Sets the transposition mode of matrix B.
+void
+Config::setTransB(clblasTranspose transB)
+{
+    kargs_.transB = transB;
+}
+
+// Sets the unit/non-unit diagonal kernel argument.
+void
+Config::setDiag(clblasDiag diag)
+{
+    kargs_.diag = diag;
+}
+
+// Sets the problem size M.
+void
+Config::setM(size_t M)
+{
+    kargs_.M = M;
+}
+
+// Sets the problem size N.
+void
+Config::setN(size_t N)
+{
+    kargs_.N = N;
+}
+
+// Sets the problem size K.
+void
+Config::setK(size_t K)
+{
+    kargs_.K = K;
+}
+
+/*
+ * Store the alpha multiplier. Only the member of 'alpha' that corresponds
+ * to the currently selected data type is copied.
+ */
+void
+Config::setAlpha(ArgMultiplier alpha)
+{
+    if (kargs_.dtype == TYPE_FLOAT) {
+        kargs_.alpha.argFloat = alpha.argFloat;
+    }
+    else if (kargs_.dtype == TYPE_DOUBLE) {
+        kargs_.alpha.argDouble = alpha.argDouble;
+    }
+    else if (kargs_.dtype == TYPE_COMPLEX_FLOAT) {
+        kargs_.alpha.argFloatComplex = alpha.argFloatComplex;
+    }
+    else if (kargs_.dtype == TYPE_COMPLEX_DOUBLE) {
+        kargs_.alpha.argDoubleComplex = alpha.argDoubleComplex;
+    }
+}
+
+/*
+ * Store the beta multiplier. Only the member of 'beta' that corresponds
+ * to the currently selected data type is copied.
+ */
+void
+Config::setBeta(ArgMultiplier beta)
+{
+    if (kargs_.dtype == TYPE_FLOAT) {
+        kargs_.beta.argFloat = beta.argFloat;
+    }
+    else if (kargs_.dtype == TYPE_DOUBLE) {
+        kargs_.beta.argDouble = beta.argDouble;
+    }
+    else if (kargs_.dtype == TYPE_COMPLEX_FLOAT) {
+        kargs_.beta.argFloatComplex = beta.argFloatComplex;
+    }
+    else if (kargs_.dtype == TYPE_COMPLEX_DOUBLE) {
+        kargs_.beta.argDoubleComplex = beta.argDoubleComplex;
+    }
+}
+
+// Sets the leading dimension of matrix A.
+void
+Config::setLDA(size_t lda)
+{
+    kargs_.lda.matrix = lda;
+}
+
+// Sets the leading dimension of matrix B.
+void
+Config::setLDB(size_t ldb)
+{
+    kargs_.ldb.matrix = ldb;
+}
+
+// Sets the leading dimension of matrix C.
+void
+Config::setLDC(size_t ldc)
+{
+    kargs_.ldc.matrix = ldc;
+}
+
+// Sets the increment of vector X; it is kept in the 'vector' member of ldb.
+// NOTE(review): presumably shares storage with the leading dimension of B
+// set by setLDB() - confirm against the CLBlasKargs declaration.
+void
+Config::setIncX(int incx)
+{
+    kargs_.ldb.vector = incx;
+}
+
+// Sets the increment of vector Y; it is kept in the 'vector' member of ldc.
+// NOTE(review): presumably shares storage with the leading dimension of C
+// set by setLDC() - confirm against the CLBlasKargs declaration.
+void
+Config::setIncY(int incy)
+{
+    kargs_.ldc.vector = incy;
+}
+
+// Sets the start offset in the buffer of matrix A.
+void
+Config::setOffA(size_t offA)
+{
+    kargs_.offA = offA;
+}
+
+// Sets the start offset in the buffer of matrix B or vector X.
+void
+Config::setOffBX(size_t offBX)
+{
+    kargs_.offBX = offBX;
+}
+
+// Sets the start offset in the buffer of matrix C or vector Y.
+void
+Config::setOffCY(size_t offCY)
+{
+    kargs_.offCY = offCY;
+}
+
+// Sets the "multikernel" flag reported by permitMultiKernels().
+void
+Config::setMultiKernel(bool multiKernel)
+{
+    multiKernel_ = multiKernel;
+}
+
+// Requests skipping of the accuracy check; there is no way to revert it.
+void
+Config::setSkipAccuracy(void)
+{
+    skipAccuracy_ = true;
+}
+
+/*
+ * Store a user supplied decomposition.
+ *
+ * x0, y0, bwidth0  Values for the first subproblem dimension entry.
+ * x1, y1, bwidth1  Values for the second subproblem dimension entry.
+ *
+ * After the call decomposition() reports the stored values.
+ */
+void
+Config::setDecomposition(
+    size_t x0,
+    size_t y0,
+    size_t bwidth0,
+    size_t x1,
+    size_t y1,
+    size_t bwidth1)
+{
+    SubproblemDim& first = subdims_[0];
+    SubproblemDim& second = subdims_[1];
+
+    first.x = x0;
+    first.y = y0;
+    first.bwidth = bwidth0;
+
+    second.x = x1;
+    second.y = y1;
+    second.bwidth = bwidth1;
+
+    hasSubdims_ = true;
+}
diff --git a/src/library/tools/ktest/config.h b/src/library/tools/ktest/config.h
new file mode 100644
index 0000000..acdcab2
--- /dev/null
+++ b/src/library/tools/ktest/config.h
@@ -0,0 +1,128 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_CONFIG_H__
+#define KTEST_CONFIG_H__
+
+#include <string>
+#include <map>
+#include <boost/program_options.hpp>
+
+#include <clBLAS.h>
+#include <clblas-internal.h>
+#include <blas_funcs.h>
+
+#include "ktest-common.h"
+
+namespace po = boost::program_options;
+
+namespace clMath {
+
+// Associates a BLAS function identifier with a string.
+typedef std::map<BlasFunctionID, std::string> NameMap;
+
+
+// Collects the ktest settings: file names, data pattern, OpenCL platform
+// and device, build options, the BLAS function with its kernel arguments,
+// and an optional decomposition. Values can be set one by one through the
+// setters or taken from the command line / a configuration file.
+class Config {
+private:
+    // File names and the data pattern used to fill the test data.
+    std::string defaultConfig_;
+    std::string cpp_;
+    std::string cl_;
+    clMath::KTestMatrixGenerator dataPattern_;
+
+    // OpenCL environment.
+    cl_platform_id platform_;
+    cl_device_id device_;
+    std::string buildOptions_;
+
+    // BLAS function selection and its arguments.
+	BlasFunctionID funcID_;
+	CLBlasKargs kargs_;
+    SubproblemDim subdims_[MAX_SUBDIMS];
+    bool hasFuncID_;
+    // Set by setDecomposition().
+    bool hasSubdims_;
+    // Set by setMultiKernel().
+    bool multiKernel_;
+    // Set by setSkipAccuracy().
+    bool skipAccuracy_;
+    po::variables_map vm;
+
+    NameMap names_;
+
+    // Option handling helpers (boost::program_options based).
+    void setOptDesc(po::options_description& opts, bool useDefaults);
+    bool applyOptions(const po::variables_map& vm, bool stopOnError = true);
+
+    // Parsers for individual option values.
+    bool parseGroupSizeOpt(const std::string& opt);
+    bool parseDecompositionOpt(const std::string& opt);
+    bool parseArgMultiplier(const std::string& opt, ArgMultiplier& v);
+
+public:
+	Config();
+	~Config();
+
+    // Read accessors.
+    const std::string& cpp() const;
+    const std::string& cl() const;
+    clMath::KTestMatrixGenerator dataPattern() const;
+
+    // Note: platform and device are returned as strings although they are
+    // stored as OpenCL handles.
+    std::string platform() const;
+    std::string device() const;
+    const std::string& buildOptions() const;
+    void kargs(CLBlasKargs *kargs) const;
+    bool permitMultiKernels() const;
+    bool withAccuracy() const;
+    bool decomposition(SubproblemDim subdims[MAX_SUBDIMS]) const;
+    BlasFunctionID blasFunctionID() const;
+
+    void setDefaultConfig(const std::string& filename);
+
+    void setCpp(const std::string& name);
+    void setCl(const std::string& name);
+    bool setDataPattern(const std::string& name);
+
+    bool setPlatform(const std::string& name);
+    bool setDevice(const std::string& name);
+    void setBuildOptions(const std::string& options);
+
+	bool setFunction(const std::string& name);
+
+    // Setters for the individual BLAS kernel arguments.
+    void setOrder(clblasOrder order);
+	void setSide(clblasSide side);
+	void setUplo(clblasUplo uplo);
+	void setTransA(clblasTranspose transA);
+	void setTransB(clblasTranspose transB);
+	void setDiag(clblasDiag diag);
+    void setM(size_t M);
+    void setN(size_t N);
+    void setK(size_t K);
+    // setAlpha()/setBeta() copy only the member matching the current
+    // data type of the kernel arguments.
+    void setAlpha(ArgMultiplier alpha);
+    void setBeta(ArgMultiplier beta);
+    void setLDA(size_t lda);
+    void setLDB(size_t ldb);
+    void setLDC(size_t ldc);
+    void setOffA(size_t offA);
+    void setOffBX(size_t offBX);
+    void setOffCY(size_t offCY);
+    // setIncX()/setIncY() write into the same kernel argument fields as
+    // setLDB()/setLDC() respectively.
+    void setIncX(int incx);
+    void setIncY(int incy);
+
+    void setMultiKernel(bool multiKernel);
+    void setSkipAccuracy();
+    void setDecomposition(size_t x0, size_t y0, size_t bwidth0,
+        size_t x1, size_t y1, size_t bwidth1);
+
+    // Entry points for bulk configuration and validation.
+    bool parseCommandLine(int argc, char *argv[]);
+    bool loadConfig(const char* filename);
+    bool isSane();
+};
+
+}   // namespace clMath
+
+#endif	// KTEST_CONFIG_H__
diff --git a/src/library/tools/ktest/ktest-common.h b/src/library/tools/ktest/ktest-common.h
new file mode 100644
index 0000000..9343b86
--- /dev/null
+++ b/src/library/tools/ktest/ktest-common.h
@@ -0,0 +1,32 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_COMMON_H_
+#define KTEST_COMMON_H_
+
+namespace clMath {
+
+// Data patterns available for filling test matrices and vectors.
+enum KTestMatrixGenerator {
+    RANDOM_MATRIX = 0,
+    UNIT_MATRIX,
+    SAWTOOTH_MATRIX,
+
+    // Number of patterns above; not a pattern itself.
+    N_MATRIX_GENERATORS
+};
+
+}   // namespace clMath
+
+#endif /* KTEST_COMMON_H_ */
diff --git a/src/library/tools/ktest/ktest-patterns.h b/src/library/tools/ktest/ktest-patterns.h
new file mode 100644
index 0000000..a178aee
--- /dev/null
+++ b/src/library/tools/ktest/ktest-patterns.h
@@ -0,0 +1,435 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_PATTERNS_H_
+#define KTEST_PATTERNS_H_
+
+// Source text of the loadFile() helper that is written into the generated
+// test program. The helper reads a whole file into a calloc()'ed,
+// NUL-terminated buffer which the caller has to free(). It returns NULL
+// when the file cannot be opened, sized or allocated, and also when
+// fread() delivers no data (so an empty file yields NULL as well).
+static std::string loadFileCode =
+"char*\n"
+"loadFile(const char* path)\n"
+"{\n"
+"    FILE *f;\n"
+"    long size;\n"
+"    char *text;\n"
+"\n"
+"    f = fopen(path, \"r\");\n"
+"    if (f == NULL) {\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    if (fseek(f, 0, SEEK_END) != 0) {\n"
+"        fclose(f);\n"
+"        return NULL;\n"
+"    }\n"
+"    size = ftell(f);\n"
+"    if (size == -1) {\n"
+"        fclose(f);\n"
+"        return NULL;\n"
+"    }\n"
+"    if (fseek(f, 0, SEEK_SET) != 0) {\n"
+"        fclose(f);\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    text = (char*)calloc(size + 1, 1);\n"
+"    if (text == NULL) {\n"
+"        fclose(f);\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    if (fread(text, 1, size, f) == 0) {\n"
+"        free(text);\n"
+"        fclose(f);\n"
+"        return NULL;\n"
+"    }\n"
+"    fclose(f);\n"
+"    return text;\n"
+"}\n";
+
+// Source text of randomVector<T>() for the generated test program: it
+// assigns random<T>() to each of the N elements reached through a
+// VectorAccessor built from X, N and incx.
+static std::string randomVectorCode =
+"template<typename T>\n"
+"void\n"
+"randomVector(\n"
+"    size_t N,\n"
+"    T *X,\n"
+"    int incx)\n"
+"{\n"
+"    size_t n;\n"
+"    VectorAccessor<T, int> x(X, N, incx);\n"
+"\n"
+"    for (n = 0; n < N; n++) {\n"
+"        x[n] = random<T>();\n"
+"    }\n"
+"}\n";
+
+// Source text of unitVector<T>() for the generated test program: it
+// assigns ONE<T>() to each of the N elements reached through a
+// VectorAccessor built from X, N and incx.
+static std::string unitVectorCode =
+"template<typename T>\n"
+"void\n"
+"unitVector(\n"
+"    size_t N,\n"
+"    T *X,\n"
+"    int incx)\n"
+"{\n"
+"    size_t n;\n"
+"    VectorAccessor<T, int> x(X, N, incx);\n"
+"\n"
+"    for (n = 0; n < N; n++) {\n"
+"        x[n] = ONE<T>();\n"
+"    }\n"
+"}\n";
+
+// Source text of sawtoothVector<T>() for the generated test program: the
+// first element receives ONE<T>() and every following element receives
+// the previous value plus ONE<T>().
+static std::string sawtoothVectorCode =
+"template<typename T>\n"
+"void\n"
+"sawtoothVector(\n"
+"    size_t N,\n"
+"    T *X,\n"
+"    int incx)\n"
+"{\n"
+"    T v;\n"
+"    size_t n;\n"
+"    VectorAccessor<T, int> x(X, N, incx);\n"
+"\n"
+"    v = ONE<T>();\n"
+"    for (n = 0; n < N; n++) {\n"
+"        x[n] = v;\n"
+"        v = v + ONE<T>();\n"
+"    }\n"
+"}\n";
+
+// Source text of compareVectors<T>() for the generated test program. The
+// emitted function walks both vectors with the same increment and returns
+// false at the first pair of elements that differ. The comparison is
+// exact (operator !=); a pair in which both values are NaN according to
+// isNAN() is treated as matching.
+static std::string compareVectorsCode =
+"template<typename T>\n"
+"bool\n"
+"compareVectors(\n"
+"    size_t N,\n"
+"    T *blasVector,\n"
+"    T *naiveVector,\n"
+"    int incx)\n"
+"{\n"
+"    size_t n;\n"
+"    VectorAccessor<T, int> blas(blasVector, N, incx);\n"
+"    VectorAccessor<T, int> naive(naiveVector, N, incx);\n"
+"    T blasVal, naiveVal;\n"
+"\n"
+"    for (n = 0; n < N; n++) {\n"
+"        blasVal = blas[n];\n"
+"        naiveVal = naive[n];\n"
+"        if (isNAN(blasVal) && isNAN(naiveVal)) {\n"
+"            continue;\n"
+"        }\n"
+"        if (blasVal != naiveVal) {\n"
+"            return false;\n"
+"        }\n"
+"    }\n"
+"    return true;\n"
+"}\n";
+
+// Source text of compareMatrices<T>() for the generated test program. The
+// emitted function visits every (row, column) position of both matrices,
+// which share order, size and leading dimension, and returns false at the
+// first pair of elements that differ. The comparison is exact
+// (operator !=); a pair in which both values are NaN according to isNAN()
+// is treated as matching.
+static std::string compareMatricesCode =
+"template<typename T>\n"
+"bool\n"
+"compareMatrices(\n"
+"    clblasOrder order,\n"
+"    size_t rows,\n"
+"    size_t columns,\n"
+"    T *blasMatrix,\n"
+"    T *naiveMatrix,\n"
+"    size_t ld)\n"
+"{\n"
+"    size_t r, c;\n"
+"    MatrixAccessor<T> blas(blasMatrix, order, clblasNoTrans, rows, columns, ld);\n"
+"    MatrixAccessor<T> naive(naiveMatrix, order, clblasNoTrans, rows, columns, ld);\n"
+"    T blasVal, naiveVal;\n"
+"\n"
+"    for (r = 0; r < rows; r++) {\n"
+"        for (c = 0; c < columns; c++) {\n"
+"            blasVal = blas[r][c];\n"
+"            naiveVal = naive[r][c];\n"
+"            if (isNAN(blasVal) && isNAN(naiveVal)) {\n"
+"                continue;\n"
+"            }\n"
+"            if (blasVal != naiveVal) {\n"
+"                return false;\n"
+"            }\n"
+"        }\n"
+"    }\n"
+"    return true;\n"
+"}\n";
+
+// Source text of randomMatrix<T>() for the generated test program: every
+// element of the rows x columns matrix A receives random<T>().
+static std::string randomMatrixCode =
+"\n"
+"template<typename T>\n"
+"void\n"
+"randomMatrix(\n"
+"    clblasOrder order,\n"
+"    size_t rows,\n"
+"    size_t columns,\n"
+"    T *A,\n"
+"    size_t lda)\n"
+"{\n"
+"    size_t r, c;\n"
+"    MatrixAccessor<T> a(A, order, clblasNoTrans, rows, columns, lda);\n"
+"\n"
+"    for (r = 0; r < rows; r++) {\n"
+"        for (c = 0; c < columns; c++) {\n"
+"            a[r][c] = random<T>();\n"
+"        }\n"
+"    }\n"
+"}\n";
+
+// Source text of unitMatrix<T>() for the generated test program: every
+// element of the rows x columns matrix A (not only the diagonal) receives
+// ONE<T>().
+static std::string unitMatrixCode =
+"\n"
+"template<typename T>\n"
+"void\n"
+"unitMatrix(\n"
+"    clblasOrder order,\n"
+"    size_t rows,\n"
+"    size_t columns,\n"
+"    T *A,\n"
+"    size_t lda)\n"
+"{\n"
+"    size_t r, c;\n"
+"    MatrixAccessor<T> a(A, order, clblasNoTrans, rows, columns, lda);\n"
+"\n"
+"    for (r = 0; r < rows; r++) {\n"
+"        for (c = 0; c < columns; c++) {\n"
+"            a[r][c] = ONE<T>();\n"
+"        }\n"
+"    }\n"
+"}\n";
+
+// Source text of sawtoothMatrix<T>() for the generated test program. All
+// elements of one row share a single value; the value starts at ONE<T>()
+// and grows by ONE<T>() every `step` rows, where step is the integer part
+// of sqrt(rows).
+//
+// The sqrt() argument is cast to double and the result back to size_t:
+// calling sqrt() with a size_t argument is an ambiguous overload for
+// compilers without the C++11 integral overloads, and the conversion of
+// the result was implicit.
+static std::string sawtoothMatrixCode =
+"\n"
+"template<typename T>\n"
+"void\n"
+"sawtoothMatrix(\n"
+"    clblasOrder order,\n"
+"    size_t rows,\n"
+"    size_t columns,\n"
+"    T *A,\n"
+"    size_t lda)\n"
+"{\n"
+"    size_t step;\n"
+"    T v;\n"
+"    size_t r, c;\n"
+"    MatrixAccessor<T> a(A, order, clblasNoTrans, rows, columns, lda);\n"
+"\n"
+"    step = (size_t)sqrt((double)rows);\n"
+"    v = ONE<T>();\n"
+"\n"
+"    for (r = 0; r < rows; r++) {\n"
+"        if ((r != 0) && (r % step == 0)) {\n"
+"            v = v + ONE<T>();\n"
+"        }\n"
+"        for (c = 0; c < columns; c++) {\n"
+"            a[r][c] = v;\n"
+"        }\n"
+"    }\n"
+"}\n";
+
+// Source text of setUpTRSMDiagonal<T>() for the generated test program.
+// For a non-unit diagonal the emitted function resets the diagonal of A
+// to ONE<T>() and then doubles randomly chosen diagonal elements while
+// halving a budget that starts at UPPER_BOUND<T>(). Finally it calls
+// NaiveBlas::trmm() with the arguments it was given.
+//
+// The doubling loop is skipped when the order `k` of the triangular
+// matrix is zero, because `rand() % k` would otherwise divide by zero.
+static std::string setUpTRSMDiagonalCode =
+"template<typename T>\n"
+"void\n"
+"setUpTRSMDiagonal(\n"
+"    clblasOrder order,\n"
+"    clblasSide side,\n"
+"    clblasUplo uplo,\n"
+"    clblasTranspose transA,\n"
+"    clblasDiag diag,\n"
+"    size_t M,\n"
+"    size_t N,\n"
+"    T alpha,\n"
+"    T *A,\n"
+"    size_t lda,\n"
+"    T *B,\n"
+"    size_t ldb)\n"
+"{\n"
+"    size_t sizeA = (side == clblasRight) ? N : M;\n"
+"\n"
+"    if (diag == clblasNonUnit) {\n"
+"        size_t k = side == clblasLeft ? M : N;\n"
+"        MatrixAccessor<T> a(A, order, clblasNoTrans, k, k, lda);\n"
+"        for (cl_uint i = 0; i < sizeA; i++) {\n"
+"            a[i][i] = ONE<T>();\n"
+"        }\n"
+"        double ub = UPPER_BOUND<T>();\n"
+"        while ((k != 0) && (ub >= 1)) {\n"
+"            size_t i = rand() % k;\n"
+"            a[i][i] = a[i][i] * TWO<T>();\n"
+"            ub /= 2;\n"
+"        }\n"
+"    }\n"
+"    NaiveBlas::trmm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb);\n"
+"}\n"
+"\n";
+
+// Prototypes of the auxiliary functions whose definitions are written to
+// the generated test program by KTest::auxFunctions().
+static std::string forwardDeclarationsCode =
+"cl_platform_id getPlatform(const char *name);\n"
+"cl_device_id getDevice(cl_platform_id platform, const char *name);\n"
+"cl_kernel createKernel(const char *source, cl_context context,\n"
+"    const char* options, cl_int *error);\n"
+"void printExecTime(cl_ulong ns);\n";
+
+// Source text of getPlatform() for the generated test program. The
+// emitted function lists the available OpenCL platforms and returns the
+// first one whose CL_PLATFORM_NAME equals `name` exactly, or NULL when
+// none matches or any step fails. Platform names are read into a 64-byte
+// buffer; a platform whose name query does not succeed is skipped.
+static std::string getPlatformCode =
+"cl_platform_id\n"
+"getPlatform(const char *name)\n"
+"{\n"
+"    cl_int err;\n"
+"    cl_uint nrPlatforms, i;\n"
+"    cl_platform_id *list, platform;\n"
+"    char platformName[64];\n"
+"\n"
+"    err = clGetPlatformIDs(0, NULL, &nrPlatforms);\n"
+"    if (err != CL_SUCCESS) {\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    list = (cl_platform_id*)calloc(nrPlatforms, sizeof(*list));\n"
+"    if (list == NULL) {\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    err = clGetPlatformIDs(nrPlatforms, list, NULL);\n"
+"    if (err != CL_SUCCESS) {\n"
+"        free(list);\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    platform = NULL;\n"
+"    for (i = 0; i < nrPlatforms; i++) {\n"
+"        err = clGetPlatformInfo(list[i], CL_PLATFORM_NAME,\n"
+"            sizeof(platformName), platformName, NULL);\n"
+"        if ((err == CL_SUCCESS) && (strcmp(platformName, name) == 0)) {\n"
+"            platform = list[i];\n"
+"            break;\n"
+"        }\n"
+"    }\n"
+"\n"
+"    free(list);\n"
+"    return platform;\n"
+"}\n";
+
+// Source text of getDevice() for the generated test program. The emitted
+// function lists all devices (CL_DEVICE_TYPE_ALL) of `platform` and
+// returns the first one whose CL_DEVICE_NAME equals `name` exactly, or
+// NULL when none matches or any step fails. Device names are read into a
+// 64-byte buffer; a device whose name query does not succeed is skipped.
+static std::string getDeviceCode =
+"cl_device_id\n"
+"getDevice(\n"
+"    cl_platform_id platform,\n"
+"    const char *name)\n"
+"{\n"
+"\n"
+"    cl_int err;\n"
+"    cl_uint nrDevices, i;\n"
+"    cl_device_id *list, device;\n"
+"    char deviceName[64];\n"
+"\n"
+"    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &nrDevices);\n"
+"    if (err != CL_SUCCESS) {\n"
+"        return NULL;\n"
+"    }\n"
+"    list = (cl_device_id*)calloc(nrDevices, sizeof(*list));\n"
+"    if (list == NULL) {\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, nrDevices, list, NULL);\n"
+"    if (err != CL_SUCCESS) {\n"
+"        free(list);\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    device = NULL;\n"
+"    for (i = 0; i < nrDevices; i++) {\n"
+"        err = clGetDeviceInfo(list[i], CL_DEVICE_NAME,\n"
+"            sizeof(deviceName), deviceName, NULL);\n"
+"        if ((err == CL_SUCCESS) && (strcmp(deviceName, name) == 0)) {\n"
+"            device = list[i];\n"
+"            break;\n"
+"        }\n"
+"    }\n"
+"\n"
+"    free(list);\n"
+"    return device;\n"
+"}\n";
+
+// Source text of createKernel() for the generated test program. The
+// emitted function queries one device from `context`, builds `source`
+// with `options` for that device and returns the single kernel obtained
+// from clCreateKernelsInProgram(). On failure it returns NULL and, when
+// `error` is not NULL, stores the OpenCL status there. When the build
+// fails, the build log is printed to stdout.
+//
+// The build-log buffer is checked for NULL before it is filled and
+// printed; previously a failed calloc() resulted in printf("%s", NULL).
+static std::string createKernelCode =
+"cl_kernel\n"
+"createKernel(\n"
+"    const char* source,\n"
+"    cl_context context,\n"
+"    const char* options,\n"
+"    cl_int* error)\n"
+"{\n"
+"\n"
+"    cl_int err;\n"
+"    cl_device_id device;\n"
+"    cl_program program;\n"
+"    cl_kernel kernel;\n"
+"    size_t logSize;\n"
+"    char *log;\n"
+"\n"
+"    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device), &device, NULL);\n"
+"    if (err != CL_SUCCESS) {\n"
+"        if (error != NULL) {\n"
+"            *error = err;\n"
+"        }\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    program = clCreateProgramWithSource(context, 1, &source, NULL, error);\n"
+"    if (program == NULL) {\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    err = clBuildProgram(program, 1, &device, options, NULL, NULL);\n"
+"    if (err != CL_SUCCESS) {\n"
+"        logSize = 0;\n"
+"        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);\n"
+"        log = (char*)calloc(1, logSize + 1);\n"
+"        if (log != NULL) {\n"
+"            clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);\n"
+"            printf(\"=== Build log ===\\n%s\\n\", log);\n"
+"            free(log);\n"
+"        }\n"
+"        clReleaseProgram(program);\n"
+"        if (error != NULL) {\n"
+"            *error = err;\n"
+"        }\n"
+"        return NULL;\n"
+"    }\n"
+"\n"
+"    kernel = NULL;\n"
+"    err = clCreateKernelsInProgram(program, 1, &kernel, NULL);\n"
+"    clReleaseProgram(program);\n"
+"    if (error != NULL) {\n"
+"        *error = err;\n"
+"    }\n"
+"    return kernel;\n"
+"}\n";
+
+// Source text of printExecTime() for the generated test program. The
+// emitted function prints the kernel execution time given in nanoseconds,
+// switching to microseconds above 10000 ns and to milliseconds above
+// 10000000 ns.
+//
+// cl_ulong is a 64-bit type, whereas "%lu" expects unsigned long, which
+// is only 32 bits wide on some targets (e.g. 64-bit Windows). The value
+// is therefore cast to unsigned long long and printed with "%llu".
+static std::string printTimeCode =
+"void\n"
+"printExecTime(cl_ulong ns)\n"
+"{\n"
+"    if (ns > 10000000) {\n"
+"        printf(\"Kernel execution time: %llu milliseconds\\n\", (unsigned long long)(ns / 1000000));\n"
+"    }\n"
+"    else if (ns > 10000) {\n"
+"        printf(\"Kernel execution time: %llu microseconds\\n\", (unsigned long long)(ns / 1000));\n"
+"    }\n"
+"    else {\n"
+"        printf(\"Kernel execution time: %llu nanoseconds\\n\", (unsigned long long)ns);\n"
+"    }\n"
+"}\n";
+
+#endif /* KTEST_PATTERNS_H_ */
diff --git a/src/library/tools/ktest/ktest.cpp b/src/library/tools/ktest/ktest.cpp
new file mode 100644
index 0000000..6f668db
--- /dev/null
+++ b/src/library/tools/ktest/ktest.cpp
@@ -0,0 +1,708 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <blas_funcs.h>
+#include "var.h"
+#include "ktest.h"
+#include "ktest-patterns.h"
+
+using namespace clMath;
+
+// Creates a generator for a test program that runs a single kernel
+// described by `step`. Platform, device, kernel source file, build
+// options and data pattern are taken from `cfg`.
+//
+// steps_ is set to NULL explicitly: this mode has no step list, and the
+// pointer used to be left uninitialized.
+KTest::KTest(Step *step, clMath::Config *cfg) :
+    platform_(cfg->platform()), device_(cfg->device()),
+    kernelSourceFile_(cfg->cl()), buildOptions_(cfg->buildOptions()),
+    matrixGen_(cfg->dataPattern()),
+    masterStep_(step), steps_(NULL), indent_(0), useSeveralKernels_(false)
+{
+}
+
+// Creates a generator for a test program that runs one kernel per entry
+// of `steps`, with `masterStep` serving as the master step. Platform,
+// device, kernel source file, build options and data pattern are taken
+// from `cfg`.
+KTest::KTest(Step *masterStep, std::vector<Step*> *steps, clMath::Config *cfg) :
+    platform_(cfg->platform()),
+    device_(cfg->device()),
+    kernelSourceFile_(cfg->cl()),
+    buildOptions_(cfg->buildOptions()),
+    matrixGen_(cfg->dataPattern()),
+    masterStep_(masterStep),
+    steps_(steps),
+    indent_(0),
+    useSeveralKernels_(true)
+{
+}
+
+// Returns a string made of indent_ space characters, i.e. the current
+// indentation prefix for emitted lines.
+std::string
+KTest::indent()
+{
+    return std::string(indent_, ' ');
+}
+
+// Maps a data pattern to the name of the matching matrix generator
+// function of the generated program; yields NULL for any other value.
+const char*
+KTest::matrixGenName(KTestMatrixGenerator gen)
+{
+    if (gen == RANDOM_MATRIX) {
+        return "randomMatrix";
+    }
+    if (gen == UNIT_MATRIX) {
+        return "unitMatrix";
+    }
+    if (gen == SAWTOOTH_MATRIX) {
+        return "sawtoothMatrix";
+    }
+    return NULL;
+}
+
+// Maps a data pattern to the name of the matching vector generator
+// function of the generated program; yields NULL for any other value.
+const char*
+KTest::vectorGenName(KTestMatrixGenerator gen)
+{
+    if (gen == RANDOM_MATRIX) {
+        return "randomVector";
+    }
+    if (gen == UNIT_MATRIX) {
+        return "unitVector";
+    }
+    if (gen == SAWTOOTH_MATRIX) {
+        return "sawtoothVector";
+    }
+    return NULL;
+}
+
+// Produces the complete source text of the test program and returns it.
+//
+// The text is assembled in this order: preprocessor prologue, the naive
+// BLAS include, the TRSM diagonal helper (TRSM only), data generators
+// for the selected pattern, comparison helper (only if `withAccuracy`),
+// option/variable declarations, forward declarations, main() and finally
+// the definitions of the auxiliary functions.
+std::string
+KTest::generate(bool withAccuracy)
+{
+    static const char* const standardHeaders[] = {
+        "#include <assert.h>",
+        "#include <math.h>",
+        "#include <stdio.h>",
+        "#include <stdlib.h>",
+        "#include <string.h>",
+        "#include <time.h>",
+        "#include <string>"
+    };
+    const size_t nrHeaders =
+        sizeof(standardHeaders) / sizeof(standardHeaders[0]);
+    const bool isTrsm = (masterStep_->blasFunctionID() == CLBLAS_TRSM);
+    std::stringstream out;
+
+    // Preprocessor prologue
+    out << indent() << "#define _CRT_SECURE_NO_WARNINGS" << std::endl;
+    out << std::endl;
+    for (size_t i = 0; i < nrHeaders; i++) {
+        out << indent() << standardHeaders[i] << std::endl;
+    }
+    if (isTrsm) {
+        out << indent() << "#include <math.h>" << std::endl;
+        out << indent() << "#define NANF NAN" << std::endl;
+    }
+
+    includes(out);
+    out << std::endl;
+    out << indent() << "#include \"naive_blas.cpp\"" << std::endl;
+    out << std::endl;
+    out << indent() << "using namespace NaiveBlas;" << std::endl;
+
+    if (isTrsm) {
+        setUpTRSMDiagonal(out);
+    }
+
+    // Vector helpers are emitted for level 2 functions only
+    const int blasLevel = funcBlasLevel(masterStep_->blasFunctionID());
+    const bool needsVectors = (blasLevel == 2);
+
+    switch (matrixGen_) {
+    case RANDOM_MATRIX:
+        if (needsVectors) {
+            randomVector(out);
+        }
+        randomMatrix(out);
+        break;
+    case UNIT_MATRIX:
+        if (needsVectors) {
+            unitVector(out);
+        }
+        unitMatrix(out);
+        break;
+    case SAWTOOTH_MATRIX:
+        if (needsVectors) {
+            sawtoothVector(out);
+        }
+        sawtoothMatrix(out);
+        break;
+    default:
+        break;
+    }
+
+    if (withAccuracy) {
+        if (needsVectors) {
+            compareVectors(out);
+        }
+        else {
+            compareMatrices(out);
+        }
+    }
+
+    declareKTestOptions(out);
+    declareBlasOptions(out, masterStep_);
+    declarePatternVars(out, masterStep_);
+
+    out << std::endl;
+    out << indent() << "char* loadFile(const char* path);" << std::endl;
+    forwardDeclarations(out);
+
+    generateMain(out, withAccuracy);
+
+    loadFile(out);
+    auxFunctions(out);
+
+    return out.str();
+}
+
+// Returns `text` with every backslash and double quote preceded by a
+// backslash, so that it can be placed inside a C string literal.
+static std::string
+escapeForCLiteral(const std::string& text)
+{
+    std::string escaped;
+
+    escaped.reserve(text.size());
+    for (std::string::size_type i = 0; i < text.size(); i++) {
+        const char c = text[i];
+        if ((c == '\\') || (c == '"')) {
+            escaped += '\\';
+        }
+        escaped += c;
+    }
+    return escaped;
+}
+
+// Emits the PLATFORM_NAME, DEVICE_NAME, BUILD_OPTIONS and KERNEL_SOURCE
+// constants of the generated program.
+//
+// The values are escaped before being written between the quotes;
+// otherwise a backslash (e.g. in a Windows path of the kernel source
+// file) or a double quote (e.g. in a -D build option) would produce a
+// wrong or non-compilable string literal in the generated source.
+void
+KTest::declareKTestOptions(std::stringstream& ss)
+{
+    ss << std::endl;
+
+    ss << indent() << "const char PLATFORM_NAME[] = \""
+       << escapeForCLiteral(platform_) << "\";" << std::endl;
+    ss << indent() << "const char DEVICE_NAME[] = \""
+       << escapeForCLiteral(device_) << "\";" << std::endl;
+    ss << indent() << "const char BUILD_OPTIONS[] = \""
+       << escapeForCLiteral(buildOptions_) << "\";" << std::endl;
+    ss << indent() << "const char KERNEL_SOURCE[] = \""
+       << escapeForCLiteral(kernelSourceFile_) << "\";" << std::endl;
+}
+
+// Emits the constants `order`, `side`, `uplo`, `transA`, `transB` and
+// `diag` of the generated program, initialized with the enumerator names
+// that correspond to the kernel arguments of `step`.
+//
+// For order, side, uplo and diag any value other than the one tested
+// explicitly selects the alternative name. For transA/transB a value
+// outside the three known enumerators results in an empty initializer.
+void
+KTest::declareBlasOptions(std::stringstream& ss, Step *step)
+{
+    const char *orderName = "clblasRowMajor";
+    const char *sideName = "clblasLeft";
+    const char *uploName = "clblasLower";
+    const char *diagName = "clblasNonUnit";
+    const char *transAName = "";
+    const char *transBName = "";
+
+    if (step->kargs().order == clblasColumnMajor) {
+        orderName = "clblasColumnMajor";
+    }
+    if (step->kargs().side == clblasRight) {
+        sideName = "clblasRight";
+    }
+    if (step->kargs().uplo == clblasUpper) {
+        uploName = "clblasUpper";
+    }
+    if (step->kargs().diag == clblasUnit) {
+        diagName = "clblasUnit";
+    }
+
+    switch (step->kargs().transA) {
+    case clblasNoTrans:
+        transAName = "clblasNoTrans";
+        break;
+    case clblasTrans:
+        transAName = "clblasTrans";
+        break;
+    case clblasConjTrans:
+        transAName = "clblasConjTrans";
+        break;
+    }
+    switch (step->kargs().transB) {
+    case clblasNoTrans:
+        transBName = "clblasNoTrans";
+        break;
+    case clblasTrans:
+        transBName = "clblasTrans";
+        break;
+    case clblasConjTrans:
+        transBName = "clblasConjTrans";
+        break;
+    }
+
+    ss << std::endl;
+    ss << indent() << "const clblasOrder order = " << orderName
+       << ";" << std::endl;
+    ss << indent() << "const clblasSide side = " << sideName
+       << ";" << std::endl;
+    ss << indent() << "const clblasUplo uplo = " << uploName
+       << ";" << std::endl;
+    ss << indent() << "const clblasTranspose transA = " << transAName
+       << ";" << std::endl;
+    ss << indent() << "const clblasTranspose transB = " << transBName
+       << ";" << std::endl;
+    ss << indent() << "const clblasDiag diag = " << diagName
+       << ";" << std::endl;
+}
+
+// Emits one declaration per variable of `step`: its plain variables
+// first, its array variables after them. A declaration has the form
+// `[const ]<type> <name>[ = <default value>];`.
+//
+// Buffer variables are declared for the master step only; for any other
+// step they are skipped because the master step buffers are used.
+void
+KTest::declarePatternVars(std::stringstream& ss, Step *step)
+{
+    VarList allVars = step->vars();
+    ArrayVarList arrayVars = step->arrays();
+    VarList::const_iterator it;
+
+    allVars.insert(allVars.end(), arrayVars.begin(), arrayVars.end());
+
+    ss << std::endl;
+    for (it = allVars.begin(); it != allVars.end(); ++it) {
+        Variable *var = *it;
+
+        if (step != masterStep_ && var->isBuffer()) {
+            continue;
+        }
+
+        ss << indent();
+        ss << (var->constant() ? "const " : "");
+        ss << var->type() << " " << var->name();
+        if (!var->defaultValue().empty()) {
+            ss << " = " << var->defaultValue();
+        }
+        ss << ";" << std::endl;
+    }
+}
+
+// Emits the main() function of the generated test program.
+//
+// The emitted main() allocates and fills the host arrays of the master
+// step, optionally calls the naive BLAS implementation (`withAccuracy`),
+// runs either one kernel per step (useSeveralKernels_) or a single kernel
+// for the master step, prints the execution time, optionally compares the
+// results, releases the host arrays and exits with EXIT_SUCCESS.
+void
+KTest::generateMain(std::stringstream& ss, bool withAccuracy)
+{
+    ArrayVarList list;
+    std::map<unsigned int, const Variable*> kargMap = masterStep_->kargMap();
+    std::string size;
+
+    // Signature: the multi-kernel variant takes command line arguments,
+    // which may override the kernel file names (see below).
+    ss << std::endl;
+    ss << indent() << "int" << std::endl;
+    if (useSeveralKernels_) {
+        ss << indent() << "main(int argc, char *argv[])" << std::endl;
+    }
+    else {
+        ss << indent() << "main(void)" << std::endl;
+    }
+
+    ss << indent() << "{" << std::endl;
+    indent_ += 4;
+
+    ss << std::endl
+       << indent() << "char *source;" << std::endl
+       << indent() << "cl_ulong start, end;" << std::endl;
+
+    ss << std::endl
+       << indent() << "srand((unsigned int)time(NULL));" << std::endl;
+
+    mainInit(ss);
+
+    // Allocate every host array; arrays that are not copies of another
+    // array are also filled by the generator of the selected data pattern.
+    ss << std::endl;
+    list = masterStep_->arrays();
+    for (ArrayVarList::const_iterator it = list.begin(); it != list.end(); ++it) {
+        ss << indent() << (*it)->name() << " = (" <<(*it)->type() << ")calloc(";
+        if ((*it)->isMatrix()) {
+            ss << masterStep_->matrixSize((MatrixVariable*)(*it));
+        }
+        else {
+            ss << masterStep_->vectorSize((VectorVariable*)(*it));
+        }
+        ss << ", "
+           << "sizeof(*" << (*it)->name() << "));" << std::endl;
+        ss << indent() << "assert(" << (*it)->name() << " != NULL);" << std::endl;
+        if ((*it)->copyOf() != NULL) {
+            continue;
+        }
+        if ((*it)->isMatrix()) {
+            MatrixVariable *var = (MatrixVariable*)(*it);
+
+            ss << indent() << matrixGenName(matrixGen_) << "(order, "
+               << var->rows()->name() << ", "
+               << var->columns()->name() << ", " << var->matrixPointer() << ", "
+               << var->ld()->name() << ");" << std::endl;
+        }
+        else {
+            VectorVariable *var = (VectorVariable*)(*it);
+            ss << indent() << vectorGenName(matrixGen_) << "("
+               << var->nElems()->name() << ", "
+               << var->vectorPointer() << ", " << var->inc()->name()
+               << ");" << std::endl;
+        }
+    }
+
+    // Statement supplied by the master step, emitted after data generation.
+    ss << indent() << masterStep_->postRandomCall() << ";" << std::endl;
+
+    // Arrays declared as copies are filled from their originals.
+    for (ArrayVarList::const_iterator it = list.begin(); it != list.end(); ++it) {
+        if ((*it)->copyOf() == NULL) {
+            continue;
+        }
+        ss << indent() << "memcpy(" << (*it)->name() << ", "
+           << (*it)->copyOf()->name() << ", (";
+        if ((*it)->isMatrix()) {
+            ss << masterStep_->matrixSize((MatrixVariable*)(*it));
+        }
+        else {
+            ss << masterStep_->vectorSize((VectorVariable*)(*it));
+        }
+        ss << ") * sizeof(*" << (*it)->copyOf()->name() << "));" << std::endl;
+    }
+
+    // Reference result computed by the naive implementation.
+    if (withAccuracy) {
+        ss << std::endl
+           << indent() << "NaiveBlas::" << masterStep_->naiveCall() << ";"
+           << std::endl;
+    }
+
+    allocateWriteBuffers(ss);
+
+    if (useSeveralKernels_) {
+        // One braced scope per step, so that the per-step constants
+        // declared inside do not clash between steps.
+        // NOTE(review): the emitted code assigns `source` once per step
+        // but only one free(source) is emitted at the end of main(); the
+        // earlier buffers look like they are never freed - confirm.
+        for (unsigned int i = 0; i < steps_->size(); i++) {
+            Step *step = (*steps_)[i];
+            ss << indent() << "{" << std::endl;
+            indent_ += 4;
+
+            declareGranulation(ss, step);
+
+            // argv[i + 1], when present, replaces the kernel file name
+            // of step i.
+            ss << indent() << "const char* kernelName = argc > " << i + 1
+                    << " ? argv[" << i + 1 << "] : \""
+                    << step->kernelName() << "\";" << std::endl;
+
+            ss << std::endl
+                    << indent() << "source = loadFile(kernelName);" << std::endl
+                    << indent() << "assert(source != NULL);" << std::endl;
+            buildKernel(ss);
+
+            declareBlasOptions(ss, step);
+            declarePatternVars(ss, step);
+            setKernelArgs(ss, step);
+
+
+            ss << std::endl
+                    << indent() << "start = 0;" << std::endl
+                    << indent() << "end = 0;" << std::endl;
+            execKernel(ss);
+            ss << std::endl
+                    << indent() << "printExecTime(end - start);" << std::endl;
+
+            indent_ -= 4;
+            ss << indent() << "}" << std::endl;
+        }
+    }
+    else {
+        // Single kernel: source file name comes from KERNEL_SOURCE.
+        declareGranulation(ss, masterStep_);
+        ss << std::endl
+                << indent() << "source = loadFile(KERNEL_SOURCE);" << std::endl
+                << indent() << "assert(source != NULL);" << std::endl;
+        buildKernel(ss);
+
+        setKernelArgs(ss, masterStep_);
+
+        ss << std::endl
+                << indent() << "start = 0;" << std::endl
+                << indent() << "end = 0;" << std::endl;
+        execKernel(ss);
+        ss << std::endl
+                << indent() << "printExecTime(end - start);" << std::endl;
+    }
+    // Read the results back and report the outcome of the comparison.
+    if (withAccuracy) {
+        readBuffers(ss);
+
+        ss << std::endl
+           << indent() << "if (" << masterStep_->compareCall() << ") {"
+                                 << std::endl << indent()
+           << "    printf(\"Correctness test passed\\n\");" << std::endl
+           << indent() << "}" << std::endl
+           << indent() << "else {" << std::endl
+           << indent() << "    printf(\"Correctness test failed\\n\");"
+                       << std::endl
+           << indent() << "}" << std::endl
+           << indent() << "fflush(stdout);" << std::endl;
+    }
+
+    mainFinish(ss);
+
+    // Release the host arrays and the kernel source text.
+    ss << std::endl;
+    list = masterStep_->arrays();
+    for (ArrayVarList::const_iterator it = list.begin(); it != list.end(); ++it) {
+        ss << indent() << "free(" << (*it)->name() << ");" << std::endl;
+    }
+    ss << indent() << "free(source);" << std::endl
+       << indent() << "exit(EXIT_SUCCESS);" << std::endl;
+
+    indent_ -= 4;
+    ss << indent() << "}" << std::endl;
+}
+
+// Appends the source of the loadFile() helper, followed by a line break.
+void
+KTest::loadFile(std::stringstream& ss)
+{
+    ss << loadFileCode;
+    ss << std::endl;
+}
+
+// Appends the source of randomVector<T>(), followed by a line break.
+void
+KTest::randomVector(std::stringstream& ss)
+{
+    ss << randomVectorCode;
+    ss << std::endl;
+}
+
+// Appends the source of unitVector<T>(), followed by a line break.
+void
+KTest::unitVector(std::stringstream& ss)
+{
+    ss << unitVectorCode;
+    ss << std::endl;
+}
+
+// Appends the source of sawtoothVector<T>(), followed by a line break.
+void
+KTest::sawtoothVector(std::stringstream& ss)
+{
+    ss << sawtoothVectorCode;
+    ss << std::endl;
+}
+
+// Appends the source of compareVectors<T>(), followed by a line break.
+void
+KTest::compareVectors(std::stringstream& ss)
+{
+    ss << compareVectorsCode;
+    ss << std::endl;
+}
+
+// Appends the source of randomMatrix<T>(), followed by a line break.
+void
+KTest::randomMatrix(std::stringstream& ss)
+{
+    ss << randomMatrixCode;
+    ss << std::endl;
+}
+
+// Appends the source of unitMatrix<T>(), followed by a line break.
+void
+KTest::unitMatrix(std::stringstream& ss)
+{
+    ss << unitMatrixCode;
+    ss << std::endl;
+}
+
+// Appends the source of sawtoothMatrix<T>(), followed by a line break.
+void
+KTest::sawtoothMatrix(std::stringstream& ss)
+{
+    ss << sawtoothMatrixCode;
+    ss << std::endl;
+}
+
+// Appends the source of setUpTRSMDiagonal<T>(), followed by a line break.
+void
+KTest::setUpTRSMDiagonal(std::stringstream& ss)
+{
+    ss << setUpTRSMDiagonalCode;
+    ss << std::endl;
+}
+
+// Appends the source of compareMatrices<T>(), followed by a line break.
+void
+KTest::compareMatrices(std::stringstream& ss)
+{
+    ss << compareMatricesCode;
+    ss << std::endl;
+}
+
+
+
+// Appends an empty line followed by the OpenCL header include directive.
+void
+KTest::includes(std::stringstream& ss)
+{
+    ss << std::endl;
+    ss << indent() << "#include <CL/cl.h>" << std::endl;
+}
+
+// Appends the prototypes of the auxiliary functions, followed by a line
+// break.
+void
+KTest::forwardDeclarations(std::stringstream& ss)
+{
+    ss << forwardDeclarationsCode;
+    ss << std::endl;
+}
+
+// Appends the definitions of the auxiliary functions in this order:
+// getPlatform(), getDevice(), createKernel(), printExecTime().
+void
+KTest::auxFunctions(std::stringstream& ss)
+{
+    this->getPlatform(ss);
+    this->getDevice(ss);
+    this->createKernel(ss);
+    this->printExecTime(ss);
+}
+
+// Appends the source of getPlatform(), followed by a line break.
+void
+KTest::getPlatform(std::stringstream& ss)
+{
+    ss << getPlatformCode;
+    ss << std::endl;
+}
+
+// Appends the source of getDevice(), followed by a line break.
+void
+KTest::getDevice(std::stringstream& ss)
+{
+    ss << getDeviceCode;
+    ss << std::endl;
+}
+
+// Appends the source of createKernel(), followed by a line break.
+void
+KTest::createKernel(std::stringstream& ss)
+{
+    ss << createKernelCode;
+    ss << std::endl;
+}
+
+// Appends the source of printExecTime(). Unlike the other emitters of
+// this file, no additional line break is written after the code.
+void
+KTest::printExecTime(std::stringstream& ss)
+{
+    ss << printTimeCode;
+}
+
+// Emits the constants `workDim`, `localWorkSize[]` and `globalWorkSize[]`
+// of the generated program for `step`. The array lengths and the number
+// of local sizes listed are given by the step's work group dimension;
+// the global sizes are taken as text from step->globalWorkSize().
+void
+KTest::declareGranulation(std::stringstream& ss, Step *step)
+{
+    const char *separator = "";
+
+    ss << std::endl;
+
+    ss << indent() << "const cl_uint workDim = ";
+    ss << step->pgran().wgDim << ";" << std::endl;
+
+    ss << indent() << "const size_t localWorkSize[";
+    ss << step->pgran().wgDim << "] = { ";
+    for (unsigned int dim = 0; dim < step->pgran().wgDim; dim++) {
+        // comma-separated list: no separator before the first item
+        ss << separator << step->pgran().wgSize[dim];
+        separator = ", ";
+    }
+    ss << " };" << std::endl;
+
+    ss << indent() << "const size_t globalWorkSize[";
+    ss << step->pgran().wgDim << "] = { ";
+    ss << step->globalWorkSize() << " };" << std::endl;
+}
+
+/**
+ * Emits the opening part of the generated main(): declarations of the
+ * OpenCL handles, then platform/device lookup and creation of the context
+ * and of a profiling-enabled command queue, each followed by an assert().
+ */
+void
+KTest::mainInit(std::stringstream& ss)
+{
+    // Local declarations of the generated program.
+    ss << std::endl;
+    ss << indent() << "cl_int err;" << std::endl;
+    ss << indent() << "cl_platform_id platform;" << std::endl;
+    ss << indent() << "cl_device_id device;" << std::endl;
+    ss << indent() << "cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };" << std::endl;
+    ss << indent() << "cl_context context;" << std::endl;
+    ss << indent() << "cl_command_queue queue;" << std::endl;
+    ss << indent() << "cl_kernel kernel;" << std::endl;
+    ss << indent() << "cl_event event;" << std::endl;
+
+    // Runtime set-up of the generated program.
+    ss << std::endl;
+    ss << indent() << "platform = getPlatform(PLATFORM_NAME);" << std::endl;
+    ss << indent() << "assert(platform != NULL);" << std::endl;
+    ss << indent() << "device = getDevice(platform, DEVICE_NAME);" << std::endl;
+    ss << indent() << "assert(device != NULL);" << std::endl;
+    ss << indent() << "props[1] = (cl_context_properties)platform;" << std::endl;
+    ss << indent() << "context = clCreateContext(props, 1, &device, NULL, NULL, &err);" << std::endl;
+    ss << indent() << "assert(context != NULL);" << std::endl;
+    ss << indent() << "queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);" << std::endl;
+    ss << indent() << "assert(queue != NULL);" << std::endl;
+}
+
+/**
+ * Emits the statement that builds the kernel through the generated
+ * createKernel() helper, followed by an assert() on the result.
+ */
+void
+KTest::buildKernel(std::stringstream& ss)
+{
+    ss << indent() << "kernel = createKernel(source, context, BUILD_OPTIONS, &err);" << std::endl;
+    ss << indent() << "assert(kernel != NULL);" << std::endl;
+}
+
+/**
+ * Builds the C expression giving the byte size of the host array behind a
+ * buffer variable.
+ *
+ * @param buffer  buffer variable; its hostPtr() designates the host array
+ * @param size    [out] receives the expression. Any previous content is
+ *                discarded, so callers may reuse one string across buffers.
+ */
+void
+KTest::getBufferSizeExpr(Variable *buffer, std::string& size)
+{
+    ArrayVariableInterface *hostPtr = (ArrayVariableInterface*)(buffer->hostPtr());
+    if (hostPtr->isMatrix()) {
+        MatrixVariable *ptrVar = (MatrixVariable*)hostPtr;
+        if (masterStep_->matrixSize(ptrVar).empty()) {
+            // No element-count expression is known: fall back to the size
+            // of one element. Assign (not append) so that text left over
+            // from a previously processed buffer does not leak into it.
+            size = "sizeof(";
+            size += ptrVar->type();
+            size += ")";
+        }
+        else {
+            size = "(";
+            size += masterStep_->matrixSize(ptrVar);
+            size += ") * sizeof(*";
+            size += ptrVar->name();
+            size += ")";
+        }
+    }
+    else {
+        VectorVariable *ptrVar = (VectorVariable*)buffer->hostPtr();
+        size = "(";
+        size += masterStep_->vectorSize(ptrVar);
+        size += ") * sizeof(*";
+        size += ptrVar->name();
+        size += ")";
+    }
+}
+
+/**
+ * Emits, for every buffer of the master step, a clCreateBuffer() call with
+ * an assert() on the result. Buffers whose flags contain CL_MEM_READ_WRITE
+ * or CL_MEM_READ_ONLY additionally get a blocking clEnqueueWriteBuffer()
+ * from their host array.
+ */
+void
+KTest::allocateWriteBuffers(std::stringstream& ss)
+{
+    VarList buffers;
+    std::string sizeExpr;
+
+    ss << std::endl;
+    buffers = masterStep_->buffers();
+
+    VarList::const_iterator it = buffers.begin();
+    while (it != buffers.end()) {
+        getBufferSizeExpr(*it, sizeExpr);
+
+        ss << indent() << (*it)->name() << " = clCreateBuffer(context, ";
+        ss << (*it)->flagsStr() << "," << std::endl;
+        ss << indent() << "    " << sizeExpr << ", NULL, &err);" << std::endl;
+        ss << indent() << "assert(" << (*it)->name() << " != NULL);" << std::endl;
+
+        // Only buffers the kernel may read are filled from host memory.
+        const bool readable = ((*it)->flags() & CL_MEM_READ_WRITE) ||
+                              ((*it)->flags() & CL_MEM_READ_ONLY);
+        if (readable) {
+            ss << indent() << "err = clEnqueueWriteBuffer(queue, ";
+            ss << (*it)->name() << ", CL_TRUE, 0," << std::endl;
+            ss << indent() << "    " << sizeExpr << ", ";
+            ss << ((Variable*)(*it)->hostPtr())->name() << "," << std::endl;
+            ss << indent() << "    0, NULL, NULL);" << std::endl;
+            ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+        }
+        ++it;
+    }
+}
+
+/**
+ * Emits one clSetKernelArg() call, followed by an assert(), for every
+ * entry of the step's kernel-argument map.
+ *
+ * @param ss    stream collecting the generated source
+ * @param step  step whose kargMap() maps argument index to variable
+ */
+void
+KTest::setKernelArgs(std::stringstream& ss, Step *step)
+{
+    std::map<unsigned int, const Variable*> args = step->kargMap();
+
+    ss << std::endl;
+    for (KArgMap::iterator arg = args.begin(); arg != args.end(); ++arg) {
+        const Variable *var = arg->second;
+
+        ss << indent() << "err = clSetKernelArg(kernel, " << arg->first;
+        ss << ", sizeof(" << var->type() << "), ";
+        ss << "&" << var->name() << ");" << std::endl;
+        ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+    }
+}
+
+/**
+ * Emits the kernel launch (clEnqueueNDRangeKernel + clFinish, both
+ * asserted) and the two clGetEventProfilingInfo() queries that fill the
+ * generated program's start and end variables.
+ */
+void
+KTest::execKernel(std::stringstream& ss)
+{
+    // Launch and wait for completion.
+    ss << std::endl;
+    ss << indent() << "event = NULL;" << std::endl;
+    ss << indent() << "err = clEnqueueNDRangeKernel(queue, kernel, workDim, NULL," << std::endl;
+    ss << indent() << "    globalWorkSize, localWorkSize, 0, NULL, &event);" << std::endl;
+    ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+    ss << indent() << "err = clFinish(queue);" << std::endl;
+    ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+
+    // Profiling timestamps; the emitted code does not check these results.
+    ss << std::endl;
+    ss << indent() << "err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START," << std::endl;
+    ss << indent() << "    sizeof(start), &start, NULL);" << std::endl;
+    ss << indent() << "err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END," << std::endl;
+    ss << indent() << "    sizeof(end), &end, NULL);" << std::endl;
+}
+
+/**
+ * Emits a blocking clEnqueueReadBuffer() into the host array, followed by
+ * an assert(), for every master-step buffer whose flags contain
+ * CL_MEM_READ_WRITE or CL_MEM_WRITE_ONLY.
+ */
+void
+KTest::readBuffers(std::stringstream& ss)
+{
+    VarList buffers;
+    std::string sizeExpr;
+
+    ss << std::endl;
+    buffers = masterStep_->buffers();
+
+    VarList::const_iterator it = buffers.begin();
+    while (it != buffers.end()) {
+        // Only buffers the kernel may have written are copied back.
+        const bool writable = ((*it)->flags() & CL_MEM_READ_WRITE) ||
+                              ((*it)->flags() & CL_MEM_WRITE_ONLY);
+        if (writable) {
+            getBufferSizeExpr(*it, sizeExpr);
+            ss << indent() << "err = clEnqueueReadBuffer(queue, ";
+            ss << (*it)->name() << ", CL_TRUE, 0," << std::endl;
+            ss << indent() << "    " << sizeExpr << ", ";
+            ss << ((Variable*)(*it)->hostPtr())->name() << "," << std::endl;
+            ss << indent() << "    0, NULL, NULL);" << std::endl;
+            ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+        }
+        ++it;
+    }
+}
+
+/**
+ * Emits the clean-up code of the generated main(): every master-step
+ * buffer is released, then the kernel, the command queue and the context,
+ * each call followed by an assert().
+ */
+void
+KTest::mainFinish(std::stringstream& ss)
+{
+    VarList buffers;
+
+    ss << std::endl;
+    buffers = masterStep_->buffers();
+    for (VarList::const_iterator buf = buffers.begin();
+            buf != buffers.end(); ++buf) {
+        ss << indent() << "err = clReleaseMemObject(";
+        ss << (*buf)->name() << ");" << std::endl;
+        ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+    }
+
+    ss << indent() << "err = clReleaseKernel(kernel);" << std::endl;
+    ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+    ss << indent() << "err = clReleaseCommandQueue(queue);" << std::endl;
+    ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+    ss << indent() << "err = clReleaseContext(context);" << std::endl;
+    ss << indent() << "assert(err == CL_SUCCESS);" << std::endl;
+}
diff --git a/src/library/tools/ktest/ktest.h b/src/library/tools/ktest/ktest.h
new file mode 100644
index 0000000..72daeaf
--- /dev/null
+++ b/src/library/tools/ktest/ktest.h
@@ -0,0 +1,100 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_KTEST_H__
+#define KTEST_KTEST_H__
+
+#include <string>
+#include <sstream>
+
+#include "ktest-common.h"
+#include "step.h"
+#include "config.h"
+
+namespace clMath {
+
+/**
+ * @internal
+ * @brief Host code generation class
+ *
+ * Objects of this class generate a host-side source file that can execute
+ * kernels for one or several steps.
+ *
+ */
+class KTest {
+private:
+    std::string platform_;
+    std::string device_;
+    std::string kernelSourceFile_;
+    std::string buildOptions_;
+    KTestMatrixGenerator matrixGen_;
+    Step *masterStep_;              // queried for the buffer list and for matrix/vector size expressions
+    std::vector<Step*> *steps_;
+    size_t indent_;                 // indentation state; emitters change it while generating nested code
+    bool useSeveralKernels_;
+    // Map a generator kind to a name string.
+    const char* matrixGenName(KTestMatrixGenerator gen);
+    const char* vectorGenName(KTestMatrixGenerator gen);
+    // Code-emission helpers: each one takes the stream that collects the generated source.
+    void typedefs(std::stringstream& ss);
+    void declareKTestOptions(std::stringstream& ss);
+    void declareBlasOptions(std::stringstream& ss, Step *step);
+    void declarePatternVars(std::stringstream& ss, Step *step);
+    void generateMain(std::stringstream& ss, bool withAccuracy);
+    // The emitters from here to compareMatrices() each append one predefined code snippet.
+    void loadFile(std::stringstream& ss);
+    // Vector helper snippets.
+    void randomVector(std::stringstream& ss);
+    void unitVector(std::stringstream& ss);
+    void sawtoothVector(std::stringstream& ss);
+    void compareVectors(std::stringstream& ss);
+    // Matrix helper snippets.
+    void randomMatrix(std::stringstream& ss);
+    void unitMatrix(std::stringstream& ss);
+    void sawtoothMatrix(std::stringstream& ss);
+    void setUpTRSMDiagonal(std::stringstream& ss);
+    void compareMatrices(std::stringstream& ss);
+    // Prefix placed in front of every emitted line (presumably derived from indent_ -- TODO confirm).
+    std::string indent();
+    // Emitters for the include line, the declarations and the statements of the generated host program.
+    void includes(std::stringstream& ss);
+    void forwardDeclarations(std::stringstream& ss);
+    void declareGranulation(std::stringstream& ss, Step *step);
+    void mainInit(std::stringstream& ss);
+    void buildKernel(std::stringstream& ss);
+    void allocateWriteBuffers(std::stringstream& ss);
+    void setKernelArgs(std::stringstream& ss, Step *step);
+    void execKernel(std::stringstream& ss);
+    void readBuffers(std::stringstream& ss);
+    void mainFinish(std::stringstream& ss);
+    void auxFunctions(std::stringstream& ss);       // calls the four snippet emitters declared below
+    // Snippet emitters invoked by auxFunctions(), plus the buffer-size expression builder.
+    void getPlatform(std::stringstream& ss);
+    void getDevice(std::stringstream& ss);
+    void createKernel(std::stringstream& ss);
+    void printExecTime(std::stringstream& ss);
+    void getBufferSizeExpr(Variable *buffer, std::string& size);   // overwrites size with a byte-size expression
+public:
+    KTest(Step *masterStep, clMath::Config *cfg);
+    KTest(Step *masterStep, std::vector<clMath::Step*> *steps, clMath::Config *cfg);
+    // Returns the generated host source; the ktest main() treats an empty string as failure.
+    std::string generate(bool withAccuracy);
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_KTEST_H__
diff --git a/src/library/tools/ktest/main.cpp b/src/library/tools/ktest/main.cpp
new file mode 100644
index 0000000..22da49c
--- /dev/null
+++ b/src/library/tools/ktest/main.cpp
@@ -0,0 +1,336 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#define __CL_ENABLE_EXCEPTIONS
+
+#include <stdlib.h>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "config.h"
+#include "step.h"
+#include "ktest.h"
+
+#include "steps/gemv.h"
+#include "steps/symv.h"
+#include "steps/gemm.h"
+#include "steps/trmm.h"
+#include "steps/trsm.h"
+#include "steps/syrk.h"
+#include "steps/syr2k.h"
+
+#include <init.h>
+#include <trace_malloc.h>
+
+clMath::Step* getMasterStep(
+    BlasFunctionID funcID,
+    std::string platformName,
+    std::string deviceName);
+clMath::Step* getStep(ListNode *node);
+void destroyPatterns(std::vector<clMath::Step*>& patterns);
+
+cl_platform_id
+getPlatform(const char *name);
+
+cl_device_id
+getDevice(
+    cl_platform_id platform,
+    const char *name);
+
+/**
+ * Entry point of the ktest generator.
+ *
+ * Reads the configuration, creates the master step for the requested BLAS
+ * function and writes the kernel source file(s) together with the host
+ * test source returned by KTest::generate().
+ *
+ * @return 0 on success; 1 when the configuration is rejected or when a
+ *         function has no Step implementation. A failed generation calls
+ *         abort().
+ */
+int
+main(int argc, char *argv[])
+{
+    clMath::Config cfg;
+    cfg.setDefaultConfig("ktest.cfg");
+    if (!cfg.parseCommandLine(argc, argv) || !cfg.isSane()) {
+        return 1;
+    }
+
+    clblasSetup();
+    parseEnvImplementation();
+
+    clMath::Step *masterStep = getMasterStep(cfg.blasFunctionID(),
+                                          cfg.platform(), cfg.device());
+    if (masterStep == NULL) {
+        std::cerr << "Function support not implemented yet" << std::endl;
+        return 1;
+    }
+
+    CLBlasKargs kargs;
+    SubproblemDim subdims[MAX_SUBDIMS];
+
+    cfg.kargs(&kargs);
+    masterStep->setKargs(kargs);
+    masterStep->fixLD();
+
+    ListHead seq;
+    listInitHead(&seq);
+    bool severalKernels = false;
+
+    /* Single kernel for this function */
+    if (cfg.decomposition(subdims)) {
+        masterStep->setDecomposition(subdims);
+    }
+    masterStep->completeDecompositionSingle();
+
+    if (cfg.permitMultiKernels()) {
+        masterStep->makeSolutionSequence(&seq,
+                                         getPlatform(cfg.platform().c_str()));
+        if (listLength(&seq) > 1) {
+            severalKernels = true;
+        }
+    }
+
+    if (severalKernels) {
+        std::ofstream fs;
+        ListNode *node;
+
+        std::vector<clMath::Step*> steps;
+
+        masterStep->declareVars(NULL);
+
+        for (node = listNodeFirst(&seq); node != &seq; node = node->next) {
+            clMath::Step *step = getStep(node);
+            if (step == NULL) {
+                /*
+                 * getStep() yields NULL for function IDs without a Step
+                 * class; stop here instead of dereferencing it below.
+                 */
+                std::cerr << "Function support not implemented yet" << std::endl;
+                return 1;
+            }
+            steps.push_back(step);
+        }
+
+        std::string str;
+
+        /* One kernel file per step, named "<index>_<function>_<cl file>" */
+        for (unsigned int i = 0; i < steps.size(); i++) {
+            std::stringstream kernelFileName;
+            kernelFileName << i << "_" << steps[i]->getBlasFunctionName()
+                                       << "_" << cfg.cl();
+
+            steps[i]->setKernelName(kernelFileName.str());
+            if (cfg.decomposition(subdims)) {
+                steps[i]->setDecomposition(subdims);
+            }
+
+            steps[i]->completeDecompositionSingle();
+
+            steps[i]->declareVars(masterStep);
+
+            std::cout << "Generating '" << steps[i]->kernelName()
+                                        << "' ..." << std::endl;
+
+            str = steps[i]->generate();
+            if (str.empty()) {
+                std::cerr << "failed" << std::endl;
+                abort();
+            }
+            fs.open(kernelFileName.str().c_str());
+            fs << str;
+            fs.close();
+        }
+
+        clMath::KTest *ktest = new clMath::KTest(masterStep, &steps, &cfg);
+
+        std::cout << "Generating '" << cfg.cpp() << "' ..." << std::endl;
+        str = ktest->generate(cfg.withAccuracy());
+        if (str.empty()) {
+            std::cerr << "failed" << std::endl;
+            abort();
+        }
+        fs.open(cfg.cpp().c_str());
+        fs << str;
+        fs.close();
+
+        delete ktest;
+
+        for (std::vector<clMath::Step*>::iterator it = steps.begin();
+                it != steps.end(); ++it) {
+            delete (*it);
+        }
+        steps.clear();
+    }
+    else {
+
+        std::ofstream fs;
+
+        masterStep->setKernelName(cfg.cl());
+
+        std::cout << "Generating '" << masterStep->kernelName()
+                                    << "' ..." << std::endl;
+
+        masterStep->declareVars(NULL);
+
+        std::string str;
+        str = masterStep->generate();
+        if (str.empty()) {
+            std::cerr << "failed" << std::endl;
+            abort();
+        }
+        fs.open(cfg.cl().c_str());
+        fs << str;
+        fs.close();
+
+        clMath::KTest *ktest = new clMath::KTest(masterStep, &cfg);
+
+        std::cout << "Generating '" << cfg.cpp() << "' ..." << std::endl;
+        str = ktest->generate(cfg.withAccuracy());
+        if (str.empty()) {
+            std::cerr << "failed" << std::endl;
+            abort();
+        }
+        fs.open(cfg.cpp().c_str());
+        fs << str;
+        fs.close();
+
+        delete ktest;
+    }
+
+    if (cfg.permitMultiKernels()) {
+        masterStep->freeSolutionSequence(&seq);
+    }
+
+    delete masterStep;
+
+    return 0;
+}
+
+/**
+ * Creates the master step object for a BLAS function on the named device.
+ *
+ * @param funcID        BLAS function to generate code for
+ * @param platformName  platform name handed to getPlatform()
+ * @param deviceName    device name handed to getDevice()
+ * @return a newly allocated step, or NULL when funcID is not one of the
+ *         functions handled below
+ */
+clMath::Step* getMasterStep(
+    BlasFunctionID funcID,
+    std::string platformName,
+    std::string deviceName)
+{
+    cl_platform_id platformID = getPlatform(platformName.c_str());
+    cl_device_id deviceID = getDevice(platformID, deviceName.c_str());
+    clMath::Step *master = NULL;
+
+    switch (funcID) {
+    case CLBLAS_GEMV:
+        master = new clMath::GemvStep(deviceID);
+        break;
+    case CLBLAS_SYMV:
+        master = new clMath::SymvStep(deviceID);
+        break;
+    case CLBLAS_GEMM:
+        master = new clMath::GemmStep(deviceID);
+        break;
+    case CLBLAS_TRMM:
+        master = new clMath::TrmmStep(deviceID);
+        break;
+    case CLBLAS_TRSM:
+        master = new clMath::TrsmStep(deviceID);
+        break;
+    case CLBLAS_SYRK:
+        master = new clMath::SyrkStep(deviceID);
+        break;
+    case CLBLAS_SYR2K:
+        master = new clMath::Syr2kStep(deviceID);
+        break;
+    default:
+        break;
+    }
+
+    return master;
+}
+
+/**
+ * Creates the step object matching the function ID stored in a
+ * solution-sequence node.
+ *
+ * @param node  list node of the solution sequence
+ * @return a newly allocated step, or NULL when the node's function ID is
+ *         not one of the functions handled below
+ */
+clMath::Step* getStep(ListNode *node)
+{
+    clMath::Step *created = NULL;
+
+    switch (clMath::Step::getStepNodeFuncID(node)) {
+    case CLBLAS_GEMV:
+        created = new clMath::GemvStep(node);
+        break;
+    case CLBLAS_SYMV:
+        created = new clMath::SymvStep(node);
+        break;
+    case CLBLAS_GEMM:
+        created = new clMath::GemmStep(node);
+        break;
+    case CLBLAS_TRMM:
+        created = new clMath::TrmmStep(node);
+        break;
+    case CLBLAS_TRSM:
+        created = new clMath::TrsmStep(node);
+        break;
+    case CLBLAS_SYRK:
+        created = new clMath::SyrkStep(node);
+        break;
+    case CLBLAS_SYR2K:
+        created = new clMath::Syr2kStep(node);
+        break;
+    default:
+        break;
+    }
+
+    return created;
+}
+
+
+/**
+ * Looks up an OpenCL platform by its exact CL_PLATFORM_NAME string.
+ *
+ * @param name  platform name to match with strcmp()
+ * @return the first matching platform, or NULL when none matches or when
+ *         enumeration or allocation fails. Platforms whose name query
+ *         fails (the name buffer holds 64 bytes) are skipped.
+ */
+cl_platform_id
+getPlatform(const char *name)
+{
+    cl_uint count;
+
+    if (clGetPlatformIDs(0, NULL, &count) != CL_SUCCESS) {
+        return NULL;
+    }
+
+    cl_platform_id *candidates =
+        (cl_platform_id*)calloc(count, sizeof(*candidates));
+    if (candidates == NULL) {
+        return NULL;
+    }
+
+    cl_platform_id found = NULL;
+    if (clGetPlatformIDs(count, candidates, NULL) == CL_SUCCESS) {
+        char candidateName[64];
+        cl_uint idx = 0;
+
+        while (idx < count) {
+            cl_int status = clGetPlatformInfo(candidates[idx],
+                CL_PLATFORM_NAME, sizeof(candidateName), candidateName, NULL);
+            if ((status == CL_SUCCESS) &&
+                (strcmp(candidateName, name) == 0)) {
+                found = candidates[idx];
+                break;
+            }
+            ++idx;
+        }
+    }
+
+    free(candidates);
+    return found;
+}
+
+/**
+ * Looks up a device of the given platform by its exact CL_DEVICE_NAME
+ * string. All device types (CL_DEVICE_TYPE_ALL) are considered.
+ *
+ * @param platform  platform whose devices are enumerated
+ * @param name      device name to match with strcmp()
+ * @return the first matching device, or NULL when none matches or when
+ *         enumeration or allocation fails. Devices whose name query fails
+ *         (the name buffer holds 64 bytes) are skipped.
+ */
+cl_device_id
+getDevice(
+    cl_platform_id platform,
+    const char *name)
+{
+    cl_uint count;
+
+    if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &count)
+            != CL_SUCCESS) {
+        return NULL;
+    }
+
+    cl_device_id *candidates =
+        (cl_device_id*)calloc(count, sizeof(*candidates));
+    if (candidates == NULL) {
+        return NULL;
+    }
+
+    cl_device_id found = NULL;
+    if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, count, candidates, NULL)
+            == CL_SUCCESS) {
+        char candidateName[64];
+        cl_uint idx = 0;
+
+        while (idx < count) {
+            cl_int status = clGetDeviceInfo(candidates[idx], CL_DEVICE_NAME,
+                sizeof(candidateName), candidateName, NULL);
+            if ((status == CL_SUCCESS) &&
+                (strcmp(candidateName, name) == 0)) {
+                found = candidates[idx];
+                break;
+            }
+            ++idx;
+        }
+    }
+
+    free(candidates);
+    return found;
+}
diff --git a/src/library/tools/ktest/naive/naive_blas.cpp b/src/library/tools/ktest/naive/naive_blas.cpp
new file mode 100644
index 0000000..5c2c608
--- /dev/null
+++ b/src/library/tools/ktest/naive/naive_blas.cpp
@@ -0,0 +1,845 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#if defined (_MSC_VER)
+#define __template_static static
+#define isnan(x) _isnan((x))
+#pragma warning( disable : 4290 )
+#else   /* _MSC_VER */
+#define __template_static
+#endif  /* !_MSC_VER */
+
+namespace NaiveBlas {
+
+/* Problem flags */
+
+typedef enum clblasOrder {          /* storage order; combined with the transpose flag by MatrixAccessor */
+    clblasRowMajor,
+    clblasColumnMajor
+} clblasOrder;
+
+typedef enum clblasTranspose {      /* how a matrix operand is to be read */
+    clblasNoTrans,
+    clblasTrans,
+    clblasConjTrans                 /* MatrixAccessor additionally conjugates the elements it returns */
+} clblasTranspose;
+
+typedef enum clblasUplo {           /* which triangle of a triangular operand is referenced */
+    clblasUpper,
+    clblasLower
+} clblasUplo;
+
+typedef enum clblasDiag {           /* treatment of the diagonal of a triangular operand */
+    clblasUnit,                     /* trmm() substitutes ONE<T>() for the stored diagonal element */
+    clblasNonUnit
+} clblasDiag;
+
+typedef enum clblasSide {           /* side on which the triangular matrix is applied */
+    clblasLeft,                     /* trmm() takes A as M x M */
+    clblasRight                     /* trmm() takes A as N x N */
+} clblasSide;
+
+/*  Complex types and related manipulations */
+
+typedef cl_float2 FloatComplex;
+typedef cl_double2 DoubleComplex;
+
+/* Builds a FloatComplex from its real (s[0]) and imaginary (s[1]) parts. */
+static __inline FloatComplex
+floatComplex(float real, float imag)
+{
+    FloatComplex value;
+
+    value.s[1] = imag;
+    value.s[0] = real;
+    return value;
+}
+
+/* Builds a DoubleComplex from its real (s[0]) and imaginary (s[1]) parts. */
+static __inline DoubleComplex
+doubleComplex(double real, double imag)
+{
+    DoubleComplex value;
+
+    value.s[1] = imag;
+    value.s[0] = real;
+    return value;
+}
+
+#define CREAL(v) ((v).s[0])
+#define CIMAG(v) ((v).s[1])
+
+// Type-dependent constants
+
+/* Additive identity of type T; complex types get (0, 0). */
+template<class T>
+static T
+ZERO()
+{
+    const T zero = static_cast<T>(0.0);
+    return zero;
+}
+
+template<>
+__template_static FloatComplex
+ZERO<FloatComplex>()
+{
+    const FloatComplex zero = floatComplex(0.0, 0.0);
+    return zero;
+}
+
+template<>
+__template_static DoubleComplex
+ZERO<DoubleComplex>()
+{
+    const DoubleComplex zero = doubleComplex(0.0, 0.0);
+    return zero;
+}
+
+/* Multiplicative identity of type T; complex types get (1, 0). */
+template<typename T>
+static T
+ONE()
+{
+    const T one = static_cast<T>(1.0);
+    return one;
+}
+
+template<>
+__template_static FloatComplex
+ONE<FloatComplex>()
+{
+    const FloatComplex one = floatComplex(1.0, 0.0);
+    return one;
+}
+
+template<>
+__template_static DoubleComplex
+ONE<DoubleComplex>()
+{
+    const DoubleComplex one = doubleComplex(1.0, 0.0);
+    return one;
+}
+
+/* The value two in type T; complex types get (2, 0). */
+template<typename T>
+static T
+TWO()
+{
+    const T two = static_cast<T>(2.0);
+    return two;
+}
+
+template<>
+__template_static FloatComplex
+TWO<FloatComplex>()
+{
+    const FloatComplex two = floatComplex(2.0, 0.0);
+    return two;
+}
+
+template<>
+__template_static DoubleComplex
+TWO<DoubleComplex>()
+{
+    const DoubleComplex two = doubleComplex(2.0, 0.0);
+    return two;
+}
+
+/* NaN test for scalar types, built on isnan(). */
+template<typename T>
+static bool
+isNAN(T x)
+{
+    const bool nan = (isnan(x) != 0);
+    return nan;
+}
+
+/*
+ * NaN test for complex values: true only when BOTH parts are NaN.
+ * NOTE(review): a value with a single NaN component is reported as
+ * not-NaN -- confirm that this is intended.
+ */
+template<>
+__template_static bool
+isNAN(FloatComplex x)
+{
+    if (!isNAN(CREAL(x))) {
+        return false;
+    }
+    return isNAN(CIMAG(x));
+}
+
+template<>
+__template_static bool
+isNAN(DoubleComplex x)
+{
+    if (!isNAN(CREAL(x))) {
+        return false;
+    }
+    return isNAN(CIMAG(x));
+}
+
+/* Type-dependent random() */
+
+/*
+ * Random whole number with magnitude below the integer part of limit and
+ * a random sign; returns 0 when that integer part is 0. Consumes two
+ * rand() values in the non-zero case.
+ */
+template<typename T>
+static T
+random(cl_double limit)
+{
+    const cl_ulong bound = static_cast<cl_ulong>(limit);
+
+    if (bound == 0) {
+        return 0;
+    }
+
+    T magnitude = static_cast<float>(rand() % bound);
+    const bool negate = ((rand() % 2) == 1);
+    if (negate) {
+        magnitude = -magnitude;
+    }
+    return magnitude;
+}
+
+/*
+ * Draws random<T>(right - left) and moves the result away from zero by
+ * left: negative draws have left subtracted, the others have it added.
+ */
+template<class T>
+static T
+random(cl_double left, cl_double right)
+{
+    const T offset = static_cast<T>(left);
+    T sample = random<T>(right - left);
+
+    if (sample < 0) {
+        sample -= offset;
+    }
+    else {
+        sample += offset;
+    }
+    return sample;
+}
+
+/* Random value using the default limit of 10. */
+template<typename T>
+static T
+random()
+{
+    const T defaultLimit = static_cast<T>(10);
+    return random<T>(defaultLimit);
+}
+
+/* FloatComplex variants: both parts are drawn independently as cl_float. */
+template<>
+__template_static FloatComplex
+random<FloatComplex>()
+{
+    const FloatComplex z = floatComplex(random<cl_float>(), random<cl_float>());
+    return z;
+}
+
+template<>
+__template_static FloatComplex
+random<FloatComplex>(cl_double limit)
+{
+    const FloatComplex z = floatComplex(random<cl_float>(limit), random<cl_float>(limit));
+    return z;
+}
+
+template<>
+__template_static FloatComplex
+random<FloatComplex>(cl_double left, cl_double right)
+{
+    const FloatComplex z = floatComplex(random<cl_float>(left, right), random<cl_float>(left, right));
+    return z;
+}
+
+
+/* DoubleComplex variants: both parts are drawn independently as cl_double. */
+template<>
+__template_static DoubleComplex
+random<DoubleComplex>()
+{
+    const DoubleComplex z = doubleComplex(random<cl_double>(), random<cl_double>());
+    return z;
+}
+
+template<>
+__template_static DoubleComplex
+random<DoubleComplex>(cl_double limit)
+{
+    const DoubleComplex z = doubleComplex(random<cl_double>(limit), random<cl_double>(limit));
+    return z;
+}
+
+template<>
+__template_static DoubleComplex
+random<DoubleComplex>(cl_double left, cl_double right)
+{
+    const DoubleComplex z = doubleComplex(random<cl_double>(left, right), random<cl_double>(left, right));
+    return z;
+}
+
+/* Boolean operators */
+
+/*
+ * Generic equality.
+ * NOTE(review): for a class type without its own operator== the inner
+ * comparison may resolve to this template again -- confirm that only the
+ * specialised complex types reach it.
+ */
+template<typename T>
+static bool
+operator==(T a, T b)
+{
+    const bool same = (a == b);
+    return same;
+}
+
+/* Complex values are equal when both real and imaginary parts are equal. */
+template<>
+__template_static bool
+operator==(FloatComplex a, FloatComplex b)
+{
+    if (!(CREAL(a) == CREAL(b))) {
+        return false;
+    }
+    return (CIMAG(a) == CIMAG(b));
+}
+
+template<>
+__template_static bool
+operator==(DoubleComplex a, DoubleComplex b)
+{
+    if (!(CREAL(a) == CREAL(b))) {
+        return false;
+    }
+    return (CIMAG(a) == CIMAG(b));
+}
+
+/* Inequality is the negation of operator==. */
+template<typename T>
+static bool
+operator!=(T a, T b)
+{
+    const bool same = (a == b);
+    return !same;
+}
+
+/* math operators */
+
+/* Conjugation is the identity for real types. */
+static __inline
+float conjugate(float elem)
+{
+    const float same = elem;
+    return same;
+}
+
+static __inline
+double conjugate(double elem)
+{
+    const double same = elem;
+    return same;
+}
+
+/* Complex conjugation negates the imaginary part. */
+static __inline
+FloatComplex conjugate(FloatComplex elem)
+{
+    const FloatComplex conj = floatComplex(CREAL(elem), -CIMAG(elem));
+    return conj;
+}
+
+static __inline
+DoubleComplex conjugate(DoubleComplex elem)
+{
+    const DoubleComplex conj = doubleComplex(CREAL(elem), -CIMAG(elem));
+    return conj;
+}
+
+/* Component-wise sum. */
+static __inline FloatComplex
+operator+(FloatComplex a, FloatComplex b)
+{
+    const FloatComplex sum =
+        floatComplex(CREAL(a) + CREAL(b), CIMAG(a) + CIMAG(b));
+    return sum;
+}
+
+/* Component-wise difference. */
+static __inline FloatComplex
+operator-(FloatComplex a, FloatComplex b)
+{
+    const FloatComplex diff =
+        floatComplex(CREAL(a) - CREAL(b), CIMAG(a) - CIMAG(b));
+    return diff;
+}
+
+/* Complex product: (ar*br - ai*bi, ar*bi + br*ai). */
+static __inline FloatComplex
+operator*(FloatComplex a, FloatComplex b)
+{
+    const FloatComplex prod = floatComplex(
+        CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b),
+        CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a));
+    return prod;
+}
+
+/* Scales both parts by a real factor. */
+static __inline FloatComplex
+operator*(FloatComplex a, cl_float b)
+{
+    const FloatComplex scaled = floatComplex(CREAL(a) * b, CIMAG(a) * b);
+    return scaled;
+}
+
+/* Complex quotient; the denominator is the squared modulus of b. */
+static __inline FloatComplex
+operator/(FloatComplex a, FloatComplex b)
+{
+    cl_float denom = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b);
+
+    const FloatComplex quot = floatComplex(
+        (CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b)) / denom,
+        (CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b)) / denom);
+    return quot;
+}
+
+/* Divides both parts by a real divisor. */
+static __inline FloatComplex
+operator/(FloatComplex a, cl_float b)
+{
+    const FloatComplex scaled = floatComplex(CREAL(a) / b, CIMAG(a) / b);
+    return scaled;
+}
+
+/* Component-wise sum of two double-precision complex values. */
+static __inline DoubleComplex
+operator+(DoubleComplex a, DoubleComplex b)
+{
+    /* The imaginary part combines both operands, as the real part does. */
+    return doubleComplex(CREAL(a) + CREAL(b), CIMAG(a) + CIMAG(b));
+}
+
+/* Component-wise difference of two double-precision complex values. */
+static __inline DoubleComplex
+operator-(DoubleComplex a, DoubleComplex b)
+{
+    /* The imaginary part is a's minus b's, mirroring the real part. */
+    return doubleComplex(CREAL(a) - CREAL(b), CIMAG(a) - CIMAG(b));
+}
+
+/* Complex product: (ar*br - ai*bi, ar*bi + br*ai). */
+static __inline DoubleComplex
+operator*(DoubleComplex a, DoubleComplex b)
+{
+    const DoubleComplex prod = doubleComplex(
+        CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b),
+        CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a));
+    return prod;
+}
+
+/* Scales both parts by a real factor. */
+static __inline DoubleComplex
+operator*(DoubleComplex a, cl_double b)
+{
+    const DoubleComplex scaled = doubleComplex(CREAL(a) * b, CIMAG(a) * b);
+    return scaled;
+}
+
+/* Complex quotient; the denominator is the squared modulus of b. */
+static __inline DoubleComplex
+operator/(DoubleComplex a, DoubleComplex b)
+{
+    cl_double denom = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b);
+
+    const DoubleComplex quot = doubleComplex(
+        (CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b)) / denom,
+        (CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b)) / denom);
+    return quot;
+}
+
+/* Divides both parts by a real divisor. */
+static __inline DoubleComplex
+operator/(DoubleComplex a, cl_double b)
+{
+    const DoubleComplex scaled = doubleComplex(CREAL(a) / b, CIMAG(a) / b);
+    return scaled;
+}
+
+/* Absolute value for integers. */
+cl_int
+module(cl_int a)
+{
+    const cl_int magnitude = abs(a);
+    return magnitude;
+}
+
+/* Absolute value for single precision. */
+cl_float
+module(cl_float a)
+{
+    const cl_float magnitude = fabsf(a);
+    return magnitude;
+}
+
+/* Absolute value for double precision. */
+cl_double
+module(cl_double a)
+{
+    const cl_double magnitude = fabs(a);
+    return magnitude;
+}
+
+/* Modulus of a complex value; an exact zero skips the square root. */
+cl_float
+module(FloatComplex a)
+{
+    if ((CREAL(a) != 0.0) || (CIMAG(a) != 0.0)) {
+        return sqrtf(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a));
+    }
+    return 0.0;
+}
+
+cl_double
+module(DoubleComplex a)
+{
+    if ((CREAL(a) != 0.0) || (CIMAG(a) != 0.0)) {
+        return sqrt(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a));
+    }
+    return 0.0;
+}
+
+#define FLOAT_UPPER_BOUND   pow(2.0, 23)
+#define DOUBLE_UPPER_BOUND  pow(2.0, 52)
+
+// Type-dependent constants: 2^23 for single-precision based types,
+// 2^52 for double-precision based types.
+template <typename T>
+static cl_double UPPER_BOUND();
+
+template<>
+__template_static cl_double UPPER_BOUND<cl_float>()
+{
+    return FLOAT_UPPER_BOUND;
+}
+
+template<>
+__template_static cl_double UPPER_BOUND<cl_double>()
+{
+    return DOUBLE_UPPER_BOUND;
+}
+
+template<>
+__template_static cl_double UPPER_BOUND<FloatComplex>()
+{
+    return FLOAT_UPPER_BOUND;
+}
+
+template<>
+__template_static cl_double UPPER_BOUND<DoubleComplex>()
+{
+    return DOUBLE_UPPER_BOUND;
+}
+
+/* Provide simple access to vector elements */
+
+/*
+ * Bounds-checked view of a strided vector.
+ *
+ * With a positive increment element idx lives at vector + idx * inc;
+ * otherwise the logical order is reversed and -inc is used as the stride.
+ * When conjugation is requested, operator[] returns a reference to an
+ * internal temporary holding the conjugated value, so writes through that
+ * reference do not reach the underlying storage.
+ */
+template <typename ElemType, typename IncType> class VectorAccessor {
+public:
+    VectorAccessor(
+        ElemType *vector,
+        size_t len,
+        IncType inc,
+        bool conj=false) : vector_(vector), inc_(inc), len_(len), conj_(conj)
+    {
+        /* nothing else to set up */
+    }
+
+    /* Throws std::string when idx is not below the vector length. */
+    ElemType&
+    operator [] (size_t idx) throw (std::string)
+    {
+        if (idx >= len_) {
+            throw std::string("Trying to access vector beyond boundary!");
+        }
+
+        ElemType *slot = (inc_ > 0)
+            ? (vector_ + idx * inc_)
+            : (vector_ + (len_ - idx - 1) * (-inc_));
+
+        if (!conj_) {
+            return *slot;
+        }
+
+        tmp_ =  conjugate(*slot);
+        return tmp_;
+    }
+
+private:
+    ElemType *vector_;
+    ElemType tmp_;      /* holds the conjugated element handed out last */
+    IncType inc_;
+    size_t len_;
+    bool conj_;
+};
+
+/* Mapping between logical and physical matrix layout */
+template <typename T> class MatrixAccessor {
+public:
+    /*
+     * nrRows and nrCols are the logical dimensions checked and handed on
+     * by operator[]; ld is the leading dimension of the stored matrix.
+     */
+    MatrixAccessor(
+        T *matrix,
+        clblasOrder order,
+        clblasTranspose trans,
+        size_t nrRows,
+        size_t nrCols,
+        size_t ld) : matrix_(matrix), nrRows_(nrRows), nrCols_(nrCols), ld_(ld)
+    {
+        /*
+         * Logical rows are read with stride ld for a non-transposed
+         * column-major matrix and for a transposed row-major one.
+         */
+        tra_ = ((order == clblasColumnMajor && trans == clblasNoTrans) ||
+                (order == clblasRowMajor && trans != clblasNoTrans));
+        conj_ = (trans == clblasConjTrans);
+    }
+
+    /* Swaps the roles of rows and columns for subsequent accesses. */
+    void flipTransposing(void)
+    {
+        tra_ = !tra_;
+    }
+
+    /*
+     * Returns an accessor for one logical row; throws std::string when
+     * row is not below the number of rows.
+     */
+    VectorAccessor<T, size_t>
+    operator [] (size_t row) const throw (std::string)
+    {
+        if (row >= nrRows_) {
+            throw std::string("Trying to access matrix beyond boundary!");
+        }
+
+        T *first = tra_ ? (matrix_ + row) : (matrix_ + row * ld_);
+        size_t stride = tra_ ? ld_ : 1;
+
+        return VectorAccessor<T, size_t>(first, nrCols_, stride, conj_);
+    }
+
+private:
+    T *matrix_;
+    bool tra_;
+    bool conj_;
+    size_t nrRows_;
+    size_t nrCols_;
+    size_t ld_;
+};
+
+
+/*
+ * Reference matrix multiply: C = C * beta + (A x B) * alpha, where A is
+ * accessed as M x K, B as K x N and C as M x N through MatrixAccessor.
+ */
+template <typename T> __template_static void
+gemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    T alpha,
+    const T *A,
+    size_t lda,
+    const T *B,
+    size_t ldb,
+    T beta,
+    T *C,
+    size_t ldc)
+{
+    MatrixAccessor<T> accA(const_cast<T*>(A), order, transA, M, K, lda);
+    MatrixAccessor<T> accB(const_cast<T*>(B), order, transB, K, N, ldb);
+    MatrixAccessor<T> accC(C, order, clblasNoTrans, M, N, ldc);
+
+    for (size_t row = 0; row < M; ++row) {
+        for (size_t col = 0; col < N; ++col) {
+            // Dot product of row `row` of A with column `col` of B.
+            T dot = ZERO<T>();
+            for (size_t inner = 0; inner < K; ++inner) {
+                dot = dot + accA[row][inner] * accB[inner][col];
+            }
+            accC[row][col] = accC[row][col] * beta + dot * alpha;
+        }
+    }
+}
+/*
+ * Naive reference triangular matrix multiply, performed in place on B:
+ * every element of B is replaced by alpha times the product of a row of
+ * the triangular matrix A with the matching column of B.
+ *
+ * For side == clblasRight both accessors are flipped, so the same loops
+ * operate on the transposed problem.
+ *
+ * The row traversal direction (revPass) is chosen so that the rows of B
+ * read for an output row have not been overwritten yet.
+ */
+template<typename T> __template_static void
+trmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    T alpha,
+    const T *A,
+    size_t lda,
+    T *B,
+    size_t ldb)
+{
+    size_t i, j, k;
+    size_t row, col;
+    size_t rowsA = (side == clblasLeft) ? M : N;    // order of the square matrix A
+    size_t colsB = (side == clblasLeft) ? N : M;    // length of the inner j loop
+    MatrixAccessor<T> ma(const_cast<T*>(A), order, transA, rowsA, rowsA, lda);
+    MatrixAccessor<T> mb(B, order, clblasNoTrans, rowsA, colsB, ldb);
+    T tmp, a;
+    bool revPass;   // true: rows are processed from the last one to the first
+
+    revPass = (uplo == clblasLower) ^ (transA != clblasNoTrans);
+    if (side == clblasRight) {
+        ma.flipTransposing();
+        mb.flipTransposing();
+        revPass = !revPass;
+    }
+
+    for (i = 0; i < rowsA; i++) {
+        row = (revPass) ? (rowsA - i - 1) : i;
+        for (j = 0; j < colsB; j++) {
+            size_t boundK = (revPass) ? row : (rowsA - row - 1);   // last k; there col == row
+
+            tmp = ZERO<T>();
+            for (k = 0; k <= boundK; k++) {
+                col = (revPass) ? k : (rowsA - k - 1);
+                if ((k == boundK) && (diag == clblasUnit)) {
+                    a = ONE<T>();   // unit diagonal: the stored diagonal element is not read
+                }
+                else {
+                    a = ma[row][col];
+                }
+                tmp = tmp + a * mb[col][j];
+            }
+            mb[row][j] = tmp * alpha;
+        }
+    }
+}
+/*
+ * Naive reference triangular solve, performed in place on B by
+ * substitution with the triangular matrix A; the whole result is then
+ * scaled by alpha in a separate pass.
+ *
+ * For side == clblasRight both accessors are flipped, so the same loops
+ * operate on the transposed problem.
+ *
+ * The row traversal direction (revPass) is chosen so that every row of B
+ * read while solving a row already holds its solved values.
+ */
+template<typename T> __template_static void
+trsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    T alpha,
+    const T *A,
+    size_t lda,
+    T *B,
+    size_t ldb)
+{
+    size_t i, j, k;
+    size_t row, col;
+    size_t rowsA = (side == clblasLeft) ? M : N;    // order of the square matrix A
+    size_t colsB = (side == clblasLeft) ? N : M;    // length of the inner j loop
+    MatrixAccessor<T> ma(const_cast<T*>(A), order, transA, rowsA, rowsA, lda);
+    MatrixAccessor<T> mb(B, order, clblasNoTrans, rowsA, colsB, ldb);
+    T tmp, a;
+    bool revPass;   // true: rows are solved from the last one to the first
+
+    revPass = (uplo == clblasUpper) ^ (transA != clblasNoTrans);
+    if (side == clblasRight) {
+        ma.flipTransposing();
+        mb.flipTransposing();
+        revPass = !revPass;
+    }
+
+    for (i = 0; i < rowsA; i++) {
+        row = (revPass) ? (rowsA - i - 1) : i;
+        for (j = 0; j < colsB; j++) {
+            size_t boundK = (revPass) ? (rowsA - row - 1) : row;   // last k; there col == row
+
+            tmp = ZERO<T>();
+            for (k = 0; k <= boundK; k++) {
+                col = (revPass) ? (rowsA - k - 1) : k;
+                if (col == row) {
+                    // diagonal reached: subtract the accumulated sum and divide
+                    a = (diag == clblasUnit) ? ONE<T>() : ma[row][col];
+                    tmp = (mb[row][j] - tmp) / a;
+                }
+                else {
+                    tmp = tmp + ma[row][col] * mb[col][j];
+                }
+
+            }
+            mb[row][j] = tmp;
+        }
+    }
+
+    for (i = 0; i < rowsA; i++) {   // scaling by alpha is applied after the solve
+        for (j = 0; j < colsB; j++) {
+            mb[i][j] = mb[i][j] * alpha;
+        }
+    }
+}
+
+/*
+ * Naive reference symmetric rank-k update:
+ *     C[i][j] = C[i][j] * beta + (sum over k of A[i][k] * A[j][k]) * alpha
+ * where A is viewed as N x K. Only the triangle of C selected by uplo
+ * (diagonal included) is touched. Any trans other than clblasNoTrans is
+ * treated as clblasTrans.
+ */
+template <typename T> __template_static void
+syrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    size_t N,
+    size_t K,
+    T alpha,
+    const T *A,
+    size_t lda,
+    T beta,
+    T *C,
+    size_t ldc)
+{
+    clblasTranspose effTrans = clblasTrans;
+    if (trans == clblasNoTrans) {
+        effTrans = clblasNoTrans;
+    }
+
+    MatrixAccessor<T> matA(const_cast<T*>(A), order, effTrans, N, K, lda);
+    MatrixAccessor<T> matC(C, order, clblasNoTrans, N, N, ldc);
+
+    for (size_t row = 0; row < N; row++) {
+        for (size_t col = 0; col < N; col++) {
+            /* elements outside the selected triangle stay untouched */
+            const bool outside = (uplo == clblasLower && col > row) ||
+                                 (uplo == clblasUpper && row > col);
+            if (!outside) {
+                T acc = ZERO<T>();
+                for (size_t p = 0; p < K; p++) {
+                    acc = acc + matA[row][p] * matA[col][p];
+                }
+                matC[row][col] = matC[row][col] * beta + acc * alpha;
+            }
+        }
+    }
+}
+
+/*
+ * Naive reference symmetric rank-2k update:
+ *     C[i][j] = C[i][j] * beta +
+ *               (sum over k of A[i][k] * B[j][k] + A[j][k] * B[i][k]) * alpha
+ * where A and B are viewed as N x K. Only the triangle of C selected by
+ * uplo (diagonal included) is touched. Any trans other than clblasNoTrans
+ * is treated as clblasTrans.
+ */
+template <typename T> __template_static void
+syr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    size_t N,
+    size_t K,
+    T alpha,
+    const T *A,
+    size_t lda,
+    const T *B,
+    size_t ldb,
+    T beta,
+    T *C,
+    size_t ldc)
+{
+    clblasTranspose effTrans = clblasTrans;
+    if (trans == clblasNoTrans) {
+        effTrans = clblasNoTrans;
+    }
+
+    MatrixAccessor<T> matA(const_cast<T*>(A), order, effTrans, N, K, lda);
+    MatrixAccessor<T> matB(const_cast<T*>(B), order, effTrans, N, K, ldb);
+    MatrixAccessor<T> matC(C, order, clblasNoTrans, N, N, ldc);
+
+    for (size_t row = 0; row < N; row++) {
+        for (size_t col = 0; col < N; col++) {
+            /* elements outside the selected triangle stay untouched */
+            const bool outside = (uplo == clblasLower && col > row) ||
+                                 (uplo == clblasUpper && row > col);
+            if (!outside) {
+                T acc = ZERO<T>();
+                for (size_t p = 0; p < K; p++) {
+                    acc = acc + matA[row][p] * matB[col][p] + matA[col][p] * matB[row][p];
+                }
+                matC[row][col] = matC[row][col] * beta + acc * alpha;
+            }
+        }
+    }
+}
+
+/*
+ * Naive reference matrix-vector product:
+ *     y = (op(A) * x) * alpha + y * beta
+ * A is M x N. Without transposition x has N and y has M elements,
+ * otherwise x has M and y has N elements. y is updated in place.
+ */
+template <typename T> __template_static void
+gemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    T alpha,
+    const T *A,
+    size_t lda,
+    const T *X,
+    int incx,
+    T beta,
+    T *Y,
+    int incy)
+{
+    const bool plain = (transA == clblasNoTrans);
+    const size_t lenX = plain ? N : M;
+    const size_t lenY = plain ? M : N;
+
+    MatrixAccessor<T> matA(const_cast<T*>(A), order, transA, lenY, lenX, lda);
+    VectorAccessor<T, int> vecX(const_cast<T*>(X), lenX, incx);
+    VectorAccessor<T, int> vecY(const_cast<T*>(Y), lenY, incy);
+
+    for (size_t row = 0; row < lenY; row++) {
+        T acc = ZERO<T>();
+        for (size_t col = 0; col < lenX; col++) {
+            acc = acc + matA[row][col] * vecX[col];
+        }
+        vecY[row] = acc * alpha + vecY[row] * beta;
+    }
+}
+
+/*
+ * Naive reference symmetric matrix-vector product:
+ *     y = (A * x) * alpha + y * beta
+ * for an N x N matrix A of which only the triangle selected by uplo is
+ * read; elements of the other triangle are taken from the mirrored
+ * position. y is updated in place.
+ */
+template <typename T> __template_static void
+symv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    T alpha,
+    const T *A,
+    size_t lda,
+    const T *X,
+    int incx,
+    T beta,
+    T *Y,
+    int incy)
+{
+    MatrixAccessor<T> matA(const_cast<T*>(A), order, clblasNoTrans, N, N, lda);
+    VectorAccessor<T, int> vecX(const_cast<T*>(X), N, incx);
+    VectorAccessor<T, int> vecY(const_cast<T*>(Y), N, incy);
+
+    const bool upper = (uplo == clblasUpper);
+    const bool lower = (uplo == clblasLower);
+
+    for (size_t row = 0; row < N; row++) {
+        T acc = ZERO<T>();
+        for (size_t col = 0; col < N; col++) {
+            /* true when (row, col) lies inside the stored triangle */
+            const bool stored = (upper && (row <= col)) ||
+                                (lower && (row >= col));
+            if (stored) {
+                acc = acc + matA[row][col] * vecX[col];
+            }
+            else {
+                acc = acc + matA[col][row] * vecX[col];
+            }
+        }
+        vecY[row] = acc * alpha + vecY[row] * beta;
+    }
+}
+
+}  /* NaiveBlas namespace */
diff --git a/src/library/tools/ktest/scripts/verify_ktest.bash b/src/library/tools/ktest/scripts/verify_ktest.bash
new file mode 100644
index 0000000..19cf8b0
--- /dev/null
+++ b/src/library/tools/ktest/scripts/verify_ktest.bash
@@ -0,0 +1,212 @@
+#!/bin/bash
+
+# BLAS functions under test, the precision prefixes and every option that
+# can be passed to make-ktest
+FUNCTIONS=(gemm trmm trsm syrk syr2k gemv symv)
+ALL_PRECISIONS=(s c)
+ALL_OPTIONS=(order transA transB side uplo diag M N K incx incy offA offBX offCY)
+
+# list of supported options for each function; entries are parallel to
+# FUNCTIONS: gemm, trmm, trsm, syrk, syr2k, gemv, symv
+FUNC_OPTIONS=( "order transA transB M N K"
+               "order transA side uplo diag M N"
+               "order transA side uplo diag M N"
+               "order transA uplo N K"
+               "order  transA uplo N K"
+               "order transA M N"
+               "order uplo N" )
+
+# values tried for every option; entries are parallel to ALL_OPTIONS:
+# order, transA, transB, side, uplo, diag, M, N, K, incx, incy, offA, offBX, offCY
+ALL_OPTION_VALUES=( "row column"
+                    "n t c"
+                    "n t c"
+                    "left right"
+                    "upper lower"
+                    "unit nonunit"
+                    "15 16 64"
+                    "15 16 64"
+                    "15 16 64"
+                    "1"
+                    "1"
+                    "128"
+                    "256"
+                    "512" )
+
+REPORT_FILE="ktest_report.dat"
+
+# text of the previously generated kernel; this is the variable that
+# forward_options_and_call_test compares new kernels against
+LAST_KERNEL=""
+REMAINING_OPTSTR=
+CMDLINE=
+FUNCTION_INDEX=
+
+# Recursively enumerates option values and runs one generated test per
+# combination.
+#
+#   $1 - index in ALL_OPTIONS of the first option in REMAINING_OPTSTR
+#
+# Globals read: FUNCTIONS, FUNC_OPTIONS, ALL_OPTIONS, ALL_OPTION_VALUES,
+#               FUNCTION_INDEX, PRECISION, LAST_KERNEL, REPORT_FILE
+# Globals changed: REMAINING_OPTSTR, CMDLINE, LAST_KERNEL
+#
+# Options the current function does not support are skipped. When no
+# option is left, the test is generated with make-ktest, compiled and run;
+# the command line of every failed case is appended to $REPORT_FILE.tmp.
+# Returns non-zero only when an identical kernel is detected.
+forward_options_and_call_test()
+{
+    local optidx=$1
+    local -a remaining=(${REMAINING_OPTSTR})
+    local consumed=0
+    local ret=0
+    local stat=0
+    local cmdline=
+    local msg=
+    local err_msg=
+
+    # Advance to the next option supported by the current function. Every
+    # visited word is dropped from the remaining list, so optidx stays
+    # aligned with ALL_OPTIONS on the next recursion level.
+    for opt in "${remaining[@]}"
+    do
+        let "consumed += 1"
+        echo ${FUNC_OPTIONS[$FUNCTION_INDEX]} | grep -w -- "$opt" > /dev/null
+        if [ $? -eq 0 ]
+        then
+            break
+        fi
+
+        let "optidx += 1"
+    done
+    REMAINING_OPTSTR="${remaining[*]:$consumed}"
+
+    # make test and call if no more options to forward, or go further in the option list
+    if [ $optidx == ${#ALL_OPTIONS[@]} ]
+    then
+        cmdline="--function "$PRECISION${FUNCTIONS[$FUNCTION_INDEX]}" ${CMDLINE[@]}"
+        echo ${cmdline[@]}
+        ./make-ktest ${cmdline[@]}
+        stat=$?
+        if [ $stat -ne 0 ]
+        then
+            err_msg="[ERROR]: make-ktest has failed!"
+        else
+            # check if the kernel is not the same as the last one
+            # NOTE(review): this function always runs in a subshell, so the
+            # caller never sees $kernel and LAST_KERNEL stays empty; the
+            # check below does not fire in practice. Left as is on purpose.
+            kernel=`cat *.cl`
+            if [ "${kernel[*]}" == "${LAST_KERNEL[*]}" ]
+            then
+                echo "Critical error, just the same kernel has been already generated!"
+                return 1
+            fi
+        fi
+
+        if [ $stat -eq 0 ]
+        then
+            g++ -o test ktest.cpp -I$AMDAPPSDKROOT/include -lOpenCL
+            stat=$?
+            if [ $stat -ne 0 ]
+            then
+                err_msg="[ERROR]: test compilation has failed!"
+            fi
+        fi
+
+        if [ $stat -eq 0 ]
+        then
+            msg=`./test 2>&1`
+            stat=$?
+            if [ $stat -ne 0 ]
+            then
+                err_msg="[ERROR]: test execution has failed!"
+            fi
+        fi
+
+        if [ $stat -eq 0 ]
+        then
+            # split the output into the timing part and the correctness part
+            time_msg=${msg/Correctness*/""}
+            msg=${msg##$time_msg}
+            echo $time_msg
+            echo $msg
+            echo $msg | grep "passed" > /dev/null
+            stat=$?
+            if [ $stat -ne 0 ]
+            then
+                err_msg="[ERROR]: correctness check has failed!"
+            fi
+        fi
+
+        if [ $stat -ne 0 ]
+        then
+            echo $err_msg
+            echo ${cmdline[@]} >> $REPORT_FILE.tmp
+        fi
+    else
+        local OPTION=${ALL_OPTIONS[$optidx]}
+        local OPTION_VALUES=${ALL_OPTION_VALUES[$optidx]}
+
+        let "optidx += 1"
+        cmdline=${CMDLINE[@]}
+
+        for val in ${OPTION_VALUES[@]}
+        do
+            CMDLINE=${cmdline[@]}" --$OPTION ""$val"
+            # the subshell keeps REMAINING_OPTSTR and CMDLINE changes of the
+            # deeper levels away from this level
+            (forward_options_and_call_test $optidx)
+            ret=$?
+            if [ $ret -ne 0 ]
+            then
+                break
+            fi
+
+            LAST_KERNEL=$kernel
+            rm -f *.cl > /dev/null
+        done
+    fi
+
+    return $ret
+}
+
+rm -f *.cl > /dev/null
+
+> $REPORT_FILE.tmp
+
+# test the main funtional
+for ((i = 0; i < ${#FUNCTIONS[@]}; i++))
+do
+    FUNCTION_INDEX=$i
+    for PRECISION in ${ALL_PRECISIONS[@]}
+    do
+        if [[ ${FUNCTIONS[$i]} == symv && $PRECISION == c ]]
+        then
+            continue
+        fi
+
+        CMDLINE=""
+        REMAINING_OPTSTR=${ALL_OPTIONS[@]}
+        forward_options_and_call_test 0
+    done
+done
+
+echo ==========================================================================================
+
+# test increment and offset arguments
+
+FUNC_OPTIONS=( "order transA transB M N K offA offCY"
+               "order transA side uplo diag M N offA offBX"
+               "order transA side uplo diag M N offA"
+               "order transA uplo N K offA offCY"
+               "order  transA uplo N K offA offBX offCY"
+               "order transA M N offA incx incy offA"
+               "order uplo N offA incx incy offA" )
+
+ALL_OPTION_VALUES=( "row column"
+                    "n"
+                    "n"
+                    "left"
+                    "upper"
+                    "nonunit"
+                    "64"
+                    "64"
+                    "64"
+                    "1 3 7"
+                    "1 5 9"
+                    "128"
+                    "256"
+                    "512" )
+
+for ((i = 0; i < ${#FUNCTIONS[@]}; i++))
+do
+    FUNCTION_INDEX=$i
+    for PRECISION in ${ALL_PRECISIONS[@]}
+    do
+        if [[ ${FUNCTIONS[$i]} == symv && $PRECISION == c ]]
+        then
+            continue
+        fi
+
+        CMDLINE=""
+        REMAINING_OPTSTR=${ALL_OPTIONS[@]}
+        forward_options_and_call_test 0
+    done
+done
+
+# complete the report
+report=`cat $REPORT_FILE.tmp`
+nr_fails=`cat $REPORT_FILE.tmp | wc -l`
+if [ $nr_fails == 0 ]
+then
+    echo "All tests passed" > $REPORT_FILE
+else
+    echo "Failed cases:" > $REPORT_FILE
+    echo "-----------------------------------------------------------------" >> $REPORT_FILE
+    cat $REPORT_FILE.tmp >> $REPORT_FILE
+    echo "-----------------------------------------------------------------" >> $REPORT_FILE
+    echo "Total number of failed cases: $nr_fails" >> $REPORT_FILE
+fi
+
+rm $REPORT_FILE.tmp
diff --git a/src/library/tools/ktest/step-dump.cpp b/src/library/tools/ktest/step-dump.cpp
new file mode 100644
index 0000000..a4a173e
--- /dev/null
+++ b/src/library/tools/ktest/step-dump.cpp
@@ -0,0 +1,332 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sstream>
+#include <blas_mempat.h>
+
+#include "step.h"
+
+using namespace clMath;
+
+template <typename T>
+struct FlagsDesc {      // pairs one flag value of type T with its printable name
+    T flag;             // flag value tested against a flag set
+    const char *desc;   // printable name; NULL marks the end of the tables in this file
+};
+/*
+ * Printable names of the KernelExtraFlags values, used with dumpFlags().
+ * The entry with a NULL name terminates the table.
+ */
+static const struct FlagsDesc<KernelExtraFlags> kernelExtraFlagsDesc[] = {
+    { KEXTRA_TRANS_A,                   "KEXTRA_TRANS_A" },
+    { KEXTRA_CONJUGATE_A,               "KEXTRA_CONJUGATE_A" },
+    { KEXTRA_TRANS_B,                   "KEXTRA_TRANS_B" },
+    { KEXTRA_CONJUGATE_B,               "KEXTRA_CONJUGATE_B" },
+    { KEXTRA_COLUMN_MAJOR,              "KEXTRA_COLUMN_MAJOR" },
+    { KEXTRA_UPPER_TRIANG,              "KEXTRA_UPPER_TRIANG" },
+    { KEXTRA_SIDE_RIGHT,                "KEXTRA_SIDE_RIGHT" },
+    { KEXTRA_UNIT_DIAGONAL,             "KEXTRA_UNIT_DIAGONAL" },
+    { KEXTRA_TAILS_M,                   "KEXTRA_TAILS_M" },
+    { KEXTRA_TAILS_N,                   "KEXTRA_TAILS_N" },
+    { KEXTRA_TAILS_K,                   "KEXTRA_TAILS_K" },
+    { KEXTRA_BETA_ZERO,                 "KEXTRA_BETA_ZERO" },
+    { KEXTRA_NO_COPY_VEC_A,             "KEXTRA_NO_COPY_VEC_A" },
+    { KEXTRA_NO_COPY_VEC_B,             "KEXTRA_NO_COPY_VEC_B" },
+    { KEXTRA_NO_COPY_VEC_C,             "KEXTRA_NO_COPY_VEC_C" },
+    { KEXTRA_SYRK_SEPARATE_DIAGONAL,    "KEXTRA_SYRK_SEPARATE_DIAGONAL" },
+    { KEXTRA_SYRK_EVALUATE_DIAGONAL,    "KEXTRA_SYRK_EVALUATE_DIAGONAL" },
+    { KEXTRA_SYRK_2K_RANK,              "KEXTRA_SYRK_2K_RANK" },
+    { KEXTRA_INCX_ONE,                  "KEXTRA_INCX_ONE" },
+    { KEXTRA_INCY_ONE,                  "KEXTRA_INCY_ONE" },
+    { KEXTRA_ENABLE_MAD,                "KEXTRA_ENABLE_MAD" },
+    { KEXTRA_VENDOR_AMD,                "KEXTRA_VENDOR_AMD" },
+
+    { static_cast<KernelExtraFlags>(0), NULL }   // terminator
+};
+/*
+ * Printable names of the CLMemLevel values, used with dumpFlags().
+ * The entry with a NULL name terminates the table.
+ */
+static const struct FlagsDesc<CLMemLevel> memLevelFlagsDesc[] = {
+    { CLMEM_LEVEL_LDS,                  "CLMEM_LEVEL_LDS" },
+    { CLMEM_LEVEL_L1,                   "CLMEM_LEVEL_L1" },
+    { CLMEM_LEVEL_L2,                   "CLMEM_LEVEL_L2" },
+
+    { static_cast<CLMemLevel>(0), NULL }   // terminator
+};
+
+/*
+ * Writes a space separated list of the names of all flags set in `flags`
+ * to `ss`, using the NULL-terminated table `desc`.
+ *
+ * An empty flag set is written as "-". Bits without a table entry are
+ * appended as one numeric value after the named flags.
+ */
+template <typename T>
+static void
+dumpFlags(std::stringstream& ss, T flags, const struct FlagsDesc<T> *desc)
+{
+    if (flags == static_cast<T>(0)) {
+        ss << "-";
+        return;
+    }
+
+    // empty before the first item, a single space afterwards
+    const char *separator = "";
+
+    for (const struct FlagsDesc<T> *entry = desc; entry->desc != NULL; entry++) {
+        if (!(flags & entry->flag)) {
+            continue;
+        }
+        ss << separator << entry->desc;
+        separator = " ";
+        // clear the reported bits so that only unknown ones are left over
+        flags = static_cast<T>(flags & ~entry->flag);
+    }
+
+    if (flags != static_cast<T>(0)) {
+        ss << separator << flags;
+    }
+}
+
+/*
+ * Returns the name of the type used for elements of `dtype`, or an
+ * empty string for a value not handled here.
+ */
+std::string
+Step::dtypeToString(DataType dtype)
+{
+    const char *typeName = "";
+
+    switch (dtype) {
+    case TYPE_FLOAT:
+        typeName = "cl_float";
+        break;
+    case TYPE_DOUBLE:
+        typeName = "cl_double";
+        break;
+    case TYPE_COMPLEX_FLOAT:
+        typeName = "FloatComplex";
+        break;
+    case TYPE_COMPLEX_DOUBLE:
+        typeName = "DoubleComplex";
+        break;
+    default:
+        break;
+    }
+
+    return typeName;
+}
+
+/*
+ * Formats the multiplier `arg` as text according to `dtype`.
+ * Real values are written as plain numbers, complex ones as
+ * "floatComplex(re, im)" or "doubleComplex(re, im)". A dtype not handled
+ * here yields an empty string.
+ */
+std::string
+Step::multiplierToString(
+    DataType dtype,
+    ArgMultiplier arg)
+{
+    std::stringstream out;
+
+    if (dtype == TYPE_FLOAT) {
+        out << arg.argFloat;
+    }
+    else if (dtype == TYPE_DOUBLE) {
+        out << arg.argDouble;
+    }
+    else if (dtype == TYPE_COMPLEX_FLOAT) {
+        out << "floatComplex(" << arg.argFloatComplex.s[0] << ", "
+            << arg.argFloatComplex.s[1] << ")";
+    }
+    else if (dtype == TYPE_COMPLEX_DOUBLE) {
+        out << "doubleComplex(" << arg.argDoubleComplex.s[0] << ", "
+            << arg.argDoubleComplex.s[1] << ")";
+    }
+
+    return out.str();
+}
+
+/*
+ * Writes `label` followed by `value` and a line break to `ss`;
+ * SUBDIM_UNUSED is written by name instead of as a number.
+ */
+template <typename V>
+static void
+dumpSubdimField(std::stringstream& ss, const char *label, V value)
+{
+    ss << label;
+    if (value == SUBDIM_UNUSED) {
+        ss << "SUBDIM_UNUSED";
+    }
+    else {
+        ss << value;
+    }
+    ss << std::endl;
+}
+
+/*
+ * Returns a text dump of `subdim`, one field per line, or an empty
+ * string when `subdim` is NULL. bwidth is always written as a number.
+ */
+std::string
+Step::dumpSubdim(const SubproblemDim *subdim)
+{
+    std::stringstream ss;
+
+    if (subdim != NULL) {
+        dumpSubdimField(ss, "    x      = ", subdim->x);
+        dumpSubdimField(ss, "    y      = ", subdim->y);
+        ss << "    bwidth = " << subdim->bwidth << std::endl;
+        dumpSubdimField(ss, "    itemX  = ", subdim->itemX);
+        dumpSubdimField(ss, "    itemY  = ", subdim->itemY);
+    }
+
+    return ss.str();
+}
+
+/*
+ * Returns a text dump of the step's granularity: the number of work
+ * group dimensions, the work group size per dimension and the wavefront
+ * size.
+ */
+std::string
+Step::dumpPgran()
+{
+    std::stringstream ss;
+    // the address of a member is never NULL, so no NULL check is needed
+    const PGranularity *pgran = &step_.pgran;
+
+    ss << "    wgDim  = " << pgran->wgDim << std::endl;
+    ss << "    wgSize = (";
+    for (unsigned int i = 0; i < pgran->wgDim; i++) {
+        if (i != 0) {
+            ss << ", ";
+        }
+        ss << pgran->wgSize[i];
+    }
+    ss << ")" << std::endl;
+    ss << "    wfSize = " << pgran->wfSize << std::endl;
+    return ss.str();
+}
+
+/*
+ * Returns a text dump of the kernel extra information: data type, flags,
+ * kernel type and vector lengths. A dtype or kernType value not handled
+ * here leaves the value part of its line empty.
+ */
+std::string
+Step::dumpKextra()
+{
+    std::stringstream ss;
+    // the address of a member is never NULL, so no NULL check is needed
+    const CLBLASKernExtra *kextra = &kextra_;
+
+    ss << "    dtype    = ";
+    switch (kextra->dtype) {
+    case TYPE_FLOAT:
+        ss << "TYPE_FLOAT";
+        break;
+    case TYPE_DOUBLE:
+        ss << "TYPE_DOUBLE";
+        break;
+    case TYPE_COMPLEX_FLOAT:
+        ss << "TYPE_COMPLEX_FLOAT";
+        break;
+    case TYPE_COMPLEX_DOUBLE:
+        ss << "TYPE_COMPLEX_DOUBLE";
+        break;
+    }
+    ss << std::endl;
+    ss << "    flags    = ";
+    dumpFlags<KernelExtraFlags>(ss, kextra->flags, kernelExtraFlagsDesc);
+    ss << std::endl;
+    ss << "    kernType = ";
+    switch (kextra->kernType) {
+    case CLBLAS_COMPUTING_KERNEL:
+        ss << "CLBLAS_COMPUTING_KERNEL";
+        break;
+    case CLBLAS_PREP_A_KERNEL:
+        ss << "CLBLAS_PREP_A_KERNEL";
+        break;
+    case CLBLAS_PREP_B_KERNEL:
+        ss << "CLBLAS_PREP_B_KERNEL";
+        break;
+    default:
+        ; // should not be reached
+    }
+    ss << std::endl;
+    // Deprecated data
+    ss << "    vecLen   = " << kextra->vecLen << std::endl;
+    ss << "    vecLenA  = " << kextra->vecLenA << std::endl;
+    ss << "    vecLenB  = " << kextra->vecLenB << std::endl;
+    ss << "    vecLenC  = " << kextra->vecLenC << std::endl;
+    return ss.str();
+}
+
+/*
+ * Returns a text dump of the memory pattern selected for this step: its
+ * name and levels, which solver operations are provided, and the
+ * pattern specific extra data.
+ *
+ * Returns an empty string when no pattern is set. When the pattern has
+ * no extra data, the extra section is written as "-".
+ */
+std::string
+Step::dumpMemoryPattern()
+{
+    std::stringstream ss;
+    const MemoryPattern *pattern = pattern_;
+
+    // the pattern must be checked before any of its fields is read
+    if (pattern == NULL) {
+        return ss.str();
+    }
+
+    CLBLASMpatExtra *mpatExtra = static_cast<CLBLASMpatExtra*>(pattern->extra);
+
+    ss << "    name     = " << pattern->name << std::endl;
+    ss << "    nrLevels = " << pattern->nrLevels << std::endl;
+    ss << "    cuLevel  = " << pattern->cuLevel << std::endl;
+    ss << "    thLevel  = " << pattern->thLevel << std::endl;
+
+    ss << "    sops";
+    if (pattern->sops == NULL) {
+        ss << "     = -" << std::endl;
+    }
+    else {
+        ss << std::endl;
+        ss << "        genKernel             : "
+           << ((pattern->sops->genKernel != NULL) ? "yes" : "no") << std::endl;
+        ss << "        assignKargs           : "
+           << ((pattern->sops->assignKargs != NULL) ? "yes" : "no") << std::endl;
+        ss << "        isFitToLDS            : "
+           << ((pattern->sops->isFitToLDS != NULL) ? "yes" : "no") << std::endl;
+        ss << "        innerDecompositionAxis: "
+           << ((pattern->sops->innerDecompositionAxis != NULL) ? "yes" : "no") << std::endl;
+        ss << "        calcThreads           : "
+           << ((pattern->sops->calcThreads != NULL) ? "yes" : "no") << std::endl;
+        ss << "        imgPackMode           : "
+           << ((pattern->sops->imgPackMode != NULL) ? "yes" : "no") << std::endl;
+        ss << "        getFlags              : "
+           << ((pattern->sops->getFlags != NULL) ? "yes" : "no") << std::endl;
+    }
+
+    ss << "    extra";
+    if (mpatExtra == NULL) {
+        // nothing to dump; same notation as for missing solver operations
+        ss << "    = -" << std::endl;
+        return ss.str();
+    }
+    ss << std::endl;
+    ss << "        aMset  = ";
+    dumpFlags<CLMemLevel>(ss, static_cast<CLMemLevel>(mpatExtra->aMset),
+                        memLevelFlagsDesc);
+    ss << std::endl;
+    ss << "        bMset  = ";
+    dumpFlags<CLMemLevel>(ss, static_cast<CLMemLevel>(mpatExtra->bMset),
+                        memLevelFlagsDesc);
+    ss << std::endl;
+    ss << "        mobjA  = ";
+    switch (mpatExtra->mobjA) {
+    case CLMEM_GLOBAL_MEMORY:
+        ss << "CLMEM_GLOBAL_MEMORY";
+        break;
+    case CLMEM_LOCAL_MEMORY:
+        ss << "CLMEM_LOCAL_MEMORY";
+        break;
+    case CLMEM_IMAGE:
+        ss << "CLMEM_IMAGE";
+        break;
+    }
+    ss << std::endl;
+    ss << "        mobjB  = ";
+    switch (mpatExtra->mobjB) {
+    case CLMEM_GLOBAL_MEMORY:
+        ss << "CLMEM_GLOBAL_MEMORY";
+        break;
+    case CLMEM_LOCAL_MEMORY:
+        ss << "CLMEM_LOCAL_MEMORY";
+        break;
+    case CLMEM_IMAGE:
+        ss << "CLMEM_IMAGE";
+        break;
+    }
+    ss << std::endl;
+    return ss.str();
+}
diff --git a/src/library/tools/ktest/step.cpp b/src/library/tools/ktest/step.cpp
new file mode 100644
index 0000000..492e7f3
--- /dev/null
+++ b/src/library/tools/ktest/step.cpp
@@ -0,0 +1,691 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <boost/lexical_cast.hpp>
+
+#include <assert.h>
+#include <kerngen.h>
+#include <clblas-internal.h>
+#include <matrix_dims.h>
+#include <solution_seq.h>
+#include "step.h"
+
+using namespace clMath;
+
+/* This enum reflects CLBlasKargs structure, declared in clblas-internal.h.
+ * Enumerators are numbered consecutively, starting with KARG_NONE = 0.
+ * NOTE(review): the names left as comments (kernType, dtype, addrBits)
+ * look like structure fields that have no enumerator - confirm against
+ * clblas-internal.h.
+ */
+typedef enum StepKarg {
+    KARG_NONE = 0,
+    // kernType
+    // dtype
+    KARG_ORDER,
+    KARG_SIDE,
+    KARG_UPLO,
+    KARG_TRANS_A,
+    KARG_TRANS_B,
+    KARG_DIAG,
+    KARG_M,
+    KARG_N,
+    KARG_K,
+    KARG_ALPHA,
+    KARG_A,
+    KARG_LDA,
+    KARG_B,
+    KARG_LDB,
+    KARG_BETA,
+    KARG_C,
+    KARG_LDC,
+    // addrBits
+    KARG_OFFSET_M,
+    KARG_OFFSET_N,
+    KARG_OFFSET_K,
+    KARG_SCIMAGE_0,
+    KARG_SCIMAGE_1,
+    KARG_OFF_A,
+    KARG_OFF_BX,
+    KARG_OFF_CY
+} StepKarg;
+
+/*
+ * Creates an empty step for BLAS function `funcID` on `device`.
+ * The step and kernel extra data are zeroed, the device is identified
+ * and the buffer arguments are set to the BUFFER_A/B/C placeholders.
+ * For SYR2K the rank-2k flag is set right away.
+ */
+Step::Step(
+    BlasFunctionID funcID,
+    cl_device_id device) :
+        naiveCall_(""), compareCall_(""), postRandomCall_(""), kernelName_("")
+{
+    memset(&kextra_, 0, sizeof(kextra_));
+    memset(&step_, 0, sizeof(step_));
+
+    step_.funcID = funcID;
+    step_.device.id = device;
+    identifyDevice(&step_.device);
+
+    step_.args.A = (cl_mem)BUFFER_A;
+    step_.args.B = (cl_mem)BUFFER_B;
+    step_.args.C = (cl_mem)BUFFER_C;
+
+    if (blasFunctionID() != CLBLAS_SYR2K) {
+        return;
+    }
+
+    // keep the step's flags in sync with the kernel extra flags
+    kextra_.flags =
+        static_cast<KernelExtraFlags>(kextra_.flags | KEXTRA_SYRK_2K_RANK);
+    step_.extraFlags = kextra_.flags;
+}
+
+/*
+ * Creates a step as a copy of the SolutionStep that contains list node
+ * `node`. The kernel extra data takes its data type and flags from the
+ * copied step; the kernel type is set to the computing kernel.
+ */
+Step::Step(ListNode *node) :
+    naiveCall_(""), compareCall_(""), postRandomCall_(""), kernelName_("")
+{
+    SolutionStep *source = container_of(node, node, SolutionStep);
+
+    memcpy(&step_, source, sizeof(step_));
+    memset(&kextra_, 0, sizeof(kextra_));
+
+    kextra_.kernType = CLBLAS_COMPUTING_KERNEL;
+    kextra_.dtype = step_.args.dtype;
+    kextra_.flags = step_.extraFlags;
+}
+
+/*
+ * Deletes all array variables and variables held by the step and
+ * empties its containers.
+ */
+Step::~Step()
+{
+    ArrayVarList::iterator arrayIt = arrays_.begin();
+    while (arrayIt != arrays_.end()) {
+        delete (*arrayIt);
+        ++arrayIt;
+    }
+
+    VarList::iterator varIt = vars_.begin();
+    while (varIt != vars_.end()) {
+        delete (*varIt);
+        ++varIt;
+    }
+
+    vars_.clear();
+    arrays_.clear();
+    buffers_.clear();
+    kargMap_.clear();
+}
+
+/*
+ * Returns the BLAS function ID of the SolutionStep that contains list
+ * node `node`.
+ */
+BlasFunctionID
+Step::getStepNodeFuncID(ListNode *node)
+{
+    SolutionStep *owner = container_of(node, node, SolutionStep);
+
+    return owner->funcID;
+}
+/*
+ * Completes the description of a single step: fills in the kernel extra
+ * flags, selects the memory pattern, sets up or checks the granulation
+ * for the decomposition held in step_.subdims, and finally detects
+ * tails and offsets and selects the vectorization.
+ */
+void
+Step::completeDecompositionSingle()
+{
+    cl_int err;
+
+    kextra_.dtype = kargs().dtype;
+    kextra_.kernType = CLBLAS_COMPUTING_KERNEL;
+    kextra_.flags = (KernelExtraFlags)(kextra_.flags |
+            clblasArgsToKextraFlags(&step_.args, blasFunctionID()));
+    if (deviceVendor(device()) == "Advanced Micro Devices, Inc.") {
+        kextra_.flags = static_cast<KernelExtraFlags>
+            (kextra_.flags | KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD);
+    }
+
+    step_.pgran.wfSize = deviceWavefront(device(), &err);   // NOTE(review): err is never checked
+
+    step_.extraFlags = kextra_.flags;
+    step_.patternID = selectPattern(&step_, 0);
+    pattern_ = &clblasSolvers[step_.funcID].memPatterns[step_.patternID];
+
+    if (0 == step_.subdims[0].bwidth   /* NOTE(review): all three conditions
+                                        * test the same field; other fields
+                                        * were possibly intended - confirm */
+            && 0 == step_.subdims[0].bwidth
+            && 0 == step_.subdims[0].bwidth) {
+        getStepGranulation(&step_);
+
+    }
+    else if (pattern_->sops->checkCalcDecomp) {
+        pattern_->sops->checkCalcDecomp(&step_.pgran, step_.subdims, 2,
+                                        kextra_.dtype, PGRAN_CALC);
+    }
+    else {
+        size_t wgX, wgY;
+        size_t x0, y0;
+        SolverFlags sflags;
+
+        // Set up granulation for given dimensions
+
+        wgY = step_.subdims[0].y/ step_.subdims[1].y;
+        wgX = step_.subdims[0].x/ step_.subdims[1].x;
+
+        x0 = step_.subdims[0].x;
+        y0 = step_.subdims[0].y;
+
+        if (funcBlasLevel(blasFunctionID()) == 2) {
+            /* Level 2 decomposition size for vectors (dims[0].x) is 1.
+             * We have to "restore" it to proceed.
+             */
+            size_t xBlocks;
+
+            xBlocks = step_.subdims[0].bwidth / step_.subdims[1].bwidth;
+            x0 = step_.subdims[1].x * xBlocks;
+        }
+
+        /*
+         * adjust local size if a subproblem is not divisible
+         * between all local threads
+         */
+        for (; (wgY > 1) && (y0 < wgY); wgY /= 2) { }
+        for (; (wgX > 1) && (x0 < wgX); wgX /= 2) { }
+
+        sflags = pattern_->sops->getFlags();
+        if (sflags & SF_WSPACE_2D) {
+            step_.pgran.wgDim = 2;
+            step_.pgran.wgSize[0] = (unsigned int)wgY;
+            step_.pgran.wgSize[1] = (unsigned int)wgX;
+        }
+        else {
+            step_.pgran.wgDim = 1;
+            step_.pgran.wgSize[0] = (unsigned int)(wgX * wgY);
+            step_.pgran.wgSize[1] = 1;
+        }
+
+        // fixup work group size in respect with desired work dispatch order
+        if ((step_.pgran.wgDim == 2) && pattern_->sops->innerDecompositionAxis) {
+            if (pattern_->sops->innerDecompositionAxis(&step_.args) == DECOMP_AXIS_X) {
+                unsigned int u;
+
+                u = step_.pgran.wgSize[0];
+                step_.pgran.wgSize[0] = step_.pgran.wgSize[1];
+                step_.pgran.wgSize[1] = u;
+            }
+        }
+
+        /* Check that dimensions are bigger than whole problem size */
+        if (dimensionsExceedProblemSize(&step_)) {
+            getMinimalStepGranulation(&step_);
+        }
+    }
+    detectProblemTails(&step_);
+    kextra_.flags = step_.extraFlags;   // flags are copied between step_ and kextra_ around each call below
+    if (pattern_->sops->fixupArgs) {
+        pattern_->sops->fixupArgs(&step_.args, &step_.subdims[0], &kextra_);
+}
+    step_.extraFlags = kextra_.flags;
+    detectOffsets(&step_);
+    kextra_.flags = step_.extraFlags;
+    selectVectorization(&step_, &kextra_);
+}
+
+/*
+ * Appends a heap allocated copy of this step to the list `seq` and
+ * decomposes it with decomposeProblemStep(). The buffer arguments are
+ * reset to the BUFFER_A/B/C placeholders first.
+ *
+ * `platform` is not used. When the copy cannot be allocated, `seq` is
+ * left unchanged.
+ */
+void
+Step::makeSolutionSequence(ListHead *seq, cl_platform_id platform)
+{
+    SolutionStep *newStep;
+
+    (void)platform;
+
+    step_.args.A = (cl_mem)BUFFER_A;
+    step_.args.B = (cl_mem)BUFFER_B;
+    step_.args.C = (cl_mem)BUFFER_C;
+
+    newStep = (SolutionStep*)malloc(sizeof(SolutionStep));
+    if (newStep == NULL) {
+        // out of memory: do not copy into a NULL pointer
+        return;
+    }
+    memcpy(newStep, &step_, sizeof(SolutionStep));
+    listAddToTail(seq, &newStep->node);
+    decomposeProblemStep(newStep);
+}
+/* Releases the solution sequence `seq` by forwarding it to freeSolutionSeq(). */
+void
+Step::freeSolutionSequence(ListHead *seq)
+{
+    freeSolutionSeq(seq);
+}
+
+/*
+ * Generates the kernel source for this step.
+ *
+ * The returned text starts with a comment block dumping the subproblem
+ * dimensions, the granularity, the kernel extra data and the memory
+ * pattern, followed by the source produced by the pattern's genKernel
+ * operation.
+ *
+ * Returns an empty string when the pattern has no genKernel operation or
+ * when generation fails.
+ */
+std::string
+Step::generate()
+{
+    ssize_t size;
+    char *buf;
+    std::stringstream ss;
+
+    if ((pattern_->sops == NULL) || (pattern_->sops->genKernel == NULL)) {
+        return "";
+    }
+
+    ss << "/*" << std::endl;
+    for (int i = 0; i < MAX_SUBDIMS; i++) {
+        ss << "SubproblemDim[" << i << "]" << std::endl;
+        ss << dumpSubdim(step_.subdims + i) << std::endl;
+    }
+    ss << "PGranularity" << std::endl;
+    ss << dumpPgran() << std::endl;
+    ss << "CLBLASKernExtra" << std::endl;
+    ss << dumpKextra() << std::endl;
+    ss << "MemoryPattern" << std::endl;
+    ss << dumpMemoryPattern();
+    ss << "*/" << std::endl << std::endl;
+
+    // the first call, without a buffer, is used to learn the buffer size
+    size = pattern_->sops->genKernel(NULL, 0, step_.subdims, &step_.pgran,
+        static_cast<void*>(&kextra_));
+    if (size <= 0) {
+        // was "return 0;", which builds a std::string from a null pointer
+        return "";
+    }
+    buf = new char[size + 1];
+    if (pattern_->sops->genKernel(buf, size, step_.subdims, &step_.pgran,
+                static_cast<void*>(&kextra_)) != size) {
+        delete[] buf;
+        return "";
+    }
+    // make sure the text is terminated before it is streamed
+    buf[size] = '\0';
+    ss << buf;
+
+    delete[] buf;
+    return ss.str();
+}
+
+/*
+ * Replace the BLAS arguments stored in the wrapped SolutionStep with a copy
+ * of kargs.
+ */
+void
+Step::setKargs(const CLBlasKargs& kargs)
+{
+    step_.args = kargs;
+}
+
+/*
+ * Map the step's BLAS function identifier to its lower-case routine name.
+ * Identifiers without an entry below yield an empty string.
+ */
+const char*
+Step::getBlasFunctionName()
+{
+    const char *funcName = "";
+
+    switch (blasFunctionID()) {
+    case CLBLAS_GEMV:
+        funcName = "gemv";
+        break;
+    case CLBLAS_SYMV:
+        funcName = "symv";
+        break;
+    case CLBLAS_GEMM:
+        funcName = "gemm";
+        break;
+    case CLBLAS_TRMM:
+        funcName = "trmm";
+        break;
+    case CLBLAS_TRSM:
+        funcName = "trsm";
+        break;
+    case CLBLAS_SYRK:
+        funcName = "syrk";
+        break;
+    case CLBLAS_SYR2K:
+        funcName = "syr2k";
+        break;
+    default:
+        break;
+    }
+
+    return funcName;
+}
+
+/*
+ * Copy MAX_SUBDIMS subproblem dimensions from subdims into the wrapped
+ * SolutionStep. The caller must supply at least MAX_SUBDIMS entries.
+ */
+void
+Step::setDecomposition(
+    const SubproblemDim *subdims)
+{
+    size_t level = 0;
+
+    while (level < MAX_SUBDIMS) {
+        step_.subdims[level] = subdims[level];
+        ++level;
+    }
+}
+
+/*
+ * Create a Variable with the given name, type and textual default value,
+ * append it to the step's variable list and return it.
+ */
+Variable*
+Step::addVar(
+    const std::string& name,
+    const std::string& type,
+    const std::string& defaultValue)
+{
+    vars_.push_back(new Variable(name, type, defaultValue));
+    return vars_.back();
+}
+
+/*
+ * Same as addVar() with a textual value, but the new variable is also
+ * flagged as constant before it is returned.
+ */
+Variable*
+Step::addConst(
+    const std::string& name,
+    const std::string& type,
+    const std::string& defaultValue)
+{
+    Variable *constant = addVar(name, type, defaultValue);
+
+    constant->setConstant(true);
+    return constant;
+}
+
+/*
+ * Add a variable whose default value is the decimal text of an unsigned
+ * size value.
+ */
+Variable*
+Step::addVar(
+    const std::string& name,
+    const std::string& type,
+    size_t value)
+{
+    const std::string text = boost::lexical_cast<std::string>(value);
+
+    return addVar(name, type, text);
+}
+
+/*
+ * Add a constant whose value is the decimal text of an unsigned size value.
+ */
+Variable*
+Step::addConst(
+    const std::string& name,
+    const std::string& type,
+    size_t value)
+{
+    const std::string text = boost::lexical_cast<std::string>(value);
+
+    return addConst(name, type, text);
+}
+
+/*
+ * Add a variable whose default value is the decimal text of a signed
+ * integer.
+ */
+Variable*
+Step::addVar(
+    const std::string& name,
+    const std::string& type,
+    int value)
+{
+    const std::string text = boost::lexical_cast<std::string>(value);
+
+    return addVar(name, type, text);
+}
+
+/*
+ * Add a constant whose value is the decimal text of a signed integer.
+ */
+Variable*
+Step::addConst(
+    const std::string& name,
+    const std::string& type,
+    int value)
+{
+    const std::string text = boost::lexical_cast<std::string>(value);
+
+    return addConst(name, type, text);
+}
+
+/*
+ * Create a matrix variable initialised to "NULL", attach the variables that
+ * describe its rows, columns, leading dimension and offset, register it in
+ * the step's host array list and return it.
+ */
+MatrixVariable*
+Step::addMatrix(
+    const std::string& name,
+    const std::string& type,
+    Variable *rows,
+    Variable *columns,
+    Variable *ld,
+    Variable *off)
+{
+    MatrixVariable *matrix = new MatrixVariable(name, type, "NULL");
+
+    matrix->setMatrixSize(rows, columns, ld, off);
+    arrays_.push_back(matrix);
+
+    return matrix;
+}
+
+/*
+ * Create a vector variable initialised to "NULL", attach the variables that
+ * describe its element count, increment and offset, register it in the
+ * step's host array list and return it.
+ */
+VectorVariable*
+Step::addVector(
+    const std::string& name,
+    const std::string& type,
+    Variable *N,
+    Variable *inc,
+    Variable *off)
+{
+    VectorVariable *vec = new VectorVariable(name, type, "NULL");
+
+    vec->setVectorSize(N, inc, off);
+    arrays_.push_back(vec);
+
+    return vec;
+}
+
+/*
+ * Declare a variable that stands for an OpenCL buffer.
+ *
+ * The variable is created through addVar() with the default value "NULL",
+ * marked as a buffer, given its memory flags, host array and buffer
+ * identifier, and finally registered in the step's buffer list.
+ */
+Variable*
+Step::addBuffer(
+    BufferID bufID,
+    const std::string& name,
+    const std::string& type,
+    cl_mem_flags flags,
+    ArrayVariableInterface* hostPtr)
+{
+    Variable *buffer = addVar(name, type, "NULL");
+
+    buffer->setIsBuffer(true);
+    buffer->setFlags(flags);
+    buffer->setHostPtr(hostPtr);
+    buffer->setBufferID(bufID);
+
+    buffers_.push_back(buffer);
+    return buffer;
+}
+
+/*
+ * Find the first registered buffer variable whose buffer identifier equals
+ * bufID. Returns NULL when the step has no such buffer.
+ */
+Variable*
+Step::getBuffer(BufferID bufID)
+{
+    VarList::iterator it = buffers_.begin();
+
+    while (it != buffers_.end()) {
+        Variable *candidate = *it;
+
+        if (candidate->getBufID() == bufID) {
+            return candidate;
+        }
+        ++it;
+    }
+
+    return NULL;
+}
+
+
+/*
+ * Record var as the kernel argument at position index. An entry already
+ * stored for the same index is overwritten.
+ */
+void
+Step::setKernelArg(
+    unsigned int index,
+    const Variable *var)
+{
+    kargMap_[index] = var;
+}
+
+/*
+ * Build the textual expression for the number of elements of a matrix:
+ *
+ *     [<off> + ][<ld> * ]<columns or rows>
+ *
+ * The last factor is the column count for column-major order and the row
+ * count otherwise. The offset and leading-dimension parts are emitted only
+ * when the matrix has them. Returns an empty string if the matrix has no
+ * rows or no columns variable.
+ */
+std::string
+Step::matrixSize(MatrixVariable *matrix)
+{
+    if (!((matrix->rows() != NULL) && (matrix->columns() != NULL))) {
+        return "";
+    }
+
+    std::stringstream expr;
+
+    if (matrix->off() != NULL) {
+        expr << matrix->off()->name() << " + ";
+    }
+    if (matrix->ld() != NULL) {
+        expr << matrix->ld()->name() << " * ";
+    }
+
+    expr << ((step_.args.order == clblasColumnMajor)
+                ? matrix->columns()->name()
+                : matrix->rows()->name());
+
+    return expr.str();
+}
+
+/*
+ * Build the textual expression for the number of elements of a vector:
+ *
+ *     [<off> + ]<N>                              when there is no increment
+ *     [<off> + ]1 + (<N> - 1) * abs(<inc>)       otherwise
+ *
+ * Returns an empty string if the vector has no element-count variable.
+ */
+std::string
+Step::vectorSize(VectorVariable *vector)
+{
+    if (vector->nElems() == NULL) {
+        return "";
+    }
+
+    std::stringstream expr;
+
+    if (vector->off() != NULL) {
+        expr << vector->off()->name() << " + ";
+    }
+
+    if (vector->inc() != NULL) {
+        expr << "1 + (" << vector->nElems()->name() << " - 1) * abs("
+             << vector->inc()->name() << ")";
+    }
+    else {
+        expr << vector->nElems()->name();
+    }
+
+    return expr.str();
+}
+
+/*
+ * Discover which variable feeds each kernel argument position.
+ *
+ * Every field of a scratch CLBlasKargs is filled with a distinct KARG_* tag
+ * instead of a real value. The pattern's assignKargs callback is then run on
+ * it, and the first data byte of each produced KernelArg is decoded back into
+ * a tag. The Variable that `map` associates with that tag is recorded for the
+ * argument index through setKernelArg().
+ *
+ * Does nothing when the pattern has no assignKargs callback.
+ */
+void
+Step::assignKargs(const StepKargs& map)
+{
+    CLBlasKargs args;
+    KernelArg kargsList[MAX_KERNEL_ARGS];
+    Variable *v;
+
+    if ((pattern_->sops == NULL) || (pattern_->sops->assignKargs == NULL)) {
+        return;
+    }
+
+    // Byte-fill the whole list with KARG_NONE so untouched entries are
+    // recognisable. NOTE(review): the loop below stops at typeSize == 0,
+    // which presumably relies on KARG_NONE being 0 - confirm.
+    memset(&kargsList, KARG_NONE, sizeof(kargsList));
+
+    args.kernType = CLBLAS_COMPUTING_KERNEL;
+    args.dtype = TYPE_COMPLEX_DOUBLE;
+    args.addrBits = 0;
+
+    // From here on each field carries its own tag rather than a real value.
+    args.order = static_cast<clblasOrder>(KARG_ORDER);
+    args.side = static_cast<clblasSide>(KARG_SIDE);
+    args.uplo = static_cast<clblasUplo>(KARG_UPLO);
+    args.transA = static_cast<clblasTranspose>(KARG_TRANS_A);
+    args.transB = static_cast<clblasTranspose>(KARG_TRANS_B);
+    args.diag = static_cast<clblasDiag>(KARG_DIAG);
+
+    args.M = KARG_M;
+    args.N = KARG_N;
+    args.K = KARG_K;
+
+    args.lda.matrix = KARG_LDA;
+    args.ldb.matrix = KARG_LDB;
+    args.ldc.matrix = KARG_LDC;
+
+    args.offsetM = KARG_OFFSET_M;
+    args.offsetN = KARG_OFFSET_N;
+    args.offsetK = KARG_OFFSET_K;
+    args.offA = KARG_OFF_A;
+    args.offBX = KARG_OFF_BX;
+    args.offCY = KARG_OFF_CY;
+
+    args.A = reinterpret_cast<cl_mem>(KARG_A);
+    args.B = reinterpret_cast<cl_mem>(KARG_B);
+    args.C = reinterpret_cast<cl_mem>(KARG_C);
+
+    // The multipliers are byte-filled, so every byte of them equals the tag.
+    memset(&args.alpha, KARG_ALPHA, sizeof(args.alpha));
+    memset(&args.beta, KARG_BETA, sizeof(args.beta));
+
+    args.scimage[0] = reinterpret_cast<cl_mem>(KARG_SCIMAGE_0);
+    args.scimage[1] = reinterpret_cast<cl_mem>(KARG_SCIMAGE_1);
+
+    pattern_->sops->assignKargs(kargsList, static_cast<void*>(&args), &kextra_);
+
+    // Walk the produced arguments until the first entry with typeSize == 0.
+    // NOTE(review): reading the tag from data[0] assumes the tag lands in the
+    // first byte of the stored value (true for little-endian integers and
+    // pointers) - confirm for other targets.
+    for (unsigned int i = 0; (i < MAX_KERNEL_ARGS) && (kargsList[i].typeSize != 0); i++) {
+        switch (static_cast<StepKarg>(kargsList[i].arg.data[0])) {
+        case KARG_M:
+            v = map.M;
+            break;
+        case KARG_N:
+            v = map.N;
+            break;
+        case KARG_K:
+            v = map.K;
+            break;
+        case KARG_ALPHA:
+            v = map.alpha;
+            break;
+        case KARG_A:
+            v = map.A;
+            break;
+        case KARG_LDA:
+            v = map.lda;
+            break;
+        case KARG_B:
+            v = map.B;
+            break;
+        case KARG_LDB:
+            v = map.ldb;
+            break;
+        case KARG_BETA:
+            v = map.beta;
+            break;
+        case KARG_C:
+            v = map.C;
+            break;
+        case KARG_LDC:
+            v = map.ldc;
+            break;
+        case KARG_OFFSET_M:
+            v = map.offsetM;
+            break;
+        case KARG_OFFSET_N:
+            v = map.offsetN;
+            break;
+        case KARG_OFFSET_K:
+            v = map.offsetK;
+            break;
+        case KARG_SCIMAGE_0:
+            v = map.scimage0;
+            break;
+        case KARG_SCIMAGE_1:
+            v = map.scimage1;
+            break;
+        case KARG_OFF_A:
+            v = map.offA;
+            break;
+        case KARG_OFF_BX:
+            v = map.offBX;
+            break;
+        case KARG_OFF_CY:
+            v = map.offCY;
+            break;
+        default:
+            // KARG_ORDER, KARG_SIDE, KARG_UPLO, KARG_TRANS_A, KARG_TRANS_B,
+            // KARG_DIAG
+            v = NULL;
+            break;
+        }
+        // Tags without a variable, and map entries left NULL, are skipped.
+        if (v != NULL) {
+            setKernelArg(i, v);
+        }
+    }
+}
+
+/*
+ * Compute the global work size of the step and return it as a
+ * comma-separated list with one entry per work-group dimension (pgran().wgDim).
+ *
+ * The pattern's calcThreads callback is used when present. Otherwise the
+ * size is derived with calcGlobalThreads() from the problem dimensions.
+ */
+std::string
+Step::globalWorkSize()
+{
+    size_t globalWorkSize[MAX_WORK_DIM] = { 0, 0, 0 };
+    std::stringstream ss;
+    SubproblemDim dims[MAX_SUBDIMS];
+
+    // Work on a private copy: the fallback path below may swap X and Y.
+    memcpy(dims, step_.subdims, sizeof(dims));
+
+    if (pattern_->sops->calcThreads) {
+        pattern_->sops->calcThreads(globalWorkSize, step_.subdims,
+                                    &step_.pgran, &step_.args, &kextra_);
+    }
+    else {
+        SubproblemDim globDim;
+        const PGranularity *pg;
+
+        // Single-level patterns pass no granularity to calcGlobalThreads().
+        pg = (pattern_->nrLevels == 1) ? NULL : &step_.pgran;
+        kargsToProbDims(&globDim, blasFunctionID(), &step_.args, false);
+
+        // fixup dimensions in respect with desired work dispatch order
+        if ((pgran().wgDim == 2) && pattern_->sops->innerDecompositionAxis) {
+            if (pattern_->sops->innerDecompositionAxis(&step_.args) ==
+                DECOMP_AXIS_X) {
+
+                /*
+                 * these dimensions will not be used more anywhere, so we can
+                 * just swap them
+                 */
+                swapDimXY(&dims[0]);
+                swapDimXY(&dims[1]);
+                swapDimXY(&globDim);
+            }
+        }
+
+        calcGlobalThreads(globalWorkSize, dims, pg, globDim.y, globDim.x);
+    }
+
+    // Emit only the first wgDim entries, separated by ", ".
+    for (unsigned int i = 0; i < pgran().wgDim; i++) {
+        if (i != 0) {
+            ss << ", ";
+        }
+        ss << globalWorkSize[i];
+    }
+
+    return ss.str();
+}
+
+/*
+ * Store the kernel name used for this step.
+ */
+void
+Step::setKernelName(std::string name)
+{
+    kernelName_ = name;
+}
+
+/*
+ * Query the CL_DEVICE_VENDOR string of a device.
+ *
+ * The required size is queried first and the value is then read into a
+ * temporary buffer. Returns an empty string if either clGetDeviceInfo call
+ * fails.
+ */
+std::string
+Step::deviceVendor(cl_device_id device)
+{
+    size_t vendorSize;
+
+    if (clGetDeviceInfo(device, CL_DEVICE_VENDOR, 0, NULL, &vendorSize)
+            != CL_SUCCESS) {
+        return "";
+    }
+
+    std::string result = "";
+    char *raw = new char[vendorSize + 1];
+
+    if (clGetDeviceInfo(device, CL_DEVICE_VENDOR, vendorSize, raw, NULL)
+            == CL_SUCCESS) {
+        result = raw;
+    }
+    delete[] raw;
+
+    return result;
+}
diff --git a/src/library/tools/ktest/step.h b/src/library/tools/ktest/step.h
new file mode 100644
index 0000000..7148c72
--- /dev/null
+++ b/src/library/tools/ktest/step.h
@@ -0,0 +1,481 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_PATTERN_H__
+#define KTEST_PATTERN_H__
+
+#include <CL/cl.h>
+#include <list>
+#include <map>
+#include <string>
+
+#include <clblas-internal.h>
+#include <blas_funcs.h>
+#include <granulation.h>
+#include <kernel_extra.h>
+#include <solution_seq.h>
+#include <mempat.h>
+#include <list.h>
+#include "var.h"
+
+namespace clMath {
+
+// This structure reflects the CLBlasKargs structure declared in
+// clblas-internal.h. Each member points to the Variable that supplies the
+// kernel argument of the same name. The names that appear only as comments
+// mark CLBlasKargs fields that have no Variable member in this structure.
+typedef struct StepKargs {
+    // kernType
+    // dtype
+    // order
+    // side
+    // uplo
+    // transA
+    // transB
+    // diag
+    Variable *M;
+    Variable *N;
+    Variable *K;
+    Variable *alpha;
+    Variable *A;
+    Variable *lda;
+    Variable *B;
+    Variable *ldb;
+    Variable *beta;
+    Variable *C;
+    Variable *ldc;
+    // addrBits
+    Variable *offsetM;
+    Variable *offsetN;
+    Variable *offsetK;
+    Variable *scimage0;
+    Variable *scimage1;
+    Variable *offA;
+    Variable *offBX;
+    Variable *offCY;
+} Kargs;
+
+// List of scalar/buffer variables declared by a step.
+typedef std::list<Variable*> VarList;
+// List of host arrays (matrices and vectors) declared by a step.
+typedef std::list<ArrayVariableInterface*> ArrayVarList;
+// Kernel argument index -> variable passed at that position.
+typedef std::map<unsigned int, const Variable*> KArgMap;
+
+/**
+ * @internal
+ * @brief SolutionStep wrapper object
+ * @ingroup MAKE_KTEST
+ *
+ * Objects of this class are used for problem decomposition. Each Step object
+ * contains a single SolutionStep structure. When the multikernel feature is
+ * disabled there is always exactly one solution step. In the multikernel case
+ * there is one master step storing the arguments of the original problem, and
+ * inner steps which are built from the nodes of the solution sequence list
+ * filled in by the makeSolutionSequence call.
+ *
+ */
+class Step {
+private:
+    /** @internal @brief Kernel extra data passed to the pattern callbacks */
+    CLBLASKernExtra kextra_;
+    /** @internal @brief OpenCL platform identifier */
+    cl_platform_id platform_;
+
+    /** @internal @brief Variables declared by this step */
+    VarList vars_;
+    /** @internal @brief Host matrices and vectors declared by this step */
+    ArrayVarList arrays_;
+    /** @internal @brief Variables standing for OpenCL buffers */
+    VarList buffers_;
+    /**
+     * @internal
+     * @brief Kernel arguments map
+     *
+     * Contains variables objects for arguments of step kernel,
+     * in respective order.
+     */
+    KArgMap kargMap_;
+
+    // Helpers producing the textual dumps placed in the generated kernel
+    // header comment.
+    std::string dumpMemoryPattern();
+    std::string dumpSubdim(const SubproblemDim *subdim);
+    std::string dumpPgran();
+    std::string dumpKextra();
+
+    /** @internal @brief Get the device identifier stored in the step */
+    cl_device_id device()                       { return step_.device.id; };
+    /** @internal @brief Record the variable used at a kernel argument index */
+    void setKernelArg(unsigned int index, const Variable *var);
+
+protected:
+    /**
+     * @internal
+     * @brief Associated SolutionStep structure
+     */
+    SolutionStep step_;
+    /**
+     * @internal
+     * @brief Selected memory pattern pointer
+     */
+    MemoryPattern* pattern_;
+    /**
+     * @internal
+     * @brief Naive call string
+     *
+     * This string contains the naive call for processing the step problem.
+     * It is used in the master step.
+     */
+    std::string naiveCall_;
+    /**
+     * @internal
+     * @brief Comparison call string
+     *
+     * This string contains the comparison call for the processed matrices.
+     * It is used in the master step.
+     */
+    std::string compareCall_;
+    /**
+     * @internal
+     * @brief Post process matrices
+     *
+     * This string contains the function call for post-processing matrices
+     * after filling them with random data. Can be used in the master step.
+     */
+    std::string postRandomCall_;
+    /**
+     * @internal
+     * @brief Step kernel name
+     */
+    std::string kernelName_;
+
+
+    /**
+     * @internal
+     * @brief Add variable into step variables list.
+     *
+     * Variable value is given by string.
+     */
+    Variable* addVar(const std::string& name, const std::string& type,
+        const std::string& defaultValue = "");
+    /**
+     * @internal
+     * @brief Add variable into step variables list.
+     *
+     * Variable value is given by unsigned value.
+     */
+    Variable* addVar(const std::string& name, const std::string& type,
+        size_t value);
+    /**
+     * @internal
+     * @brief Add variable into step variables list.
+     *
+     * Variable value is given by signed integer.
+     */
+    Variable* addVar(const std::string& name, const std::string& type,
+        int value);
+    /**
+     * @internal
+     * @brief Add constant variable into step variables list.
+     *
+     * Constant value is given by string.
+     */
+    Variable* addConst(const std::string& name, const std::string& type,
+        const std::string& defaultValue);
+    /**
+     * @internal
+     * @brief Add constant variable into step variables list.
+     *
+     * Constant value is given by unsigned value.
+     */
+    Variable* addConst(const std::string& name, const std::string& type,
+        size_t value);
+    /**
+     * @internal
+     * @brief Add constant variable into step variables list.
+     *
+     * Constant value is given by signed integer.
+     */
+    Variable* addConst(const std::string& name, const std::string& type,
+        int value);
+    /**
+     * @internal
+     * @brief Add matrix array into step host arrays list.
+     */
+    MatrixVariable* addMatrix(const std::string& name, const std::string& type,
+        Variable *rows, Variable *columns, Variable *ld, Variable *off = NULL);
+    /**
+     * @internal
+     * @brief Add vector into step host arrays list.
+     */
+    VectorVariable* addVector(const std::string& name, const std::string& type,
+        Variable *N, Variable *inc, Variable *off = NULL);
+    /**
+     * @internal
+     * @brief Add variable for OpenCL buffer into step buffers list.
+     */
+    Variable* addBuffer(BufferID bufID, const std::string& name,
+        const std::string& type, cl_mem_flags flags,
+        ArrayVariableInterface* hostPtr);
+
+    /**
+     * @internal
+     * @brief Assign kernel arguments
+     *
+     * Run pattern assign-kernel-arguments function and get information about
+     * used variables and their order which is used for generating kernel test
+     * code.
+     */
+    void assignKargs(const Kargs& kargs);
+
+    /**
+     * @internal
+     * @brief Get device vendor string
+     */
+    static std::string deviceVendor(cl_device_id device);
+
+public:
+    /**
+     * @internal
+     * @brief Constructor for master step
+     *
+     * @param[in] funcID          Function identifier
+     * @param[in] device          Device identifier
+     *
+     * Uses function id and device to compose step object. It is used for
+     * master step.
+     *
+     */
+    Step(BlasFunctionID funcID, cl_device_id device);
+    /**
+     * @internal
+     * @brief Constructor for inner step
+     *
+     * @param[in] node            Solution sequence list node
+     *
+     * Uses solution sequence node to compose step object. It is used for
+     * making inner steps from the solution sequence list filled in by
+     * makeSolutionSequence.
+     *
+     */
+    Step(ListNode *node);
+    /**
+     * @internal
+     * @brief Step destructor
+     */
+    virtual ~Step();
+
+    /**
+     * @internal
+     * @brief Get step variables list
+     */
+    const VarList& vars() const                 { return vars_; };
+    /**
+     * @internal
+     * @brief Get step host arrays list
+     */
+    const ArrayVarList& arrays() const          { return arrays_; };
+    /**
+     * @internal
+     * @brief Get step OpenCL buffers list
+     */
+    const VarList& buffers() const              { return buffers_; };
+
+    /**
+     * @internal
+     * @brief Fix leading dimensions to fit matrices sizes
+     */
+    virtual void fixLD() = 0;
+    /**
+     * @internal
+     * @brief Declare variables
+     *
+     * @param[in] masterStep      Master step object
+     *
+     * Add function-specific variables and fill comparison call and naive
+     * implementation call strings. Master step object is used for handling
+     * buffers A, B, C rearrangement.
+     *
+     */
+    virtual void declareVars(Step *masterStep) = 0;
+    /**
+     * @internal
+     * @brief Get buffer by id
+     *
+     * @param[in] bufID           Buffer identifier
+     *
+     * Return variable of step for buffer A, B or C. Is used for multi-step
+     * configurations for handling buffers rearrangement in inner steps. Inner
+     * steps get buffer variables names from respective master step buffers.
+     *
+     * @return buffer variable, or NULL if the step has no buffer with this id
+     */
+    Variable* getBuffer(BufferID bufID);
+
+    /**
+     * @internal
+     * @brief Complete problem decomposition of a single step
+     *
+     * Parallelism granularity, tails flags and vectorization values are
+     * guaranteed to be set in appropriate values after this function call.
+     */
+    void completeDecompositionSingle();
+    /**
+     * @internal
+     * @brief Build the solution sequence for this step
+     *
+     * @param[in,out] seq          Solution sequence list head
+     * @param[in]     platform     Platform identifier (currently unused)
+     *
+     * Append a copy of the wrapped solution step, with its buffer arguments
+     * replaced by the buffer identifiers A, B and C, to the tail of the list
+     * and decompose it.
+     */
+    void makeSolutionSequence(ListHead *seq, cl_platform_id platform);
+    /**
+     * @internal
+     * @brief Wrapper for freeSolutionSeq
+     *
+     * @param[in,out] seq          Solution sequence list head
+     *
+     * Call freeSolutionSeq from clBLAS frontend.
+     */
+    void freeSolutionSequence(ListHead *seq);
+    /**
+     * @internal
+     * @brief Generate step kernel code
+     *
+     * @return String containing kernel code for this step
+     */
+    std::string generate();
+    /**
+     * @internal
+     * @brief Generate step global work size string
+     *
+     * @return String containing global work size for this step
+     */
+    std::string globalWorkSize();
+
+    /**
+     * @internal
+     * @brief Get step blas function identifier
+     * @return blas function id
+     */
+    BlasFunctionID blasFunctionID() const       { return step_.funcID; };
+    /**
+     * @internal
+     * @brief Get step kernel arguments
+     * @return step kernel arguments structure
+     */
+    const CLBlasKargs& kargs() const            { return step_.args; };
+    /**
+     * @internal
+     * @brief Get step parallelism granularity
+     * @return step parallelism granularity structure
+     */
+    const PGranularity& pgran() const           { return step_.pgran; };
+    /**
+     * @internal
+     * @brief Get naive call string
+     *
+     * Get string containing naive blas function call for step blas function
+     * with respective step flags and arguments.
+     * @return naive blas call string
+     */
+    const std::string& naiveCall() const        { return naiveCall_; };
+    /**
+     * @internal
+     * @brief Get comparison call string
+     *
+     * Get string containing the comparison function call for the resulting
+     * vectors or matrices of the step blas function.
+     * @return comparison call string
+     */
+    const std::string& compareCall() const      { return compareCall_; };
+    /**
+     * @internal
+     * @brief Get post-processing call
+     *
+     * Get string containing function call which is called after setting step
+     * matrices. Is used in TRSM now for making divisible B matrix.
+     * @return step matrices post-processing call
+     */
+    const std::string& postRandomCall() const   { return postRandomCall_; };
+    /**
+     * @internal
+     * @brief Get step kernel name
+     * @return step kernel name
+     */
+    const std::string& kernelName() const       { return kernelName_; };
+    /**
+     * @internal
+     * @brief Get blas function name
+     * Returns blas function name from naive blas for this step.
+     * @return blas function name, or an empty string for an unknown function
+     */
+    const char* getBlasFunctionName();
+
+    /**
+     * @internal
+     * @brief Get kernel arguments variables map
+     * @return step kernel arguments variables map
+     */
+    const std::map<unsigned int, const Variable*>& kargMap() const { return kargMap_; }
+    /**
+     * @internal
+     * @brief Set step blas arguments
+     * @param[in]  kargs           Step blas arguments structure
+     */
+    void setKargs(const CLBlasKargs& kargs);
+    /**
+     * @internal
+     * @brief Set step blas subdimensions
+     * @param[in]  subdims         Step subproblem dimensions
+     */
+    void setDecomposition(const SubproblemDim *subdims);
+    /**
+     * @internal
+     * @brief Set step kernel name
+     * @param[in]  name            Step kernel name
+     */
+    void setKernelName(std::string name);
+    /**
+     * @internal
+     * @brief Get string containing matrix size
+     * @param[in]  var             Matrix variable
+     * @return matrix variable size string
+     */
+    std::string matrixSize(MatrixVariable *var);
+    /**
+     * @internal
+     * @brief Get string containing vector size
+     * @param[in]  vector          Vector variable
+     * @return vector variable size string
+     */
+    std::string vectorSize(VectorVariable *vector);
+
+    /**
+     * @internal
+     * @brief Get string containing argument value
+     * @param[in]  dtype           Argument type
+     * @param[in]  arg             Argument value
+     * Get string containing argument value. Argument can have complex type.
+     * @return string containing argument value
+     */
+    static std::string multiplierToString(DataType dtype, ArgMultiplier arg);
+    /**
+     * @internal
+     * @brief Get string containing type
+     * @param[in]  dtype           Data type
+     * @return Data type string
+     */
+    static std::string dtypeToString(DataType dtype);
+    /**
+     * @internal
+     * @brief Get solution node blas function identifier
+     * @param[in]  node            Solution sequence node
+     * @return blas function id
+     */
+    static BlasFunctionID getStepNodeFuncID(ListNode *node);
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_PATTERN_H__
diff --git a/src/library/tools/ktest/steps/gemm.cpp b/src/library/tools/ktest/steps/gemm.cpp
new file mode 100644
index 0000000..f7ef4e1
--- /dev/null
+++ b/src/library/tools/ktest/steps/gemm.cpp
@@ -0,0 +1,170 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sstream>
+
+#include "gemm.h"
+
+using namespace clMath;
+
+/* Master-step constructor: a GEMM step for the given device. */
+GemmStep::GemmStep(cl_device_id device) :
+    Step(CLBLAS_GEMM, device)
+{
+}
+
+/* Inner-step constructor: wraps a node of a solution sequence list. */
+GemmStep::GemmStep(ListNode *node) :
+    Step(node)
+{
+}
+
+/*
+ * Return the name of the master-step buffer identified by bufArg, or
+ * fallback when the master step has no buffer with that identifier.
+ */
+static std::string
+masterBufferName(Step *masterStep, cl_mem bufArg, const std::string& fallback)
+{
+    Variable *buf = masterStep->getBuffer((BufferID)(long)bufArg);
+
+    if (buf == NULL) {
+        return fallback;
+    }
+    return buf->name();
+}
+
+/*
+ * Declare all variables, host matrices and buffers of a GEMM step, map them
+ * to kernel arguments and compose the naive and comparison call strings.
+ *
+ * masterStep is NULL for the master step itself, which uses the default
+ * buffer names "bufA", "bufB" and "bufC". An inner step takes its buffer
+ * names from the master step buffers selected by the identifiers stored in
+ * its A, B and C arguments. If the master step has no such buffer, the
+ * default name is kept instead of dereferencing a NULL pointer.
+ */
+void
+GemmStep::declareVars(Step *masterStep)
+{
+    StepKargs args;
+    MatrixVariable *A, *B, *C, *naiveC;
+
+    memset(&args, 0, sizeof(args));
+    std::string type = dtypeToString(kargs().dtype);
+
+    args.M = addConst("M", "cl_uint", kargs().M);
+    args.N = addConst("N", "cl_uint", kargs().N);
+    args.K = addConst("K", "cl_uint", kargs().K);
+
+    args.lda = addConst("lda", "cl_uint", kargs().lda.matrix);
+    args.ldb = addConst("ldb", "cl_uint", kargs().ldb.matrix);
+    args.ldc = addConst("ldc", "cl_uint", kargs().ldc.matrix);
+
+    //TODO: remove after all gemm generators use offsets A,B,C
+    args.offsetM = addConst("offsetM", "cl_uint", kargs().offsetM);
+    args.offsetN = addConst("offsetN", "cl_uint", kargs().offsetN);
+    args.offsetK = addConst("offsetK", "cl_uint", kargs().offsetK);
+
+    args.offA = addVar("offA", "cl_uint", kargs().offA);
+    args.offBX = addVar("offB", "cl_uint", kargs().offBX);
+    args.offCY = addVar("offC", "cl_uint", kargs().offCY);
+
+    args.alpha = addVar("alpha", type,
+        multiplierToString(kargs().dtype, kargs().alpha));
+    args.beta = addVar("beta", type,
+        multiplierToString(kargs().dtype, kargs().beta));
+
+    // A and B are declared with rows and columns swapped when transposed.
+    if (kargs().transA == clblasNoTrans) {
+        A = addMatrix("A", type + "*", args.M, args.K, args.lda, args.offA);
+    }
+    else {
+        A = addMatrix("A", type + "*", args.K, args.M, args.lda, args.offA);
+    }
+    if (kargs().transB == clblasNoTrans) {
+        B = addMatrix("B", type + "*", args.K, args.N, args.ldb, args.offBX);
+    }
+    else {
+        B = addMatrix("B", type + "*", args.N, args.K, args.ldb, args.offBX);
+    }
+    C = addMatrix("C", type + "*", args.M, args.N, args.ldc, args.offCY);
+    // naiveC is a copy of C that receives the naive implementation's result.
+    naiveC = addMatrix("naiveC", type + "*", args.M, args.N, args.ldc, args.offCY);
+    naiveC->setCopy(C);
+
+
+    std::string bufAName = "bufA";
+    std::string bufBName = "bufB";
+    std::string bufCName = "bufC";
+    if (NULL != masterStep) {
+        bufAName = masterBufferName(masterStep, step_.args.A, bufAName);
+        bufBName = masterBufferName(masterStep, step_.args.B, bufBName);
+        bufCName = masterBufferName(masterStep, step_.args.C, bufCName);
+    }
+
+    args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A);
+    args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_ONLY, B);
+    args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, C);
+
+    assignKargs(args);
+
+    std::stringstream ss;
+    ss << getBlasFunctionName() << "(order, transA, transB, "
+       << args.M->name() << ", " << args.N->name() << ", "
+       << args.K->name() << ", "
+       << args.alpha->name() << ", " << A->matrixPointer() << ", "
+       << args.lda->name() << ", " << B->matrixPointer() << ", "
+       << args.ldb->name() << ", " << args.beta->name() << ", "
+       << naiveC->matrixPointer() << ", " << args.ldc->name() << ")";
+    naiveCall_ = ss.str();
+
+    ss.str("");
+    ss << "compareMatrices(order, " << args.M->name() << ", "
+       << args.N->name() << ", " << C->matrixPointer() << ", "
+       << naiveC->matrixPointer() << ", " << args.ldc->name() << ")";
+    compareCall_ = ss.str();
+}
+
+/*
+ * Raise the leading dimensions of A, B and C to the minimum that fits the
+ * matrix sizes for the current order and transposition flags. Leading
+ * dimensions that are already large enough are left untouched.
+ *
+ * Minimum leading dimension (column-major / row-major):
+ *   A not transposed: M / K      A transposed: K / M
+ *   B not transposed: K / N      B transposed: N / K
+ *   C:                M / N
+ */
+void
+GemmStep::fixLD()
+{
+    CLBlasKargs fixed = kargs();
+    const bool colMajor = (fixed.order == clblasColumnMajor);
+    const bool rowMajor = (fixed.order == clblasRowMajor);
+
+    // Matrix A
+    if (fixed.transA == clblasNoTrans) {
+        if (colMajor && (fixed.lda.matrix < fixed.M)) {
+            fixed.lda.matrix = fixed.M;
+        }
+        else if (rowMajor && (fixed.lda.matrix < fixed.K)) {
+            fixed.lda.matrix = fixed.K;
+        }
+    }
+    else if ((fixed.transA == clblasTrans) ||
+             (fixed.transA == clblasConjTrans)) {
+        if (colMajor && (fixed.lda.matrix < fixed.K)) {
+            fixed.lda.matrix = fixed.K;
+        }
+        else if (rowMajor && (fixed.lda.matrix < fixed.M)) {
+            fixed.lda.matrix = fixed.M;
+        }
+    }
+
+    // Matrix B
+    if (fixed.transB == clblasNoTrans) {
+        if (colMajor && (fixed.ldb.matrix < fixed.K)) {
+            fixed.ldb.matrix = fixed.K;
+        }
+        else if (rowMajor && (fixed.ldb.matrix < fixed.N)) {
+            fixed.ldb.matrix = fixed.N;
+        }
+    }
+    else if ((fixed.transB == clblasTrans) ||
+             (fixed.transB == clblasConjTrans)) {
+        if (colMajor && (fixed.ldb.matrix < fixed.N)) {
+            fixed.ldb.matrix = fixed.N;
+        }
+        else if (rowMajor && (fixed.ldb.matrix < fixed.K)) {
+            fixed.ldb.matrix = fixed.K;
+        }
+    }
+
+    // Matrix C is never transposed
+    if (colMajor && (fixed.ldc.matrix < fixed.M)) {
+        fixed.ldc.matrix = fixed.M;
+    }
+    else if (rowMajor && (fixed.ldc.matrix < fixed.N)) {
+        fixed.ldc.matrix = fixed.N;
+    }
+
+    setKargs(fixed);
+}
diff --git a/src/library/tools/ktest/steps/gemm.h b/src/library/tools/ktest/steps/gemm.h
new file mode 100644
index 0000000..1e716ab
--- /dev/null
+++ b/src/library/tools/ktest/steps/gemm.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_GEMM_H__
+#define KTEST_GEMM_H__
+
+#include "../step.h"
+
+namespace clMath {
+
+/**
+ * @internal
+ * @brief Step specialization for the GEMM function
+ */
+class GemmStep : public Step {
+public:
+    /** @internal @brief Constructor for master step */
+    GemmStep(cl_device_id device);
+    /** @internal @brief Constructor for inner step built from a list node */
+    GemmStep(ListNode *node);
+
+    /** @internal @brief Fix leading dimensions of A, B and C */
+    virtual void fixLD();
+    /** @internal @brief Declare GEMM variables, buffers and call strings */
+    virtual void declareVars(Step *masterStep);
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_GEMM_H__
diff --git a/src/library/tools/ktest/steps/gemv.cpp b/src/library/tools/ktest/steps/gemv.cpp
new file mode 100644
index 0000000..c11e3ae
--- /dev/null
+++ b/src/library/tools/ktest/steps/gemv.cpp
@@ -0,0 +1,143 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sstream>
+
+#include "gemv.h"
+
+using namespace clMath;
+
+// Device constructor: hands the CLBLAS_GEMV function ID and the device
+// handle over to the Step base class.
+GemvStep::GemvStep(cl_device_id device)
+    : Step(CLBLAS_GEMV, device)
+{}
+
+// List-node constructor: simply forwards the node to the Step base class.
+GemvStep::GemvStep(ListNode *node)
+    : Step(node)
+{}
+
+/*
+ * Registers the constants, variables, matrix, vectors and buffers of the
+ * GEMV step, hands them to assignKargs(), and composes the call text stored
+ * in naiveCall_ and the comparison text stored in compareCall_.
+ *
+ * masterStep - when NULL the buffers are named "bufA", "bufX" and "bufY";
+ *              otherwise the names are taken from the buffers of masterStep
+ *              identified by step_.args.A, step_.args.B and step_.args.C.
+ */
+void
+GemvStep::declareVars(Step *masterStep)
+{
+    StepKargs args;
+    memset(&args, 0, sizeof(args));
+
+    std::string type = dtypeToString(kargs().dtype);
+    const std::string ptrType = type + "*";
+
+    // Matrix dimensions
+    args.M = addConst("M", "cl_uint", kargs().M);
+    args.N = addConst("N", "cl_uint", kargs().N);
+
+    // Leading dimension of A; the ldb/ldc slots carry incx/incy here
+    args.lda = addConst("lda", "cl_uint", kargs().lda.matrix);
+    args.ldb = addConst("incx", "cl_int", kargs().ldb.vector);
+    args.ldc = addConst("incy", "cl_int", kargs().ldc.vector);
+
+    // Offsets
+    args.offA = addConst("offA", "cl_uint", kargs().offA);
+    args.offBX = addConst("offX", "cl_uint", kargs().offBX);
+    args.offCY = addConst("offY", "cl_uint", kargs().offCY);
+
+    // Scalar multipliers
+    args.alpha = addVar("alpha", type,
+        multiplierToString(kargs().dtype, kargs().alpha));
+    args.beta = addVar("beta", type,
+        multiplierToString(kargs().dtype, kargs().beta));
+
+    MatrixVariable *A = addMatrix("A", ptrType, args.M, args.N, args.lda,
+        args.offA);
+
+    // Without transposition X has N elements and Y has M; with it the
+    // lengths are swapped.
+    const bool noTrans = (kargs().transA == clblasNoTrans);
+    VectorVariable *X = addVector("X", ptrType,
+        noTrans ? args.N : args.M, args.ldb, args.offBX);
+    VectorVariable *Y = addVector("Y", ptrType,
+        noTrans ? args.M : args.N, args.ldc, args.offCY);
+    VectorVariable *naiveY = addVector("naiveY", ptrType,
+        noTrans ? args.M : args.N, args.ldc, args.offCY);
+    naiveY->setCopy(Y);
+
+    // Default buffer names, replaced by the master step's names if present
+    std::string bufAName("bufA");
+    std::string bufBName("bufX");
+    std::string bufCName("bufY");
+    if (masterStep != NULL) {
+        bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name();
+        bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name();
+        bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name();
+    }
+    args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A);
+    args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_ONLY, X);
+    args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, Y);
+
+    assignKargs(args);
+
+    // Text of the BLAS call that writes into naiveY
+    std::stringstream naive;
+    naive << getBlasFunctionName() << "(order, transA, ";
+    naive << args.M->name() << ", " << args.N->name() << ", ";
+    naive << args.alpha->name() << ", " << A->matrixPointer() << ", ";
+    naive << args.lda->name() << ", ";
+    naive << X->vectorPointer() << ", " << args.ldb->name() << ", ";
+    naive << args.beta->name() << ", " << naiveY->vectorPointer() << ", ";
+    naive << args.ldc->name() << ")";
+    naiveCall_ = naive.str();
+
+    // Text of the comparison of Y against naiveY; the element count is M
+    // without transposition and N otherwise
+    std::stringstream cmp;
+    cmp << "compareVectors("
+        << ((kargs().transA == clblasNoTrans) ? args.M : args.N)->name();
+    cmp << ", " << Y->vectorPointer() << ", " << naiveY->vectorPointer();
+    cmp << ", " << args.ldc->name() << ")";
+    compareCall_ = cmp.str();
+}
+
+/*
+ * Normalizes the GEMV kernel arguments: raises lda to the minimum required
+ * by the storage order, replaces zero vector increments with 1 and records
+ * a dimension of A in K. The adjusted copy is written back with setKargs().
+ */
+void
+GemvStep::fixLD()
+{
+    CLBlasKargs args = kargs();
+
+    // In gemv M is always the row count and N the column count, so lda
+    // must be at least M for column-major and at least N for row-major.
+    if (args.order == clblasColumnMajor) {
+        if (args.lda.matrix < args.M) {
+            args.lda.matrix = args.M;
+        }
+    }
+    else if (args.order == clblasRowMajor) {
+        if (args.lda.matrix < args.N) {
+            args.lda.matrix = args.N;
+        }
+    }
+
+    // A zero increment is replaced with a unit increment
+    if (!args.ldb.vector) {
+        args.ldb.vector = 1;
+    }
+    if (!args.ldc.vector) {
+        args.ldc.vector = 1;
+    }
+
+    // Keep the original height of the matrix A in K
+    if (args.transA == clblasNoTrans) {
+        args.K = args.M;
+    }
+    else {
+        args.K = args.N;
+    }
+
+    setKargs(args);
+}
diff --git a/src/library/tools/ktest/steps/gemv.h b/src/library/tools/ktest/steps/gemv.h
new file mode 100644
index 0000000..6ed7cd2
--- /dev/null
+++ b/src/library/tools/ktest/steps/gemv.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_GEMV_H__
+#define KTEST_GEMV_H__
+
+#include "../step.h"
+
+namespace clMath {
+
+/*
+ * Step subclass for GEMV. Only declarations are given here; the member
+ * definitions live outside this header.
+ */
+class GemvStep : public Step {
+public:
+    // Construct from an OpenCL device handle
+    GemvStep(cl_device_id device);
+    // Construct from a list node
+    GemvStep(ListNode *node);
+
+    // Adjusts leading dimension and vector increments of the kernel arguments
+    virtual void fixLD();
+    // Declares the step's variables and buffers; masterStep may be NULL
+    virtual void declareVars(Step *masterStep);
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_GEMV_H__
diff --git a/src/library/tools/ktest/steps/symv.cpp b/src/library/tools/ktest/steps/symv.cpp
new file mode 100644
index 0000000..bfe2230
--- /dev/null
+++ b/src/library/tools/ktest/steps/symv.cpp
@@ -0,0 +1,120 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sstream>
+
+#include "symv.h"
+
+using namespace clMath;
+
+// Device constructor: hands the CLBLAS_SYMV function ID and the device
+// handle over to the Step base class.
+SymvStep::SymvStep(cl_device_id device)
+    : Step(CLBLAS_SYMV, device)
+{}
+
+// List-node constructor: simply forwards the node to the Step base class.
+SymvStep::SymvStep(ListNode *node)
+    : Step(node)
+{}
+
+/*
+ * Registers the constants, variables, matrix, vectors and buffers of the
+ * SYMV step, hands them to assignKargs(), and composes the call text stored
+ * in naiveCall_ and the comparison text stored in compareCall_.
+ *
+ * masterStep - when NULL the buffers are named "bufA", "bufX" and "bufY";
+ *              otherwise the names are taken from the buffers of masterStep
+ *              identified by step_.args.A, step_.args.B and step_.args.C.
+ */
+void
+SymvStep::declareVars(Step *masterStep)
+{
+    StepKargs args;
+    memset(&args, 0, sizeof(args));
+
+    std::string type = dtypeToString(kargs().dtype);
+    const std::string ptrType = type + "*";
+
+    // Only N is declared; K shares the same constant
+    args.N = addConst("N", "cl_uint", kargs().N);
+    args.K = args.N;
+
+    // Leading dimension of A; the ldb/ldc slots carry incx/incy here
+    args.lda = addConst("lda", "cl_uint", kargs().lda.matrix);
+    args.ldb = addConst("incx", "cl_int", kargs().ldb.vector);
+    args.ldc = addConst("incy", "cl_int", kargs().ldc.vector);
+
+    args.offsetN = addConst("offsetN", "cl_uint", kargs().offsetN);
+
+    // Offsets
+    args.offA = addConst("offA", "cl_uint", kargs().offA);
+    args.offBX = addConst("offx", "cl_uint", kargs().offBX);
+    args.offCY = addConst("offy", "cl_uint", kargs().offCY);
+
+    // Scalar multipliers
+    args.alpha = addVar("alpha", type,
+        multiplierToString(kargs().dtype, kargs().alpha));
+    args.beta = addVar("beta", type,
+        multiplierToString(kargs().dtype, kargs().beta));
+
+    // A is N x N; X, Y and the copy of Y each have N elements
+    MatrixVariable *A = addMatrix("A", ptrType, args.N, args.N, args.lda,
+        args.offA);
+    VectorVariable *X = addVector("X", ptrType, args.N, args.ldb,
+        args.offBX);
+    VectorVariable *Y = addVector("Y", ptrType, args.N, args.ldc,
+        args.offCY);
+    VectorVariable *naiveY = addVector("naiveY", ptrType, args.N, args.ldc,
+        args.offCY);
+    naiveY->setCopy(Y);
+
+    // Default buffer names, replaced by the master step's names if present
+    std::string bufAName("bufA");
+    std::string bufBName("bufX");
+    std::string bufCName("bufY");
+    if (masterStep != NULL) {
+        bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name();
+        bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name();
+        bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name();
+    }
+    args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A);
+    args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_ONLY, X);
+    args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, Y);
+
+    assignKargs(args);
+
+    // Text of the BLAS call that writes into naiveY
+    std::stringstream naive;
+    naive << getBlasFunctionName() << "(order, uplo, ";
+    naive << args.N->name() << ", ";
+    naive << args.alpha->name() << ", " << A->matrixPointer() << ", ";
+    naive << args.lda->name() << ", " << X->vectorPointer() << ", ";
+    naive << args.ldb->name() << ", " << args.beta->name() << ", ";
+    naive << naiveY->vectorPointer() << ", " << args.ldc->name() << ")";
+    naiveCall_ = naive.str();
+
+    // Text of the comparison of Y against naiveY
+    std::stringstream cmp;
+    cmp << "compareVectors(" << args.N->name() << ", ";
+    cmp << Y->vectorPointer() << ", " << naiveY->vectorPointer() << ", ";
+    cmp << args.ldc->name() << ")";
+    compareCall_ = cmp.str();
+}
+
+/*
+ * Normalizes the SYMV kernel arguments: lda is raised to at least N, zero
+ * vector increments become 1, and N is copied into K. The adjusted copy is
+ * written back with setKargs().
+ */
+void
+SymvStep::fixLD()
+{
+    CLBlasKargs args = kargs();
+
+    // A is square, so the same lower bound applies to either storage order
+    if (args.lda.matrix < args.N) {
+        args.lda.matrix = args.N;
+    }
+
+    // A zero increment is replaced with a unit increment
+    if (!args.ldb.vector) {
+        args.ldb.vector = 1;
+    }
+    if (!args.ldc.vector) {
+        args.ldc.vector = 1;
+    }
+
+    // Keep the original N in K
+    args.K = args.N;
+
+    setKargs(args);
+}
diff --git a/src/library/tools/ktest/steps/symv.h b/src/library/tools/ktest/steps/symv.h
new file mode 100644
index 0000000..792e442
--- /dev/null
+++ b/src/library/tools/ktest/steps/symv.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_SYMV_H__
+#define KTEST_SYMV_H__
+
+#include "../step.h"
+
+namespace clMath {
+
+/*
+ * Step subclass for SYMV. Only declarations are given here; the member
+ * definitions live outside this header.
+ */
+class SymvStep : public Step {
+public:
+    // Construct from an OpenCL device handle
+    SymvStep(cl_device_id device);
+    // Construct from a list node
+    SymvStep(ListNode *node);
+
+    // Adjusts leading dimension and vector increments of the kernel arguments
+    virtual void fixLD();
+    // Declares the step's variables and buffers; masterStep may be NULL
+    virtual void declareVars(Step *masterStep);
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_SYMV_H__
diff --git a/src/library/tools/ktest/steps/syr2k.cpp b/src/library/tools/ktest/steps/syr2k.cpp
new file mode 100644
index 0000000..388a51b
--- /dev/null
+++ b/src/library/tools/ktest/steps/syr2k.cpp
@@ -0,0 +1,153 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sstream>
+
+#include "syr2k.h"
+
+using namespace clMath;
+
+// Device constructor: hands the CLBLAS_SYR2K function ID and the device
+// handle over to the Step base class.
+Syr2kStep::Syr2kStep(cl_device_id device)
+    : Step(CLBLAS_SYR2K, device)
+{}
+
+// List-node constructor: simply forwards the node to the Step base class.
+Syr2kStep::Syr2kStep(ListNode *node)
+    : Step(node)
+{}
+
+/*
+ * Registers the constants, variables, matrices and buffers of the SYR2K
+ * step, hands them to assignKargs(), and composes the call text stored in
+ * naiveCall_ and the comparison text stored in compareCall_.
+ *
+ * masterStep - when NULL the buffers are named "bufA", "bufB" and "bufC";
+ *              otherwise the names are taken from the buffers of masterStep
+ *              identified by step_.args.A, step_.args.B and step_.args.C.
+ */
+void
+Syr2kStep::declareVars(Step *masterStep)
+{
+    StepKargs args;
+    MatrixVariable *A, *B, *C, *naiveC;
+
+    memset(&args, 0, sizeof(args));
+    std::string type = dtypeToString(kargs().dtype);
+
+    // C is N x N, so M shares the constant declared for N
+    args.N = addConst("N", "cl_uint", kargs().N);
+    args.M = args.N;
+    args.K = addConst("K", "cl_uint", kargs().K);
+
+    args.lda = addConst("lda", "cl_uint", kargs().lda.matrix);
+    args.ldb = addConst("ldb", "cl_uint", kargs().ldb.matrix);
+    args.ldc = addConst("ldc", "cl_uint", kargs().ldc.matrix);
+
+    args.offsetM = addConst("offsetM", "cl_uint", kargs().offsetM);
+
+    args.offA = addVar("offA", "cl_uint", kargs().offA);
+    args.offBX = addVar("offB", "cl_uint", kargs().offBX);
+    args.offCY = addVar("offC", "cl_uint", kargs().offCY);
+
+    args.alpha = addVar("alpha", type,
+        multiplierToString(kargs().dtype, kargs().alpha));
+    args.beta = addVar("beta", type,
+        multiplierToString(kargs().dtype, kargs().beta));
+
+    // A and B have the same shape (N x K, or K x N when transposed), but
+    // each has its own leading dimension. B must be declared with ldb, the
+    // same value that is passed for B in the call text below.
+    if (kargs().transA == clblasNoTrans) {
+        A = addMatrix("A", type + "*", args.N, args.K, args.lda, args.offA);
+        B = addMatrix("B", type + "*", args.N, args.K, args.ldb, args.offBX);
+    }
+    else {
+        A = addMatrix("A", type + "*", args.K, args.N, args.lda, args.offA);
+        B = addMatrix("B", type + "*", args.K, args.N, args.ldb, args.offBX);
+    }
+    C = addMatrix("C", type + "*", args.N, args.N, args.ldc, args.offCY);
+    naiveC = addMatrix("naiveC", type + "*", args.N, args.N, args.ldc, args.offCY);
+    naiveC->setCopy(C);
+
+    // Default buffer names, replaced by the master step's names if present
+    std::string bufAName, bufBName, bufCName;
+    if (NULL == masterStep) {
+        bufAName = "bufA";
+        bufBName = "bufB";
+        bufCName = "bufC";
+    }
+    else {
+        bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name();
+        bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name();
+        bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name();
+    }
+
+    args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A);
+    args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_ONLY, B);
+    args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, C);
+
+    assignKargs(args);
+
+    // Text of the BLAS call that writes into naiveC
+    std::stringstream ss;
+    ss << getBlasFunctionName() << "(order, uplo, transA, "
+       << args.N->name() << ", " << args.K->name() << ", "
+       << args.alpha->name() << ", " << A->matrixPointer() << ", "
+       << args.lda->name() << ", " << B->matrixPointer() << ", "
+       << args.ldb->name() << ", " << args.beta->name() << ", "
+       << naiveC->matrixPointer() << ", " << args.ldc->name() << ")";
+    naiveCall_ = ss.str();
+
+    // Text of the comparison of C against naiveC
+    ss.str("");
+    ss << "compareMatrices(order, " << args.N->name() << ", " << args.N->name()
+       << ", " << C->matrixPointer() << ", " << naiveC->matrixPointer()
+       << ", " << args.ldc->name() << ")";
+    compareCall_ = ss.str();
+}
+
+/*
+ * Normalizes the SYR2K kernel arguments: lda and ldb are raised to the
+ * minimum implied by storage order and transposition, ldc is raised to at
+ * least N, transB mirrors transA and M mirrors N. The adjusted copy is
+ * written back with setKargs().
+ */
+void
+Syr2kStep::fixLD()
+{
+    CLBlasKargs args = kargs();
+
+    const bool colMajor = (args.order == clblasColumnMajor);
+    const bool rowMajor = (args.order == clblasRowMajor);
+
+    if (colMajor || rowMajor) {
+        // The lower bound is N for column-major/non-transposed and for
+        // row-major/transposed; in the two remaining cases it is K.
+        const bool boundIsN = ((args.transA == clblasNoTrans) == colMajor);
+
+        if (args.lda.matrix < (boundIsN ? args.N : args.K)) {
+            args.lda.matrix = boundIsN ? args.N : args.K;
+        }
+        if (args.ldb.matrix < (boundIsN ? args.N : args.K)) {
+            args.ldb.matrix = boundIsN ? args.N : args.K;
+        }
+    }
+
+    // C is N x N, so the bound does not depend on the storage order
+    if (args.ldc.matrix < args.N) {
+        args.ldc.matrix = args.N;
+    }
+
+    args.transB = args.transA;
+    args.M = args.N;
+
+    setKargs(args);
+}
+
diff --git a/src/library/tools/ktest/steps/syr2k.h b/src/library/tools/ktest/steps/syr2k.h
new file mode 100644
index 0000000..b75029b
--- /dev/null
+++ b/src/library/tools/ktest/steps/syr2k.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_SYR2K_H__
+#define KTEST_SYR2K_H__
+
+#include "../step.h"
+
+namespace clMath {
+
+/*
+ * Step subclass for SYR2K. Only declarations are given here; the member
+ * definitions live outside this header.
+ */
+class Syr2kStep : public Step {
+public:
+    // Construct from an OpenCL device handle
+    Syr2kStep(cl_device_id device);
+    // Construct from a list node
+    Syr2kStep(ListNode *node);
+
+    // Adjusts leading dimensions of the step's kernel arguments
+    virtual void fixLD();
+    // Declares the step's variables and buffers; masterStep may be NULL
+    virtual void declareVars(Step *masterStep);
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_SYR2K_H__
diff --git a/src/library/tools/ktest/steps/syrk.cpp b/src/library/tools/ktest/steps/syrk.cpp
new file mode 100644
index 0000000..c21733e
--- /dev/null
+++ b/src/library/tools/ktest/steps/syrk.cpp
@@ -0,0 +1,136 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sstream>
+
+#include "syrk.h"
+
+using namespace clMath;
+
+// Device constructor: hands the CLBLAS_SYRK function ID and the device
+// handle over to the Step base class.
+SyrkStep::SyrkStep(cl_device_id device)
+    : Step(CLBLAS_SYRK, device)
+{}
+
+// List-node constructor: simply forwards the node to the Step base class.
+SyrkStep::SyrkStep(ListNode *node)
+    : Step(node)
+{}
+
+/*
+ * Registers the constants, variables, matrices and buffers of the SYRK
+ * step, hands them to assignKargs(), and composes the call text stored in
+ * naiveCall_ and the comparison text stored in compareCall_. The B-related
+ * slots (ldb, offBX, B) reuse the objects declared for A.
+ *
+ * masterStep - when NULL the buffers are named "bufA" and "bufC";
+ *              otherwise the names are taken from the buffers of masterStep
+ *              identified by step_.args.A and step_.args.C.
+ */
+void
+SyrkStep::declareVars(Step *masterStep)
+{
+    StepKargs args;
+    memset(&args, 0, sizeof(args));
+
+    std::string type = dtypeToString(kargs().dtype);
+    const std::string ptrType = type + "*";
+
+    // C is N x N, so M shares the constant declared for N
+    args.N = addConst("N", "cl_uint", kargs().N);
+    args.M = args.N;
+    args.K = addConst("K", "cl_uint", kargs().K);
+
+    args.lda = addConst("lda", "cl_uint", kargs().lda.matrix);
+    args.ldb = args.lda;
+    args.ldc = addConst("ldc", "cl_uint", kargs().ldc.matrix);
+
+    args.offsetM = addConst("offsetM", "cl_uint", kargs().offsetM);
+
+    args.offA = addVar("offA", "cl_uint", kargs().offA);
+    args.offBX = args.offA;
+    args.offCY = addVar("offC", "cl_uint", kargs().offCY);
+
+    // Scalar multipliers
+    args.alpha = addVar("alpha", type,
+        multiplierToString(kargs().dtype, kargs().alpha));
+    args.beta = addVar("beta", type,
+        multiplierToString(kargs().dtype, kargs().beta));
+
+    // A is N x K without transposition and K x N with it
+    MatrixVariable *A = (kargs().transA == clblasNoTrans)
+        ? addMatrix("A", ptrType, args.N, args.K, args.lda, args.offA)
+        : addMatrix("A", ptrType, args.K, args.N, args.lda, args.offA);
+    MatrixVariable *C = addMatrix("C", ptrType, args.N, args.N, args.ldc,
+        args.offCY);
+    MatrixVariable *naiveC = addMatrix("naiveC", ptrType, args.N, args.N,
+        args.ldc, args.offCY);
+    naiveC->setCopy(C);
+
+    // Default buffer names, replaced by the master step's names if present
+    std::string bufAName("bufA");
+    std::string bufCName("bufC");
+    if (masterStep != NULL) {
+        bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name();
+        bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name();
+    }
+
+    args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A);
+    args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, C);
+    args.B = args.A;
+
+    assignKargs(args);
+
+    // Text of the BLAS call that writes into naiveC
+    std::stringstream naive;
+    naive << getBlasFunctionName() << "(order, uplo, transA, ";
+    naive << args.N->name() << ", " << args.K->name() << ", ";
+    naive << args.alpha->name() << ", " << A->matrixPointer() << ", ";
+    naive << args.lda->name() << ", " << args.beta->name() << ", ";
+    naive << naiveC->matrixPointer() << ", " << args.ldc->name() << ")";
+    naiveCall_ = naive.str();
+
+    // Text of the comparison of C against naiveC
+    std::stringstream cmp;
+    cmp << "compareMatrices(order, ";
+    cmp << args.N->name() << ", " << args.N->name() << ", ";
+    cmp << C->matrixPointer() << ", " << naiveC->matrixPointer() << ", ";
+    cmp << args.ldc->name() << ")";
+    compareCall_ = cmp.str();
+}
+
+/*
+ * Normalizes the SYRK kernel arguments: lda is raised to the minimum
+ * implied by storage order and transposition, ldc is raised to at least N,
+ * and the B-related fields (transB, ldb) plus M are mirrored from their A/N
+ * counterparts. The adjusted copy is written back with setKargs().
+ */
+void
+SyrkStep::fixLD()
+{
+    CLBlasKargs args = kargs();
+
+    const bool colMajor = (args.order == clblasColumnMajor);
+    const bool rowMajor = (args.order == clblasRowMajor);
+
+    if (colMajor || rowMajor) {
+        // The lower bound is N for column-major/non-transposed and for
+        // row-major/transposed; in the two remaining cases it is K.
+        const bool boundIsN = ((args.transA == clblasNoTrans) == colMajor);
+
+        if (args.lda.matrix < (boundIsN ? args.N : args.K)) {
+            args.lda.matrix = boundIsN ? args.N : args.K;
+        }
+    }
+
+    // C is N x N, so the bound does not depend on the storage order
+    if (args.ldc.matrix < args.N) {
+        args.ldc.matrix = args.N;
+    }
+
+    args.transB = args.transA;
+    args.M = args.N;
+    args.ldb.matrix = args.lda.matrix;
+
+    setKargs(args);
+}
+
diff --git a/src/library/tools/ktest/steps/syrk.h b/src/library/tools/ktest/steps/syrk.h
new file mode 100644
index 0000000..a217169
--- /dev/null
+++ b/src/library/tools/ktest/steps/syrk.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_SYRK_H__
+#define KTEST_SYRK_H__
+
+#include "../step.h"
+
+namespace clMath {
+
+/*
+ * Step subclass for SYRK. Only declarations are given here; the member
+ * definitions live outside this header.
+ */
+class SyrkStep : public Step {
+public:
+    // Construct from an OpenCL device handle
+    SyrkStep(cl_device_id device);
+    // Construct from a list node
+    SyrkStep(ListNode *node);
+
+    // Adjusts leading dimensions of the step's kernel arguments
+    virtual void fixLD();
+    // Declares the step's variables and buffers; masterStep may be NULL
+    virtual void declareVars(Step *masterStep);
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_SYRK_H__
diff --git a/src/library/tools/ktest/steps/trmm.cpp b/src/library/tools/ktest/steps/trmm.cpp
new file mode 100644
index 0000000..2c6751a
--- /dev/null
+++ b/src/library/tools/ktest/steps/trmm.cpp
@@ -0,0 +1,134 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sstream>
+
+#include "trmm.h"
+
+using namespace clMath;
+
+// Device constructor: hands the CLBLAS_TRMM function ID and the device
+// handle over to the Step base class.
+TrmmStep::TrmmStep(cl_device_id device)
+    : Step(CLBLAS_TRMM, device)
+{}
+
+// List-node constructor: simply forwards the node to the Step base class.
+TrmmStep::TrmmStep(ListNode *node)
+    : Step(node)
+{}
+
+/*
+ * Registers the constants, variables, matrices and buffers of the TRMM
+ * step, hands them to assignKargs(), and composes the call text stored in
+ * naiveCall_ and the comparison text stored in compareCall_.
+ *
+ * masterStep - when NULL the buffers are named "bufA" and "bufB";
+ *              otherwise the names are taken from the buffers of masterStep
+ *              identified by step_.args.A and step_.args.B.
+ */
+void
+TrmmStep::declareVars(Step *masterStep)
+{
+    StepKargs args;
+    memset(&args, 0, sizeof(args));
+
+    std::string type = dtypeToString(kargs().dtype);
+    const std::string ptrType = type + "*";
+
+    args.M = addConst("M", "cl_uint", kargs().M);
+    args.N = addConst("N", "cl_uint", kargs().N);
+    // K shares the constant of M for the left side and of N otherwise
+    args.K = (kargs().side == clblasLeft) ? args.M : args.N;
+
+    args.lda = addConst("lda", "cl_uint", kargs().lda.matrix);
+    args.ldb = addConst("ldb", "cl_uint", kargs().ldb.matrix);
+
+    args.offA = addVar("offA", "cl_uint", kargs().offA);
+    args.offBX = addVar("offB", "cl_uint", kargs().offBX);
+
+    args.alpha = addVar("alpha", type,
+        multiplierToString(kargs().dtype, kargs().alpha));
+
+    // A is square: M x M for the left side, N x N otherwise
+    MatrixVariable *A = (kargs().side == clblasLeft)
+        ? addMatrix("A", ptrType, args.M, args.M, args.lda, args.offA)
+        : addMatrix("A", ptrType, args.N, args.N, args.lda, args.offA);
+    MatrixVariable *B = addMatrix("B", ptrType, args.M, args.N, args.ldb,
+        args.offBX);
+    MatrixVariable *naiveB = addMatrix("naiveB", ptrType, args.M, args.N,
+        args.ldb, args.offBX);
+    naiveB->setCopy(B);
+
+    // Default buffer names, replaced by the master step's names if present
+    std::string bufAName("bufA");
+    std::string bufBName("bufB");
+    if (masterStep != NULL) {
+        bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name();
+        bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name();
+    }
+
+    args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A);
+    args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_WRITE, B);
+
+    assignKargs(args);
+
+    // Text of the BLAS call that writes into naiveB
+    std::stringstream naive;
+    naive << getBlasFunctionName() << "(order, side, uplo, transA, diag, ";
+    naive << args.M->name() << ", " << args.N->name() << ", ";
+    naive << args.alpha->name() << ", " << A->matrixPointer() << ", ";
+    naive << args.lda->name() << ", " << naiveB->matrixPointer() << ", ";
+    naive << args.ldb->name() << ")";
+    naiveCall_ = naive.str();
+
+    // Text of the comparison of B against naiveB
+    std::stringstream cmp;
+    cmp << "compareMatrices(order, ";
+    cmp << args.M->name() << ", " << args.N->name() << ", ";
+    cmp << B->matrixPointer() << ", " << naiveB->matrixPointer() << ", ";
+    cmp << args.ldb->name() << ")";
+    compareCall_ = cmp.str();
+}
+
+/*
+ * Normalizes the TRMM kernel arguments: lda is raised to the order of the
+ * square matrix A (M for the left side, N otherwise), ldb is raised to the
+ * minimum required by the storage order, and the original problem size is
+ * saved in K. The adjusted copy is written back with setKargs().
+ */
+void
+TrmmStep::fixLD()
+{
+    CLBlasKargs args = kargs();
+    const bool leftSide = (args.side == clblasLeft);
+
+    if (leftSide) {
+        if (args.lda.matrix < args.M) {
+            args.lda.matrix = args.M;
+        }
+    }
+    else if (args.lda.matrix < args.N) {
+        args.lda.matrix = args.N;
+    }
+
+    // B is M x N: ldb is at least M for column-major, N for row-major
+    if (args.order == clblasColumnMajor) {
+        if (args.ldb.matrix < args.M) {
+            args.ldb.matrix = args.M;
+        }
+    }
+    else if (args.order == clblasRowMajor) {
+        if (args.ldb.matrix < args.N) {
+            args.ldb.matrix = args.N;
+        }
+    }
+
+    // Store original problem size in K, this is used to know it while
+    // calculating result by parts using M or N as part size
+    args.K = leftSide ? args.M : args.N;
+
+    setKargs(args);
+}
diff --git a/src/library/tools/ktest/steps/trmm.h b/src/library/tools/ktest/steps/trmm.h
new file mode 100644
index 0000000..4fe778f
--- /dev/null
+++ b/src/library/tools/ktest/steps/trmm.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_TRMM_H__
+#define KTEST_TRMM_H__
+
+#include "../step.h"
+
+namespace clMath {
+
+/*
+ * Step subclass for TRMM. Only declarations are given here; the member
+ * definitions live outside this header.
+ */
+class TrmmStep : public Step {
+public:
+    // Construct from an OpenCL device handle
+    TrmmStep(cl_device_id device);
+    // Construct from a list node
+    TrmmStep(ListNode *node);
+
+    // Adjusts leading dimensions of the step's kernel arguments
+    virtual void fixLD();
+    // Declares the step's variables and buffers; masterStep may be NULL
+    virtual void declareVars(Step *masterStep);
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_TRMM_H__
diff --git a/src/library/tools/ktest/steps/trsm.cpp b/src/library/tools/ktest/steps/trsm.cpp
new file mode 100644
index 0000000..1044d9f
--- /dev/null
+++ b/src/library/tools/ktest/steps/trsm.cpp
@@ -0,0 +1,142 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sstream>
+
+#include "trsm.h"
+
+using namespace clMath;
+
+// Build a TRSM step for a device: passes CLBLAS_TRSM and the device to Step.
+TrsmStep::TrsmStep(cl_device_id device) : Step(CLBLAS_TRSM, device)
+{
+}
+
+// Build a TRSM step from a list node, which is passed on to Step unchanged.
+TrsmStep::TrsmStep(ListNode *node) : Step(node)
+{
+}
+
+// Declare the constants, variables, matrices and buffers of a TRSM test
+// and compose the source text of the naive call, the result comparison
+// and the post-random call.  With a master step the buffers take the
+// names of the master's buffers instead of "bufA"/"bufB".
+void
+TrsmStep::declareVars(Step *masterStep)
+{
+    StepKargs vars;
+    memset(&vars, 0, sizeof(vars));
+
+    const std::string elemType = dtypeToString(kargs().dtype);
+    const std::string ptrType = elemType + "*";
+    const bool leftSide = (kargs().side == clblasLeft);
+
+    vars.M = addConst("M", "cl_uint", kargs().M);
+    vars.N = addConst("N", "cl_uint", kargs().N);
+    // K aliases M for a left-side operation and N otherwise
+    vars.K = leftSide ? vars.M : vars.N;
+
+    vars.lda = addConst("lda", "cl_uint", kargs().lda.matrix);
+    vars.ldb = addConst("ldb", "cl_uint", kargs().ldb.matrix);
+    vars.offA = addVar("offA", "cl_uint", kargs().offA);
+    vars.offBX = addVar("offB", "cl_uint", kargs().offBX);
+    vars.alpha = addVar("alpha", elemType,
+        multiplierToString(kargs().dtype, kargs().alpha));
+
+    // A is square (M x M or N x N, by side); B and naiveB are M x N
+    MatrixVariable *matA = leftSide
+        ? addMatrix("A", ptrType, vars.M, vars.M, vars.lda, vars.offA)
+        : addMatrix("A", ptrType, vars.N, vars.N, vars.lda, vars.offA);
+    MatrixVariable *matB =
+        addMatrix("B", ptrType, vars.M, vars.N, vars.ldb, vars.offBX);
+    MatrixVariable *refB =
+        addMatrix("naiveB", ptrType, vars.M, vars.N, vars.ldb, vars.offBX);
+    refB->setCopy(matB);
+
+    // Default buffer names, replaced by the master step's names if present
+    std::string bufAName("bufA");
+    std::string bufBName("bufB");
+    if (masterStep != NULL) {
+        bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name();
+        bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name();
+    }
+
+    vars.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, matA);
+    vars.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_WRITE, matB);
+
+    assignKargs(vars);
+
+    // Naive call: the BLAS function applied to A and naiveB
+    std::stringstream naive;
+    naive << getBlasFunctionName() << "(order, side, uplo, transA, diag, "
+          << vars.M->name() << ", " << vars.N->name() << ", "
+          << vars.alpha->name() << ", " << matA->matrixPointer() << ", "
+          << vars.lda->name() << ", " << refB->matrixPointer() << ", "
+          << vars.ldb->name() << ")";
+    naiveCall_ = naive.str();
+
+    // Comparison of B with naiveB
+    std::stringstream compare;
+    compare << "compareMatrices(order, " << vars.M->name() << ", "
+            << vars.N->name() << ", " << matB->matrixPointer() << ", "
+            << refB->matrixPointer() << ", " << vars.ldb->name() << ")";
+    compareCall_ = compare.str();
+
+    // Post-random call: setUpTRSMDiagonal() on A and B
+    std::stringstream postRandom;
+    postRandom << "setUpTRSMDiagonal(order, side, uplo, transA, diag, "
+               << vars.M->name() << ", " << vars.N->name() << ", "
+               << vars.alpha->name() << ", " << matA->matrixPointer() << ", "
+               << vars.lda->name() << ", " << matB->matrixPointer() << ", "
+               << vars.ldb->name() << ")";
+    postRandomCall_ = postRandom.str();
+}
+
+// Make the leading dimensions held in the kernel arguments large enough
+// for the current problem and record the problem size in K.
+void
+TrsmStep::fixLD()
+{
+    CLBlasKargs fixed = kargs();
+
+    // lda must cover M for a left-side operation and N otherwise.  The same
+    // value is stored in K as the original problem size; it is used to know
+    // it while calculating the result by parts with M or N as part size.
+    if (fixed.side == clblasLeft) {
+        fixed.K = fixed.M;
+        if (fixed.lda.matrix < fixed.M) {
+            fixed.lda.matrix = fixed.M;
+        }
+    }
+    else {
+        fixed.K = fixed.N;
+        if (fixed.lda.matrix < fixed.N) {
+            fixed.lda.matrix = fixed.N;
+        }
+    }
+
+    // ldb must cover M in column-major order and N in row-major order
+    if (fixed.order == clblasColumnMajor) {
+        if (fixed.ldb.matrix < fixed.M) {
+            fixed.ldb.matrix = fixed.M;
+        }
+    }
+    else if ((fixed.order == clblasRowMajor) && (fixed.ldb.matrix < fixed.N)) {
+        fixed.ldb.matrix = fixed.N;
+    }
+    setKargs(fixed);
+}
diff --git a/src/library/tools/ktest/steps/trsm.h b/src/library/tools/ktest/steps/trsm.h
new file mode 100644
index 0000000..5f415bb
--- /dev/null
+++ b/src/library/tools/ktest/steps/trsm.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_TRSM_H__
+#define KTEST_TRSM_H__
+
+#include "../step.h"
+
+namespace clMath {
+
+class TrsmStep : public Step {    // Step subclass for CLBLAS_TRSM
+public:
+    TrsmStep(cl_device_id device);     // passes CLBLAS_TRSM and device to Step
+    TrsmStep(ListNode *node);          // passes node to Step
+    // Both methods are defined in trsm.cpp
+    virtual void fixLD();              // raises lda/ldb where too small, stores the problem size in K
+    virtual void declareVars(Step *masterStep);   // declares test variables and composes the call strings
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_TRSM_H__
diff --git a/src/library/tools/ktest/var.cpp b/src/library/tools/ktest/var.cpp
new file mode 100644
index 0000000..bc68156
--- /dev/null
+++ b/src/library/tools/ktest/var.cpp
@@ -0,0 +1,199 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include "var.h"
+
+using namespace clMath;
+
+struct MemFlags {               // one cl_mem_flags value together with its spelling
+    cl_mem_flags flag;
+    const char* name;
+};
+// Name table scanned by Variable::flagsStr(); the { 0, NULL } entry ends the scan.
+static const struct MemFlags MEM_FLAGS[] = {
+    { CL_MEM_READ_WRITE,    "CL_MEM_READ_WRITE" },
+    { CL_MEM_WRITE_ONLY,    "CL_MEM_WRITE_ONLY" },
+    { CL_MEM_READ_ONLY,     "CL_MEM_READ_ONLY" },
+    { CL_MEM_USE_HOST_PTR,  "CL_MEM_USE_HOST_PTR" },
+    { CL_MEM_ALLOC_HOST_PTR,"CL_MEM_ALLOC_HOST_PTR" },
+    { CL_MEM_COPY_HOST_PTR, "CL_MEM_COPY_HOST_PTR" },
+    { 0, NULL }
+};
+
+// Create a plain variable: not a buffer, not constant, with no copy source,
+// no memory flags, no host pointer and the buffer ID BUFFER_NONE.
+Variable::Variable(const std::string& name, const std::string& type,
+    const std::string& defaultValue)
+{
+    name_ = name;
+    type_ = type;
+    defaultValue_ = defaultValue;
+    isBuffer_ = false;
+    constant_ = false;
+    // Give getBufID() a defined result even if setBufferID() is never called
+    bufID_ = BUFFER_NONE;
+    copyOf_ = NULL;
+    flags_ = 0;
+    hostPtr_ = NULL;
+}
+
+// Empty variable. Members are set directly: calling Variable("", "") here would only create a temporary.
+Variable::Variable() :
+    constant_(false), isBuffer_(false), bufID_(BUFFER_NONE), copyOf_(NULL), flags_(0), hostPtr_(NULL)
+{ }
+
+// Create a matrix variable with no size information attached yet; the
+// dimensions are supplied through setMatrixSize().
+MatrixVariable::MatrixVariable(const std::string& name,
+    const std::string& type, const std::string& defaultValue)
+{
+    // Inherited Variable state
+    name_ = name;
+    type_ = type;
+    defaultValue_ = defaultValue;
+    isBuffer_ = false;
+    constant_ = false;
+    bufID_ = BUFFER_NONE;   // give getBufID() a defined result
+    copyOf_ = NULL;
+    flags_ = 0;
+    hostPtr_ = NULL;
+    // Matrix geometry, unknown at construction time
+    rows_ = NULL;
+    columns_ = NULL;
+    ld_ = NULL;
+    off_ = NULL;
+}
+
+// Create a vector variable with no size information attached yet; the
+// element count, increment and offset are supplied via setVectorSize().
+VectorVariable::VectorVariable(const std::string& name,
+    const std::string& type, const std::string& defaultValue)
+{
+    // Inherited Variable state
+    name_ = name;
+    type_ = type;
+    defaultValue_ = defaultValue;
+    isBuffer_ = false;
+    constant_ = false;
+    bufID_ = BUFFER_NONE;   // give getBufID() a defined result
+    copyOf_ = NULL;
+    flags_ = 0;
+    hostPtr_ = NULL;
+    // Vector geometry, unknown at construction time
+    nElems_ = NULL;
+    inc_ = NULL;
+    off_ = NULL;
+}
+
+Variable::~Variable()   // empty: copyOf_ and hostPtr_ are not deleted here
+{
+}
+
+// Replace the default value text stored for this variable.
+void Variable::setDefaultValue(const std::string& defaultValue)
+{
+    defaultValue_ = defaultValue;
+}
+
+// Set the flag reported by constant().
+void Variable::setConstant(bool constant)
+{
+    constant_ = constant;
+}
+
+// Store the variable that copyOf() reports; the pointer is kept as given.
+void Variable::setCopy(Variable *copy)
+{
+    copyOf_ = copy;
+}
+
+// Attach the row and column counts plus the optional leading dimension and
+// offset to the matrix; nothing is changed unless both counts are given.
+void
+MatrixVariable::setMatrixSize(Variable *rows, Variable *columns,
+    Variable *ld, Variable *off)
+{
+    if (!rows || !columns) {
+        return;
+    }
+    rows_ = rows;
+    columns_ = columns;
+    ld_ = ld;
+    off_ = off;
+    // Pointer expression: "<name>", or "<name> + <offset name>" with an offset
+    matrixPointer_ = name_;
+    if (off_ != NULL) {
+        matrixPointer_.append(" + ").append(off_->name());
+    }
+}
+
+// Attach the element count plus the increment and optional offset to the
+// vector; nothing is changed when no element count is given.
+void
+VectorVariable::setVectorSize(Variable *nElems, Variable *inc, Variable *off)
+{
+    if (!nElems) {
+        return;
+    }
+    nElems_ = nElems;
+    inc_ = inc;
+    off_ = off;
+    // Pointer expression: "<name>", or "<name> + <offset name>" with an offset
+    vectorPointer_ = name_;
+    if (off_ != NULL) {
+        vectorPointer_.append(" + ").append(off_->name());
+    }
+}
+
+// Render flags_ as source text: "" for variables that are not "cl_mem",
+// "0" when no flag is set, else the names of the set flags joined by " | ".
+std::string
+Variable::flagsStr() const
+{
+    if (type_ != "cl_mem") {
+        return "";
+    }
+    if (flags_ == 0) {
+        return "0";
+    }
+
+    std::string joined;
+    // MEM_FLAGS ends with an entry whose flag is 0
+    for (const MemFlags *entry = MEM_FLAGS; entry->flag != 0; ++entry) {
+        if (flags_ & entry->flag) {
+            joined += joined.empty() ? "" : " | ";
+            joined += entry->name;
+        }
+    }
+    return joined;
+}
+
+// Store the memory flags; ignored unless the variable's type is "cl_mem".
+void Variable::setFlags(cl_mem_flags flags)
+{
+    if (type_.compare("cl_mem") == 0) {
+        flags_ = flags;
+    }
+}
+
+// Store the host pointer variable; ignored unless the type is "cl_mem".
+void Variable::setHostPtr(Variable *hostPtr)
+{
+    if (type_.compare("cl_mem") == 0) {
+        hostPtr_ = hostPtr;
+    }
+}
diff --git a/src/library/tools/ktest/var.h b/src/library/tools/ktest/var.h
new file mode 100644
index 0000000..0ebb107
--- /dev/null
+++ b/src/library/tools/ktest/var.h
@@ -0,0 +1,162 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef KTEST_VAR_H__
+#define KTEST_VAR_H__
+
+#include <CL/cl.h>
+#include <string>
+
+namespace clMath {
+
+typedef enum BufferID {     // buffer identifier kept in Variable::bufID_
+    BUFFER_NONE,
+    BUFFER_A,               // passed to addBuffer() for matrix A in TrsmStep::declareVars()
+    BUFFER_B,               // passed to addBuffer() for matrix B in TrsmStep::declareVars()
+    BUFFER_C
+} BufferID;
+
+/**
+ * @internal
+ * @brief Variable class
+ *
+ * Objects of this class store name, type and other attributes of variables
+ * necessary for further code generation. The class is the base of the
+ * array variable classes, which is why its destructor is virtual.
+ */
+
+class Variable {
+protected:
+    std::string name_;
+    std::string type_;
+    std::string defaultValue_;
+    bool constant_;
+    bool isBuffer_;
+    BufferID bufID_;
+
+    Variable *copyOf_;          // set by setCopy(); never deleted by this class
+
+    /* Buffer object info; the setters store it only when type_ is "cl_mem" */
+    cl_mem_flags flags_;
+    Variable *hostPtr_;
+
+public:
+    Variable(const std::string& name, const std::string& type,
+        const std::string& defaultValue = "");
+    Variable();
+    virtual ~Variable();    // virtual: derived objects may be deleted via Variable*
+
+    const std::string& name() const         { return name_; }
+    const std::string& type() const         { return type_; }
+
+    const std::string& defaultValue() const { return defaultValue_; }
+    void setDefaultValue(const std::string& defaultValue);
+
+    bool constant() const                   { return constant_; }
+    bool isBuffer() const                   { return isBuffer_; }
+    BufferID getBufID() const               { return bufID_; }
+    void setConstant(bool constant);
+    void setIsBuffer(bool isBuffer)         { isBuffer_ = isBuffer; }
+
+    Variable* copyOf() const                { return copyOf_; }
+    void setCopy(Variable *copy);
+
+    void setBufferID(BufferID bufID)        { bufID_ = bufID; }
+
+    cl_mem_flags flags() const              { return flags_; }
+    std::string flagsStr() const;           // flags as "A | B" source text
+    void setFlags(cl_mem_flags flags);
+
+    Variable* hostPtr() const { return hostPtr_; }
+    void setHostPtr(Variable *var);
+};
+
+class ArrayVariableInterface : public Variable {   // common base of MatrixVariable and VectorVariable
+public:
+    virtual bool isMatrix() = 0;    // MatrixVariable returns true
+    virtual bool isVector() = 0;    // VectorVariable returns true
+    virtual ~ArrayVariableInterface() {}
+};
+
+/**
+ * @internal
+ * @brief Matrix variable class
+ *
+ * Describes a matrix for code generation: the variables holding its row and
+ * column counts, leading dimension and offset, plus the pointer expression
+ * built from the matrix name and the offset.
+ */
+class MatrixVariable : public ArrayVariableInterface {
+public:
+    MatrixVariable(const std::string& name, const std::string& type,
+        const std::string& defaultValue = "");
+    ~MatrixVariable() {}
+
+    void setMatrixSize(Variable *rows, Variable *columns,
+        Variable *ld = NULL, Variable *off = NULL);
+
+    virtual bool isMatrix()                 { return true; }
+    virtual bool isVector()                 { return false; }
+
+    Variable* rows() const                  { return rows_; }
+    Variable* columns() const               { return columns_; }
+    Variable* ld() const                    { return ld_; }
+    Variable* off() const                   { return off_; }
+    const std::string& matrixPointer() const  { return matrixPointer_; }
+
+private:
+    Variable *rows_;
+    Variable *columns_;
+    Variable *ld_;
+    Variable *off_;
+    std::string matrixPointer_;     // filled in by setMatrixSize()
+};
+
+/**
+ * @internal
+ * @brief Vector variable class
+ *
+ * Describes a vector for code generation: the variables holding its element
+ * count, increment and offset, plus the pointer expression built from the
+ * vector name and the offset.
+ */
+class VectorVariable : public ArrayVariableInterface {
+public:
+    VectorVariable(const std::string& name, const std::string& type,
+        const std::string& defaultValue = "");
+
+    void setVectorSize(Variable *nElems, Variable *inc,
+        Variable *off = NULL);
+
+    virtual bool isMatrix()                 { return false; }
+    virtual bool isVector()                 { return true; }
+
+    Variable* nElems() const                { return nElems_; }
+    Variable* inc() const                   { return inc_; }
+    Variable* off() const                   { return off_; }
+    const std::string& vectorPointer() const  { return vectorPointer_; }
+
+private:
+    Variable *nElems_;
+    Variable *inc_;
+    Variable *off_;
+    std::string vectorPointer_;     // filled in by setVectorSize()
+};
+
+}   // namespace clMath
+
+#endif  // KTEST_VAR_H__
diff --git a/src/library/tools/tplgen/CMakeLists.txt b/src/library/tools/tplgen/CMakeLists.txt
new file mode 100644
index 0000000..a33e992
--- /dev/null
+++ b/src/library/tools/tplgen/CMakeLists.txt
@@ -0,0 +1,20 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+cmake_minimum_required(VERSION 2.6)
+project(tplgen C CXX)
+ADD_DEFINITIONS(/D_CRT_SECURE_NO_WARNINGS)   # MSVC-style preprocessor definition for the CRT headers
+ADD_EXECUTABLE(tplgen tplgen.cpp)            # the tool is built from this single source file
diff --git a/src/library/tools/tplgen/configure.bat b/src/library/tools/tplgen/configure.bat
new file mode 100644
index 0000000..b1f3db6
--- /dev/null
+++ b/src/library/tools/tplgen/configure.bat
@@ -0,0 +1,14 @@
+del CMakeCache.txt
+cmake -DCMAKE_BUILD_TYPE=debug -G "Visual Studio 10" ..\tplgen
+if NOT ERRORLEVEL 1 goto end
+IF ERRORLEVEL 4 goto try9
+IF ERRORLEVEL 3 goto try9
+IF ERRORLEVEL 2 goto try9
+IF ERRORLEVEL 1 goto try9
+goto end
+:: Reached when the Visual Studio 10 run exits with code 1 or higher: retry with Visual Studio 9 2008
+:try9
+del CMakeCache.txt
+cmake -DCMAKE_BUILD_TYPE=Debug -G "Visual Studio 9 2008" ..\tplgen
+
+:end
\ No newline at end of file
diff --git a/src/library/tools/tplgen/tplgen.cpp b/src/library/tools/tplgen/tplgen.cpp
new file mode 100644
index 0000000..25150aa
--- /dev/null
+++ b/src/library/tools/tplgen/tplgen.cpp
@@ -0,0 +1,165 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __GNUC__
+// Linux
+    #include <sys/types.h>
+    #include <sys/stat.h>
+    #include <unistd.h>
+#else
+// Windows
+    #include <time.h>
+    #include <sys/types.h>
+    #include <sys/stat.h>
+    #define stat _stat
+#endif
+
+using namespace std;
+
+// Decide whether clTFile has to be (re)generated from clFile.
+//
+//   clFile cannot be stat'ed   -> false (no CL file to process)
+//   clTFile cannot be stat'ed  -> true  (only the CL file is present)
+//   both can be stat'ed        -> true when clFile's st_mtime is not
+//                                 older than clTFile's, false otherwise
+bool isModified( char *clFile, char *clTFile )
+{
+    struct stat srcInfo;
+    struct stat dstInfo;
+
+    // Both queries are always issued, the CL file first
+    const bool srcMissing = ( stat( clFile, &srcInfo ) != 0 );
+    const bool dstMissing = ( stat( clTFile, &dstInfo ) != 0 );
+
+    if (srcMissing)
+    {
+        return false;
+    }
+
+    if (dstMissing)
+    {
+        // Force a CLT generation
+        return true;
+    }
+
+    // Equal timestamps count as modified
+    const bool notOlder = ( srcInfo.st_mtime >= dstInfo.st_mtime );
+    return notOlder;
+}
+
+// Convert each .cl file given on the command line into a .clT file whose
+// kernel strings carry explicit "\n" escapes and line continuations.
+// Usage: tplgen [-o <outputDir>] <file.cl>...
+// Returns -1 when the file list or the -o argument is missing, else 0.
+int main( int argc, char *argv[] )
+{
+    const char *outputPrefix = "";   // const: may point at a string literal
+    int startOptions = 1;
+    std::cout << "TPLGEN Running.....\n";
+    if (argc < 2)
+    {
+        return -1;
+    }
+
+    if (strcmp(argv[1], "-o") == 0)
+    {
+        if (argc < 3)
+        {
+            return -1;
+        }
+        outputPrefix = argv[2];
+        startOptions = 3;
+    }
+
+    for ( int i=startOptions; i<argc; i++ )
+    {
+        // Build "<prefix><separator><input>T" in a std::string so that long
+        // arguments cannot overrun a fixed-size buffer.  Without a prefix no
+        // separator is added, so the path is not forced into the root directory.
+        string cltFile( outputPrefix );
+        if( !cltFile.empty() )
+        {
+        #ifdef __GNUC__
+            cltFile += "/";
+        #else
+            cltFile += "\\";
+        #endif
+        }
+        cltFile.append( argv[i] ).append( "T" );
+        // isModified() takes non-const pointers but only hands them to stat()
+        if( !isModified( argv[i], const_cast<char *>( cltFile.c_str() ) ) )
+        {
+            continue;
+        }
+        std::cout << "Processing " << argv[i] << std::endl;
+        ifstream inFile( argv[i] );
+        ofstream outFile( cltFile.c_str() );
+        if( !(inFile.is_open()) || !(outFile.is_open()) )
+        {
+            cerr << "\tWARNING: couldn't open file!" << std::endl;
+            continue;
+        }
+        bool validKernel = false;   // true while inside a kernel string
+        int lineCount = 0;          // lines emitted since the kernel string opened
+        string str;
+        // Looping on getline() itself stops at end of input; testing good()
+        // beforehand let one extra, empty line through after the last newline.
+        while( getline( inFile, str ) )
+        {
+            // Replace all tabs with spaces
+            for( size_t tab = str.find( '\t' ); tab != string::npos;
+                 tab = str.find( '\t', tab + 4 ) )
+            {
+                str.replace( tab, 1, "    " );
+            }
+
+            // Find for beginning of the kernel
+            if ( !validKernel && (str.find( "char" ) != string::npos) && (str.find( '*' ) != string::npos)
+                 && (str.find( '"' ) != string::npos))       // Beginning of the kernel
+            {
+                validKernel = true;
+                outFile << str << "\\\n";
+                lineCount = 1;
+            }
+            // Find for end of kernel
+            else if( (str.find( "\";" ) != string::npos) && validKernel )
+            {
+                outFile << str << "\n\n\n";
+                validKernel = false;
+            }
+            else if( validKernel )
+            {
+                outFile << str << "\\n\\\n";                   // All other lines
+                lineCount ++;
+                // Break the string every 50 lines so that it does not overflow string limitations on windows
+                if( (lineCount%50) == 0 )
+                    outFile << "\"\n\"\\\n";
+            } else {
+                outFile << str << std::endl;
+            }
+        }
+        inFile.close();
+        outFile.close();
+    }
+
+    return 0;
+}
diff --git a/src/library/tools/tune/CMakeLists.txt b/src/library/tools/tune/CMakeLists.txt
new file mode 100644
index 0000000..dbfcce9
--- /dev/null
+++ b/src/library/tools/tune/CMakeLists.txt
@@ -0,0 +1,156 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+set(TOOLS_SRC    # sources of the tune tool that live in this directory
+    tune.c
+    toolslib.c
+    fileio.c
+    subdim.c
+    storage_data.c
+    storage_init.c
+    storage_io.c
+    dimension.c
+)
+
+# Sources from ../../blas and ../../common that are compiled into the tool
+set(TOOLS_EXTERNAL_SRC
+    ../../blas/generic/common.c
+    ../../blas/generic/blas_funcs.c
+    ../../blas/init.c
+    ../../blas/impl.c
+    ../../blas/scimage.c
+    ../../blas/generic/events.c
+    ../../blas/generic/matrix_props.c
+    ../../blas/generic/matrix_dims.c
+    ../../blas/gens/tile.c
+    ../../blas/gens/tile_iter.c
+    ../../blas/gens/blas_subgroup.c
+    ../../blas/gens/decomposition.c
+    ../../blas/gens/blas_kgen.c
+    ../../blas/gens/gen_helper.c
+    ../../blas/gens/tilemul.c
+    ../../blas/gens/fetch.c
+    ../../blas/gens/gen_init.c
+    ../../blas/gens/gemv.c
+    ../../blas/gens/symv.c
+    ../../blas/gens/gemm.c
+    ../../blas/gens/trmm.c
+    ../../blas/gens/trsm.c
+    ../../blas/gens/syrxk.c
+    ../../blas/gens/trxm_common.c
+    ../../blas/gens/trsm_kgen.c
+    ../../blas/gens/xxmv_common.c
+    ../../blas/gens/legacy/blas_kgen_legacy.c
+    ../../blas/gens/legacy/gen_helper_legacy.c
+    ../../blas/gens/legacy/trxm_common_legacy.c
+    ../../blas/gens/legacy/trsm_kgen_legacy.c
+    ../../blas/gens/legacy/blkmul.c
+    ../../blas/gens/legacy/gemm_lds.c
+    ../../blas/gens/legacy/gemm_img.c
+    ../../blas/gens/legacy/trmm_lds.c
+    ../../blas/gens/legacy/trmm_img.c
+    ../../blas/gens/legacy/trsm_lds.c
+    ../../blas/gens/legacy/trsm_img.c
+    ../../blas/gens/legacy/trsm_cached_lds.c
+    ../../common/devinfo.c
+    ../../common/kern_cache.c
+    ../../common/mutex.c
+    ../../common/list.c
+    ../../common/kerngen_core.c
+    ../../common/kgen_basic.c
+    ../../common/kgen_loop_helper.c
+    ../../common/misc.c
+    ../../common/kgen_guard.c
+    ../../common/clkern.c
+    ../../common/trace_malloc.c
+    ../../common/gens/dblock_kgen.c
+    ../../blas/generic/solution_seq_make.c
+    ../../blas/generic/solution_seq.c
+    ../../blas/generic/solution_assert.c
+    ../../blas/generic/problem_iter.c
+    ../../blas/generic/kernel_extra.c
+    ../../blas/generic/kdump.c
+    # C++ sources from ../../blas/gens (plus tuned_numbers.c)
+    ../../blas/gens/trmv_reg.cpp
+    ../../blas/gens/ger_lds.cpp
+    ../../blas/gens/trsv_trtri.cpp
+    ../../blas/gens/trsv_gemv.cpp
+    ../../blas/gens/kprintf.cpp
+    ../../blas/gens/syr_lds.cpp
+    ../../blas/gens/symm_cached.cpp
+    ../../blas/gens/gemm_cached.cpp
+    ../../blas/gens/gemm_tail_cached.cpp
+    ../../blas/gens/syr2_lds.cpp
+    ../../blas/gens/her_lds.cpp
+    ../../blas/gens/her2_lds.cpp
+    ../../blas/gens/gbmv.cpp
+    ../../blas/gens/tuned_numbers.c
+    ../../blas/gens/swap_reg.cpp
+    ../../blas/gens/scal_reg.cpp
+    ../../blas/gens/copy_reg.cpp
+    ../../blas/gens/axpy_reg.cpp
+    ../../blas/gens/dot.cpp
+    ../../blas/gens/reduction.cpp
+    ../../blas/gens/rotg_reg.cpp
+    ../../blas/gens/rotmg_reg.cpp
+    ../../blas/gens/rotm_reg.cpp
+    ../../blas/gens/iamax.cpp
+    ../../blas/gens/nrm2.cpp
+    ../../blas/gens/asum.cpp
+)
+
+include_directories(${OPENCL_INCLUDE_DIRS}    # header search paths: OpenCL first, then the clBLAS trees below
+    ${clBLAS_SOURCE_DIR}
+    ${clBLAS_SOURCE_DIR}/include
+    ${clBLAS_SOURCE_DIR}/library/blas/include
+    ${clBLAS_SOURCE_DIR}/library/blas/gens
+    ${clBLAS_BINARY_DIR}/include
+    ${clBLAS_SOURCE_DIR}/library/tools/tune
+)
+
+# Set up the Visual Studio source group for the tool's own sources
+source_group(\\ FILES ${TOOLS_SRC})
+# Optional switch: when ON, _DEBUG_TOOLS is added to the compile definitions
+option( BLAS_DEBUG_TOOLS "Compile extra debug logic with regards to tuning database" OFF )
+if( BLAS_DEBUG_TOOLS )
+    add_definitions( -D_DEBUG_TOOLS )
+endif()
+
+# Library with functions for time measurement. On Windows they are included automatically
+if(UNIX)
+    set(TIME_LIBRARY "rt")
+endif()
+
+# The executable is built from both source lists above and depends on the GENERATE_CLT target
+add_executable(tune ${TOOLS_SRC} ${TOOLS_EXTERNAL_SRC})
+add_dependencies(tune GENERATE_CLT)
+target_link_libraries(tune ${OPENCL_LIBRARIES} ${TIME_LIBRARY} ${MATH_LIBRARY})
+
+if( TARGET_PLATFORM EQUAL 64 )
+	# CPack configuration; include the executable in the package (64-bit destinations)
+	install( TARGETS tune
+			RUNTIME DESTINATION bin64
+			LIBRARY DESTINATION lib64
+			ARCHIVE DESTINATION lib64/import
+			)
+else()
+	# CPack configuration; include the executable in the package (32-bit destinations)
+	install( TARGETS tune
+			RUNTIME DESTINATION bin32
+			LIBRARY DESTINATION lib32
+			ARCHIVE DESTINATION lib32/import
+			)
+endif()
diff --git a/src/library/tools/tune/dimension.c b/src/library/tools/tune/dimension.c
new file mode 100644
index 0000000..21fe162
--- /dev/null
+++ b/src/library/tools/tune/dimension.c
@@ -0,0 +1,136 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <math.h>       // sqrt()
+
+#include "toolslib.h"
+#include "clblas_stddef.h"
+#include "storage_data.h"
+
+
+unsigned int  DimensionsArrayL3[]=  {7, 13, 32, 48, 64, 64};   // base sizes getDimension() scales unless the function is BLAS level 2
+unsigned int  DimensionsArrayL2[]=  {768/4, 1792/4, 3328/4, 5248/4, 6784/4,   // base sizes getDimension() scales for BLAS level 2
+                                     3*1024/4};
+
+int getDimensionCount(TargetDevice* tdev, int func)   /* always returns DIMARRAYCOUNT */
+{
+	(void)tdev;     /* unused */
+	(void)func;     /* unused */
+	return DIMARRAYCOUNT;
+}
+
+// Always returns 0; none of the arguments is consulted.
+dimension
+getDimensionID(TargetDevice* tdev, int func, size_t M, size_t N, size_t K)
+{
+    (void)tdev;
+	(void)func;
+	(void)M;
+	(void)N;
+	(void)K;
+
+	return 0;
+}
+
+#include <assert.h>
+
+// Compute the matrix dimension used for tuning record number idx.
+//
+// The step -- umin() of the compute unit count and 24 (1 for BLAS level 2
+// functions) -- is scaled by a data type factor and multiplied by entry idx
+// of DimensionsArrayL2 or DimensionsArrayL3.  The product is then limited
+// by maxMemAllocSize and by a third of globalSize, rounded with roundUp()
+// and finally moved onto or off a multiple of the bank aligned size.
+//
+//   idx      index into the base dimension table
+//   dt       element data type
+//   devInfo  supplies nrComputeUnits, maxMemAllocSize and globalSize
+//   func     BLAS function ID
+//   returns  the dimension, in elements
+//
+// NOTE(review): idx is not range-checked here, and "dim - noTailStep/2"
+// presumably expects dim to be at least half a step -- confirm with callers.
+unsigned int
+getDimension(int idx, DataType dt, DeviceInfo *devInfo, int func)
+{
+    // bas - banks aligned size, in bytes, should be
+    // number of banks * number of channels * bytes per channel
+    // here it is set to 8*256 = 2048 = 512 floats
+    const size_t bas = 8*256;
+    const unsigned int tsize = dtypeSize(dt);
+    // Rounding step: the byte size of 256 floats divided by tsize
+    const size_t noTailStep = 256 * sizeof(cl_float) / tsize;
+    const int isLevel2 = (funcBlasLevel(func) == 2);
+    const unsigned int *baseDims =
+        isLevel2 ? DimensionsArrayL2 : DimensionsArrayL3;
+    unsigned int dim;
+    int bankAligned;
+    float step;
+
+    // Base step taken from the number of compute units
+    step = (float)umin(devInfo->nrComputeUnits, isLevel2 ? 1 : 24);
+
+    // Data type dependent scale factor
+    switch (dt) {
+        case TYPE_FLOAT:
+            step *= 4;
+            break;
+        case TYPE_DOUBLE:
+        case TYPE_COMPLEX_FLOAT:
+            step = 2.8f * step;
+            break;
+        case TYPE_COMPLEX_DOUBLE:
+#if defined(_WIN32) && defined(FORCE_BSOD)
+            if (func != CLBLAS_SYRK && func != CLBLAS_SYR2K) {
+                step *= 2;
+            }
+#else
+            step *= 2;
+#endif
+            break;
+    }
+
+    dim = (unsigned int)(step * baseDims[idx]);
+
+    // dim * dim elements must not exceed the largest single allocation ...
+    if (dim * dim * tsize > devInfo->maxMemAllocSize) {
+        dim = (unsigned int)sqrt((double)(devInfo->maxMemAllocSize / tsize));
+    }
+
+    // ... and must stay below a third of the global memory size
+    assert(devInfo->globalSize);
+    if (dim * dim * tsize >= devInfo->globalSize / 3) {
+        dim = (unsigned int)sqrt((double)devInfo->globalSize / 3 / tsize);
+    }
+
+    // Round with a granularity of noTailStep
+    dim = (unsigned int)roundUp(dim - (noTailStep/2), noTailStep);
+
+    bankAligned = (dim * dtypeSize(dt) % bas == 0);
+    if (idx == BANK_ALIGNED_CASE_RECORD_IDX) {
+        // force size to be banks aligned
+        if (!bankAligned) {
+            dim = (unsigned int)roundUp(dim, bas / dtypeSize(dt));
+        }
+    }
+    else if (bankAligned) {
+        // avoid a banks aligned size by adding one more rounding step
+        dim += (unsigned int)noTailStep;
+    }
+
+    return dim;
+}
diff --git a/src/library/tools/tune/fileio.c b/src/library/tools/tune/fileio.c
new file mode 100644
index 0000000..4f68e92
--- /dev/null
+++ b/src/library/tools/tune/fileio.c
@@ -0,0 +1,388 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <assert.h>
+
+#include "fileio.h"
+
+#ifdef _WIN32
+const char dirDelimiter = '\\';
+#else
+const char dirDelimiter = '/';
+#endif
+
+//TODO
+typedef unsigned int      uint32_t;
+typedef uint32_t  uint_least32_t;
+/*
+  Name  : CRC-32
+  Poly  : 0x04C11DB7    x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11
+                       + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
+  Init  : 0xFFFFFFFF
+  Revert: true
+  XorOut: 0xFFFFFFFF
+  Check : 0xCBF43926 ("123456789")
+  MaxLen: 268 435 455 bytes (2 147 483 647 bits)
+*/
+const uint_least32_t Crc32Table[256] = {
+    0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA,
+    0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+    0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+    0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+    0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE,
+    0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+    0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,
+    0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+    0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+    0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+    0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940,
+    0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+    0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116,
+    0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+    0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+    0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+    0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A,
+    0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+    0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818,
+    0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+    0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+    0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+    0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C,
+    0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+    0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2,
+    0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+    0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+    0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+    0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086,
+    0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+    0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4,
+    0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+    0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+    0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+    0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
+    0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+    0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE,
+    0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+    0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+    0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+    0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252,
+    0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+    0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60,
+    0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+    0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+    0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+    0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04,
+    0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+    0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A,
+    0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+    0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+    0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+    0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E,
+    0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+    0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C,
+    0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+    0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+    0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+    0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0,
+    0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+    0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6,
+    0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+    0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+    0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+};
+
+void
+hfInit(HfInfo* hf) {
+	hf->file = NULL;
+	hf->hash = 0;
+}
+// Join path, name and ext as "path<delimiter>name.ext" in a malloc'ed buffer
+// that the caller must free; returns NULL when the allocation fails.
+char *
+hfCreateFullPatch( const char* path, const char * name, const char * ext )
+{
+    char * fname;
+    size_t pathSize;
+    size_t nameSize;
+    size_t extSize;
+    pathSize = strlen(path);
+    nameSize = strlen(name);
+    extSize  = strlen(ext);
+    // Add three characters, terminating zero, slash and dot
+    fname = (char*) malloc((pathSize + nameSize + extSize + 3) * sizeof(char));
+    if (fname == NULL) {
+        return NULL;
+    }
+    strcpy(fname, path);
+    // Separate path and name; the pathSize check keeps "" from reading fname[-1]
+    if (pathSize > 0 && fname[pathSize - 1] != dirDelimiter && nameSize > 0){
+        fname[pathSize] = dirDelimiter;
+        fname[pathSize + 1] = '\0';
+    }
+    strcat(fname, name);
+    strcat(fname, ".");
+    strcat(fname, ext);
+    return fname;
+}
+
+uint_least32_t
+Crc32(const unsigned char * buf, size_t len)
+{
+    uint_least32_t crc = 0xFFFFFFFF;
+    while (len--)
+        crc = (crc >> 8) ^ Crc32Table[(crc ^ *buf++) & 0xFF];
+    return crc ^ 0xFFFFFFFF;
+}
+
+uint_least32_t
+Crc32Add(const unsigned char * buf, size_t len, uint_least32_t crc)
+{
+    //uint_least32_t crc = 0xFFFFFFFF;
+    while (len--)
+        crc = (crc >> 8) ^ Crc32Table[(crc ^ *buf++) & 0xFF];
+    return crc ^ 0xFFFFFFFF;
+}
+
+int
+hfOpenRead(HfInfo* hf, const char* filename)
+{
+    hf->hash = 0;
+#ifdef _DEBUG_TOOLS
+    hf->fileLog = NULL;
+#endif // _DEBUG
+
+    if (filename == NULL) {
+        return FILE_NOT_FOUND;
+    }
+    hf->file = fopen(filename, "rb");
+    if (hf->file == NULL){
+        return  FILE_NOT_FOUND;
+    }
+    return FILE_OK;
+}
+
+
+int
+hfOpenWrite(HfInfo* hf, const char* filename)
+{
+    hf->hash = 0;
+#ifdef _DEBUG_TOOLS
+    {
+        char* logName = hfCreateFullPatch(filename, "", "log");
+        hf->fileLog = fopen(logName, "w");
+        free(logName);
+    }
+#endif // _DEBUG
+    hf->file = fopen(filename, "wb");
+    if (!hf->file){
+        return FILE_ERROR_OPEN_FOR_WRITING;
+    }
+
+    return FILE_OK;
+}
+
+// Reopen an existing file for in-place update ("rb+"), first closing any file
+// hf still holds. Returns FILE_OK, or FILE_ERROR_OPEN_FOR_WRITING on failure.
+int
+hfOpenReWrite(HfInfo* hf, const char* filename)
+{
+    hf->hash = 0;
+    if (hf->file != NULL){
+        hfClose(hf);
+    }
+#ifdef _DEBUG_TOOLS
+    {
+        char* logName = hfCreateFullPatch(filename, "", "log");
+        hf->fileLog = fopen(logName, "a");
+        fprintf(hf->fileLog, " ====================== \n");
+        free(logName);
+    }
+#endif // _DEBUG_TOOLS
+    hf->file = fopen(filename, "rb+");
+    return (hf->file != NULL) ? FILE_OK : FILE_ERROR_OPEN_FOR_WRITING;
+}
+
+int
+hfClose( HfInfo* hf )
+{
+    int ret = 0;
+    if (hf->file != NULL){
+        ret = fclose(hf->file);
+        hf->file = NULL;
+    }
+#ifdef _DEBUG_TOOLS
+    if (hf->fileLog != NULL){
+        ret = fclose(hf->fileLog);
+        hf->fileLog = NULL;
+    }
+#endif // _DEBUG
+    return ret;
+}
+
+int
+hfWrite( HfInfo* hf, const void* buff, size_t size )
+{
+    hf->hash = Crc32Add(buff, size, hf->hash);
+#ifdef _DEBUG_TOOLS
+    hf->start = ftell(hf->file);
+#endif // _DEBUG
+    fwrite(buff, size, 1, hf->file);
+#ifdef _DEBUG_TOOLS
+    hf->end = ftell(hf->file);
+    fprintf(hf->fileLog, "       %8d - %8d (%8d) \n",(int)hf->start, (int)hf->end, (int)size );
+    fflush(hf->fileLog);
+#endif // _DEBUG
+    return 0;
+}
+
+int
+hfWriteCRC( HfInfo* hf )
+{
+#ifdef _DEBUG_TOOLS
+    hf->start = ftell(hf->file);
+#endif // _DEBUG
+    fwrite (&hf->hash, sizeof(hf->hash), 1, hf->file);
+    hf->hash = 0;
+
+#ifdef _DEBUG_TOOLS
+    hf->end = ftell(hf->file);
+    fprintf(hf->fileLog, "CRC    %8d - %8d (%8lu) \n",(int)hf->start, (int)hf->end, sizeof(hf->hash) );
+    fflush(hf->fileLog);
+#endif // _DEBUG
+    //}
+    fflush(hf->file);
+    return FILE_OK;
+}
+
+int
+hfReadWithoutCRC( HfInfo* hf, void* buff, size_t size )
+{
+    size_t readSize;
+    readSize = fread(buff, 1, size, hf->file);
+    return (int)readSize;
+}
+
+int
+hfRead( HfInfo* hf, void* buff, int c, size_t size )
+{
+    size_t readSize;
+    int i=0;
+#ifdef _DEBUG_TOOLS
+    hf->start = ftell(hf->file);
+#endif // _DEBUG
+
+    readSize = fread(buff, size, c, hf->file);
+    if (readSize != (size_t)c){
+        return FILE_ERROR_READ_DATA;
+    }
+    for (; i < c; ++i){
+        hf->hash = Crc32Add((const unsigned char*)buff + (i*size), size, hf->hash);
+    }
+#ifdef _DEBUG_TOOLS
+    hf->end = ftell(hf->file);
+#endif // _DEBUG
+    return FILE_OK;
+}
+
+// Read `size` bytes through hfRead() and compare them with `buff`.
+// Returns FILE_OK on a match, FILE_ERROR_BUFFER_MISMATCH on a difference,
+// or FILE_ERROR_READ_DATA when the bytes could not be read or buffered.
+int
+hfReadConst( HfInfo* hf, const void* buff, size_t size )
+{
+    void* buff2 = malloc(size);
+    int ret = (buff2 != NULL) ? hfRead(hf, buff2, 1, size) : FILE_ERROR_READ_DATA;
+
+    if (ret == FILE_OK && memcmp(buff, buff2, size) != 0){
+        ret = FILE_ERROR_BUFFER_MISMATCH;
+    }
+    free(buff2);
+    return ret;
+}
+
+int
+hfCheckCRC( HfInfo* hf )
+{
+    int ret;
+    TYPECRC crc = 0;
+    size_t readSize;
+
+#ifdef _DEBUG_TOOLS
+    hf->start = ftell(hf->file);
+#endif // _DEBUG
+
+    readSize = fread(&crc, sizeof(crc), 1, hf->file);
+    if (readSize == 1){
+        if (crc == hf->hash){
+            ret = FILE_OK;
+        }
+        else {
+            ret = FILE_ERROR_CRC;
+        }
+
+    }
+    else {
+        ret = FILE_ERROR_READ_DATA;
+    }
+
+#ifdef _DEBUG_TOOLS
+    hf->end = ftell(hf->file);
+#endif // _DEBUG
+
+    hf->hash = 0;
+    return ret ;
+}
+
+int
+hfReadString( HfInfo* hf, char** str )
+{
+    int status;
+    unsigned int strLen;
+    //long int pos = ftell(hf->file);
+    status = hfRead(hf, &strLen, 1,  sizeof(unsigned int));
+    *str =  malloc(strLen + 1);
+    status += hfRead(hf, *str, 1, (size_t)strLen);
+    (*str)[strLen] = '\0';
+    return status;
+}
+
+int
+hfWriteString( HfInfo* hf, const char* buff )
+{
+    int status;
+    unsigned int strLen = (unsigned int)strlen(buff);
+    status = hfWrite(hf, &strLen, sizeof(unsigned int));
+    status = hfWrite(hf, buff, strLen);
+    return status;
+}
+
+
+int
+hfJump( HfInfo* hf, POSFILE pos )
+{
+    fseek(hf->file, (long)pos, SEEK_SET);
+    hf->hash = 0;
+    return FILE_OK;
+}
+
+int
+hfGetCurentPosition( HfInfo* hf, POSFILE* pos )
+{
+    *pos = ftell(hf->file);
+    return FILE_OK;
+}
diff --git a/src/library/tools/tune/fileio.h b/src/library/tools/tune/fileio.h
new file mode 100644
index 0000000..eed5223
--- /dev/null
+++ b/src/library/tools/tune/fileio.h
@@ -0,0 +1,96 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef FILEIO_H__
+#define FILEIO_H__
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <defbool.h>
+
+#include <trace_malloc.h>
+
+#define FILE_OK                         0x0000
+#define FILE_NOT_FOUND                  0x0100
+#define FILE_ERROR_OPEN_FOR_WRITING     0x0101
+#define FILE_ERROR_READ_DATA            0x0201
+#define FILE_ERROR_RESERVED_OVERFLOW    0x0501
+#define FILE_ERROR_RESERVED_NOT_FULL    0x0502
+#define FILE_ERROR_BUFFER_MISMATCH      0x0601
+#define FILE_ERROR_CRC                  0x0701
+#define FILE_ERROR_INDALID_KERNAL_SIZE  0x0801
+
+typedef unsigned int TYPECRC;
+
+#if defined (_WIN32)
+typedef  unsigned __int64 POSFILE;
+#else
+#include <sys/types.h>
+
+typedef u_int64_t POSFILE;
+#endif
+
+typedef struct HfInfo
+{
+    FILE*       file;
+    TYPECRC     hash;       // CRC32
+#ifdef _DEBUG_TOOLS
+    FILE*       fileLog;
+    POSFILE     start;
+    POSFILE     end;
+#endif // _DEBUG
+
+}HfInfo;
+
+// Structure initialization
+void hfInit(HfInfo* hf);
+// Open file for reading
+int hfOpenRead (HfInfo* hf, const char* filename);
+// Open file for writing.
+// if _DEBUG_TOOLS macro is defined, the log file is created.
+int hfOpenWrite(HfInfo* hf, const char* filename);
+int hfOpenReWrite(HfInfo* hf, const char* filename);
+
+int hfReadWithoutCRC( HfInfo* hf, void* buff, size_t size );
+
+int hfRead(HfInfo* hf, void* buff, int c, size_t size);
+// Skip data while still calculating the CRC
+// int hfSkip(HfInfo* hf, size_t c, size_t  size);
+// Jump to position "pos" without calculating the CRC (the running CRC is reset)
+int hfJump(HfInfo* hf, POSFILE  pos);
+//
+int hfGetCurentPosition(HfInfo* hf, POSFILE* pos);
+
+int hfReadString(HfInfo* hf, char** str);
+
+//! Read data and compare with buff
+//! \return FILE_ERROR_BUFFER_MISMATCH if the data read differs from buff
+int hfReadConst(HfInfo* hf, const void* buff, size_t size);
+//!
+int hfCheckCRC(HfInfo* hf);
+
+
+
+int hfWrite(HfInfo* hf, const void* buff, size_t size);
+int hfWriteString(HfInfo* hf, const char* buff);
+int hfWriteCRC(HfInfo* hf);
+
+int hfClose(HfInfo* hf);
+
+char * hfCreateFullPatch( const char* path, const char * name, const char * ext );
+
+#endif /* FILEIO_H__ */
diff --git a/src/library/tools/tune/storage_data.c b/src/library/tools/tune/storage_data.c
new file mode 100644
index 0000000..a3d5708
--- /dev/null
+++ b/src/library/tools/tune/storage_data.c
@@ -0,0 +1,374 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include "storage_data.h"
+#include "assert.h"
+
+BlasParamInfo*
+findParam(
+    StorageCacheImpl* cacheImpl,
+    const char* pattName,
+    const DataType dt,
+    const KernelExtraFlags kflag,
+    int dim)
+{
+    unsigned int func;
+    BlasFunctionInfo *functionInfo = cacheImpl->functionInfo;
+    //unsigned int mask[BLAS_FUNCTIONS_NUMBER];
+
+    //initMask(mask);
+    for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func) {
+        unsigned int patt;
+
+        BlasFunctionInfo* bFunc = &functionInfo[func];
+        for (patt =0; patt < bFunc->numPatterns; ++ patt) {
+            unsigned int extra;
+
+            BlasPatternInfo* bPatt = &bFunc->pattInfo[patt];
+            if (strcmp(bPatt->name, pattName) == 0) {
+                KernelExtraFlags flag  = kflag & bFunc->maskForTuningsKernel;
+                for (extra =0; extra < bPatt->numExtra; ++ extra) {
+                    BlasExtraInfo* bExtra = &bPatt->extra[extra];
+                    if (bExtra->dtype == dt && bExtra->flags == flag)
+                    {
+                        unsigned int param;
+                        BlasParamInfo* bestParam    = NULL;
+                        unsigned int bestDimDelta = 50000;
+
+                        if (dim == 0) {
+                            //leading dimension banks aligned case
+                            bestParam =
+                                &bExtra->param[BANK_ALIGNED_CASE_RECORD_IDX];
+                        }
+                        else {
+                            for (param = 0; param < bExtra->numParam; ++param) {
+                                BlasParamInfo* bParam = &bExtra->param[param];
+                                unsigned int dimDelta = abs(dim - bParam->dim);
+
+                                if (param == BANK_ALIGNED_CASE_RECORD_IDX) {
+                                    continue;
+                                }
+
+                                if (dimDelta < bestDimDelta){
+                                    bestDimDelta = dimDelta;
+                                    bestParam    = bParam;
+                                }
+                            }
+                        }
+                        return bestParam;
+                    }
+                }
+            }
+        }
+    }
+    return NULL;
+}
+
+
+BlasPatternInfo *
+getPatternInfo(StorageCacheImpl* cache, unsigned int func, unsigned int patt)
+{
+	BlasPatternInfo* bPatt = NULL;
+
+	if (func != BLAS_FUNCTIONS_NUMBER) {
+		BlasFunctionInfo* bFunc = &cache->functionInfo[func];
+
+		bPatt = &bFunc->pattInfo[patt];
+	}
+	return bPatt;
+}
+
+
+void
+nextPattern(StorageCacheImpl* cache, unsigned int* func, unsigned int* patt)
+{
+	BlasFunctionInfo* bFunc = &cache->functionInfo[*func];
+
+	(*patt)++;
+	if (bFunc->numPatterns == *patt) {
+		(*func)++;
+		*patt = 0;
+	}
+}
+
+////////////////////////////////////////////////////////////////////////////////
+bool
+isValidFlagMatrix(DataType curType, unsigned int  flags)
+{
+    bool ret;
+    // todo Make refactoring expressions. NOTE: a true result marks flags to skip.
+
+    ret = !isComplexType(curType)
+             && ( (flags & KEXTRA_CONJUGATE_A) || (flags & KEXTRA_CONJUGATE_B));
+    //  The flag KEXTRA_CONJUGATE_X can be set TRUE only when the flag KEXTRA_TRANS_X is TRUE.
+    ret = ret || (flags & (KEXTRA_TRANS_A | KEXTRA_CONJUGATE_A))
+    		== KEXTRA_CONJUGATE_A;
+    ret = ret || (flags & (KEXTRA_TRANS_B | KEXTRA_CONJUGATE_B))
+    		== KEXTRA_CONJUGATE_B;
+
+    return ret;
+}
+
+size_t
+getDTypeArray(DataType * dTypes, size_t dtypeCount, DeviceInfo* defInf )
+{
+    if (dtypeCount < 4) {
+        return 0;
+    }
+    if (defInf->nativeDouble) {
+        if (defInf->nativeComplex) {
+            dTypes[0] =  TYPE_FLOAT;
+            dTypes[1] =  TYPE_COMPLEX_FLOAT;
+            dTypes[2] =  TYPE_DOUBLE;
+            dTypes[3] =  TYPE_COMPLEX_DOUBLE;
+            dtypeCount = 4;
+        }
+        else {
+            dTypes[0] =  TYPE_FLOAT;
+            dTypes[1] =  TYPE_DOUBLE;
+            dtypeCount = 2;
+        }
+    }
+    else {
+        if (defInf->nativeComplex) {
+            dTypes[0] =  TYPE_FLOAT;
+            dTypes[1] =  TYPE_COMPLEX_FLOAT;
+            dtypeCount = 2;
+        }
+        else {
+            dTypes[0] =  TYPE_FLOAT;
+            dtypeCount = 1;
+        }
+    }
+    return dtypeCount;
+}
+
+// Reset a tuning record: zero the sub-dimension, granularity and kernel tables,
+// store a sentinel time and the given dim, and mark the record as not loaded.
+void
+initParamData (BlasParamInfo* bParam, int dim)
+{
+    memset(bParam->sDim,        0, sizeof(bParam->sDim));
+    memset(&bParam->pGran,      0, sizeof(bParam->pGran));
+    memset(bParam->kernel,      0, sizeof(bParam->kernel));
+    memset(bParam->kSize,       0, sizeof(bParam->kSize)); // unsigned int[], not size_t[]
+    bParam->time = 1e50; // any large number;
+    bParam->dim  = dim;
+    bParam->offset = 0;
+    bParam->size   = 0;
+    bParam->sstatus = SS_NOLOAD;
+}
+
+void
+initExtraData(BlasExtraInfo* bExtra, DataType dTypes, unsigned int flags, DeviceInfo* di)
+{
+    unsigned int param;
+    int func = bExtra->parent->parent->funcNo;
+
+    assert(bExtra->param == 0);
+
+    bExtra->dtype = dTypes;
+    bExtra->flags = flags;
+
+    if (isComplexType(dTypes)) {
+        bExtra->vecLen = 2;
+    }
+    else {
+        bExtra->vecLen = 4;
+    }
+
+    bExtra->numParam = getDimensionCount(di->tdev, func);
+
+    bExtra->offset = 0;
+    bExtra->size   = 0;
+    bExtra->sstatus = SS_NOLOAD;
+
+    bExtra->param = calloc( bExtra->numParam, sizeof(BlasParamInfo));
+    for (param = 0; param < bExtra->numParam; ++param) {
+        BlasParamInfo* bParam = &bExtra->param[param];
+        initParamData(bParam, getDimension(param, bExtra->dtype, di, func));
+     }
+}
+
+int
+genExtraDatasForPattern(
+    BlasPatternInfo* bPatt,
+    unsigned int tuningsMask,
+    unsigned int uniqueMask,
+    DeviceInfo* defInf)
+{
+    size_t dtypeCount;
+    size_t ndt;
+    unsigned int  flags;
+    unsigned int index;
+    DataType  dTypes[4];
+    BlasExtraInfo* extra;
+    BlasFunctionInfo* bFunc;
+    unsigned int extraCount;
+
+    bFunc = bPatt->parent;
+    extra = bPatt->extra;
+    extraCount = bPatt->numExtra;
+    bPatt->numTuneExtra = 0;
+
+    dtypeCount = getDTypeArray(dTypes, 4, defInf);
+    index = 0;
+    for (flags = 0; flags <= uniqueMask; flags++) {
+        unsigned int m = flags & (~uniqueMask);
+        if (!m){
+            for (ndt = 0; ndt < dtypeCount; ++ndt) {
+                DataType curType = dTypes[ndt];
+                if ( bFunc->isValidFlag != NULL
+                     && bFunc->isValidFlag(curType, flags)) {
+                    continue;
+                }
+
+                if (extra != NULL) {
+                    unsigned int tm;
+                    if (index == extraCount) {
+                        return index;
+                    }
+
+                    extra[index].parent = bPatt;
+                    initExtraData(&extra[index], dTypes[ndt], flags, defInf);
+                    tm = flags & (~tuningsMask);
+                    extra[index].isUseForTunning = tm == 0;
+                    if (extra[index].isUseForTunning) {
+                        bPatt->numTuneExtra++;
+                    }
+                }
+                ++index;
+            }
+        }
+        else {
+            m = (m&(m-1))^m;
+            flags = flags + m - 1;
+        }
+    }
+    return index;
+}
+
+void
+initPatternData (BlasPatternInfo*  bPatt, DeviceInfo* defInf)
+{
+    unsigned int tuningsMask = bPatt->parent->maskForTuningsKernel;
+    unsigned int uniqueMask = bPatt->parent->maskForUniqueKernels;
+
+    assert(bPatt->numExtra == 0);
+    assert(bPatt->extra == 0);
+
+    bPatt->numExtra = genExtraDatasForPattern(bPatt, tuningsMask,
+                uniqueMask, defInf);
+
+    bPatt->offset = 0;
+    bPatt->size   = 0;
+    bPatt->sstatus = SS_NOLOAD;
+
+    bPatt->extra = calloc( bPatt->numExtra, sizeof(BlasExtraInfo));
+    genExtraDatasForPattern(bPatt, tuningsMask, uniqueMask, defInf);
+}
+
+void
+initFuncData (BlasFunctionInfo* bFunc, DeviceInfo* defInf)
+{
+    unsigned int patt;
+    bFunc->isValidFlag = isValidFlagMatrix;
+
+    if (bFunc->initFunctionInfo != NULL) {
+        bFunc->initFunctionInfo(bFunc);
+    }
+
+    for (patt = 0 ; patt < bFunc->numPatterns; ++patt) {
+        BlasPatternInfo*  bPatt = &bFunc->pattInfo[patt];
+        bPatt->parent = bFunc;
+        bPatt->name = bFunc->pattern[patt].name;
+        bPatt->pattNo = patt;
+        initPatternData (bPatt, defInf);
+    }
+}
+
+void
+initCacheData (BlasFunctionInfo* bFuncs, DeviceInfo* defInf)
+{
+    unsigned int func;
+
+    for (func=0; func < BLAS_FUNCTIONS_NUMBER; ++func) {
+        BlasFunctionInfo* bFunc = &bFuncs[func];
+        bFunc->funcNo = func;
+        initFuncData(bFunc, defInf);
+    }
+}
+
+// Clear every recorded kernel size of a tuning record.
+void
+destroyParamData(BlasParamInfo* bParam)
+{
+    int k;
+    for (k=0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) {
+        bParam->kSize[k] = 0;   // index with k; kSize[0] cleared only one slot
+    }
+}
+
+void
+destroyExtraData(BlasExtraInfo* bExtra)
+{
+    unsigned int param;
+
+    if (bExtra == NULL) {
+        return;
+    }
+
+    for (param = 0; param < bExtra->numParam; ++param) {
+        BlasParamInfo* bParam = &bExtra->param[param];
+        destroyParamData(bParam);
+    }
+    free(bExtra->param);
+}
+
+void
+destroyPatternData(BlasPatternInfo*  bPatt)
+{
+    unsigned int extra;
+
+    for (extra = 0 ; extra < bPatt->numExtra; ++extra){
+        BlasExtraInfo*  bExtra = &bPatt->extra[extra];
+        destroyExtraData (bExtra);
+    }
+    free (bPatt->extra);
+}
+
+void
+destroyFuncData(BlasFunctionInfo* bFunc)
+{
+    unsigned int patt;
+
+    for (patt = 0 ; patt < bFunc->numPatterns; ++patt) {
+        BlasPatternInfo*  bPatt = &bFunc->pattInfo[patt];
+        destroyPatternData (bPatt);
+    }
+}
+
+void
+destroyData(BlasFunctionInfo* fInfo)
+{
+    unsigned int func;
+
+    for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func){
+        destroyFuncData( &fInfo[func]);
+    }
+}
+
diff --git a/src/library/tools/tune/storage_data.h b/src/library/tools/tune/storage_data.h
new file mode 100644
index 0000000..5efcf5e
--- /dev/null
+++ b/src/library/tools/tune/storage_data.h
@@ -0,0 +1,201 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef STORAGEDATA_H_
+#define STORAGEDATA_H_
+
+#include <malloc.h>
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <CL/cl.h>
+
+#include <trace_malloc.h>
+
+#include "toolslib.h"
+#include "solution_seq.h"
+#include "matrix_dims.h"
+
+//
+typedef unsigned int OFFSET;
+
+/* Device information needed for tuning CLBLAS kernels. */
+typedef struct CLDeviceInfoRec {
+    cl_uint       nrComputeUnits;   /* CL_DEVICE_MAX_COMPUTE_UNITS */
+    unsigned int  nrStreamCores;    /* Number of stream cores per Compute Unit */
+    cl_ulong      globalSize;       /* CL_DEVICE_GLOBAL_MEM_SIZE */
+    cl_ulong      maxMemAllocSize;  /* CL_DEVICE_MAX_MEM_ALLOC_SIZE */
+    cl_ulong      ldsSize;          /* CL_DEVICE_LOCAL_MEM_SIZE */
+    unsigned int  wavefront;        /* Number of work-items executed in parallel on hardware */
+    cl_uint       alignment;        /* CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE */
+    unsigned int  addressBits;      /* CL_DEVICE_ADDRESS_BITS */
+    size_t        workItemSizes[3]; /* CL_DEVICE_MAX_WORK_ITEM_SIZES */
+    cl_uint       workItemSizesDim; /* CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS */
+    size_t        workGroupSizes;   /* CL_DEVICE_MAX_WORK_GROUP_SIZE */
+    bool          nativeDouble;     /* Specifies whether device supports double precision float */
+    bool          nativeComplex;    /* Specifies whether device supports complex float */
+    TargetDevice* tdev;
+} DeviceInfo;
+
+
+typedef enum Dimensions{
+    DIMARRAY_SMALL,
+    DIMARRAY_SHORT,
+    DIMARRAY_MIDDLE,
+    DIMARRAY_BIG,
+    DIMARRAY_HUGE,
+    DIMARRAY_BANK_CONFLICT,
+    DIMARRAYCOUNT,      //
+}Dimensions;
+
+//
+struct SubDimInfo;
+struct BlasFunctionInfo;
+struct BlasPatternInfo;
+struct BlasExtraInfo;
+struct MatrixInfo;
+
+typedef enum SynchStatus
+{
+    SS_NOLOAD,
+    SS_CORRECT_DATA,
+    SS_INCORRECT_DATA,
+}SynchStatus;
+
+
+typedef struct BlasParamInfo
+{
+    int             dim;
+    SubproblemDim   sDim[MAX_SUBDIMS];
+    PGranularity    pGran;
+
+    OFFSET          kernel[MAX_CLBLAS_KERNELS_PER_STEP];
+    unsigned int    kSize[MAX_CLBLAS_KERNELS_PER_STEP];
+    double          time;
+
+    OFFSET      offset;
+    size_t      size;
+    SynchStatus sstatus;
+} BlasParamInfo;
+
+
+typedef struct BlasExtraInfo
+{
+    struct BlasPatternInfo* parent;
+
+    unsigned int      numParam;
+    DataType          dtype;
+    KernelExtraFlags  flags;
+    unsigned int      vecLen;
+    bool              isUseForTunning;
+
+    BlasParamInfo* param;
+
+    OFFSET      offset;
+    size_t      size;
+    SynchStatus sstatus;
+} BlasExtraInfo;
+
+typedef struct BlasPatternInfo
+{
+    struct BlasFunctionInfo* parent;
+
+    unsigned int   numExtra;
+    unsigned int   numTuneExtra;
+    BlasExtraInfo* extra;
+    const char   * name;
+
+    OFFSET      offset;
+    size_t      size;
+    SynchStatus sstatus;
+
+    unsigned int pattNo;
+    bool (*isPGValid) (struct SubDimInfo* sdi);
+    void (*initSubdim)(struct SubDimInfo* sdi);
+
+} BlasPatternInfo;
+
+typedef struct BlasFunctionInfo
+{
+    unsigned int      numPatterns;
+    int               funcNo;
+    unsigned int      maskForTuningsKernel;
+    unsigned int      maskForUniqueKernels;
+    const char*       envImplementation;
+    int               defaultPattern;
+    const char*       name;
+    //
+
+    bool (*isValidFlag) (DataType curType, unsigned int  flags);
+    void (*initFunctionInfo) (struct BlasFunctionInfo* bFunc);
+    void (*initKNM) (struct MatrixInfo*, unsigned int baseDim);
+
+    BlasPatternInfo   pattInfo[MEMPAT_PER_BLASFN];
+    MemoryPattern     pattern[MEMPAT_PER_BLASFN];
+
+} BlasFunctionInfo;
+
+
+typedef struct StorageCacheImpl
+{
+    char* fpath;
+    char* fpath_tmp;
+    bool isInit;     //
+    bool isPopulate; // The cache has been initialized,
+                     // but does not contain data
+    BlasFunctionInfo functionInfo[BLAS_FUNCTIONS_NUMBER];
+    DeviceIdent  devIdent;
+
+    OFFSET endFile;
+} StorageCacheImpl;
+
+/*
+ * The 'force' argument set to true means returning a cache object even
+ * if the file on disk doesn't exist
+ */
+StorageCacheImpl* getStorageCache(TargetDevice* devID, bool force);
+
+BlasParamInfo*  findParam(StorageCacheImpl* cache,
+                          const char* pattName, const DataType dt,
+                          const KernelExtraFlags kflag, int dim);
+
+void loadKernelsFromFile(StorageCacheImpl* cache, BlasParamInfo* bParam,
+                unsigned char** buffer, size_t* sizeBuffer);
+
+void loadDataFromFile(StorageCacheImpl* cache);
+
+char * createFullPatch(const char * name, bool tmp);
+
+OFFSET calcOffset(BlasFunctionInfo* functionInfo);
+
+BlasPatternInfo * getPatternInfo(StorageCacheImpl* cache, unsigned int func,
+                                 unsigned int patt);
+
+void  nextPattern(StorageCacheImpl* cache, unsigned int* func,
+                  unsigned int* patt);
+void saveBestParam(TargetDevice* tdev, BlasParamInfo* bParam);
+
+unsigned int getDimension(int idx, DataType dt, DeviceInfo* di, int func);
+bool initReadingData(StorageCacheImpl* cacheImpl, TargetDevice* devID );
+void initBlasFuncionData(BlasFunctionInfo* fInfo);
+void initCacheData (BlasFunctionInfo* bFunc, DeviceInfo* defInfo);
+void initCLDeviceInfoRec(TargetDevice* tdev, DeviceInfo *devInfo);
+void destroyData(BlasFunctionInfo* fInfo);
+
+#endif /* STORAGEDATA_H_ */
diff --git a/src/library/tools/tune/storage_init.c b/src/library/tools/tune/storage_init.c
new file mode 100644
index 0000000..adefb60
--- /dev/null
+++ b/src/library/tools/tune/storage_init.c
@@ -0,0 +1,202 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include "storage_data.h"
+
+/*
+ * Fill in the GEMM descriptor: name, implementation variable name,
+ * memory patterns, default pattern index and kernel flag masks.
+ */
+void
+initGemm(BlasFunctionInfo* bFunc)
+{
+    bFunc->name = "GEMM";
+    bFunc->envImplementation = "AMD_CLBLAS_GEMM_IMPLEMENTATION";
+
+    bFunc->numPatterns = initGemmMemPatterns(bFunc->pattern);
+    // GEMM uses the last but one registered pattern as its default
+    bFunc->defaultPattern = bFunc->numPatterns - 2;
+
+    bFunc->maskForTuningsKernel =
+        KEXTRA_TRANS_A | KEXTRA_TRANS_B | KEXTRA_COLUMN_MAJOR;
+
+    // the unique-kernel mask is the tuning mask plus three more flags
+    bFunc->maskForUniqueKernels =
+        bFunc->maskForTuningsKernel
+        | KEXTRA_CONJUGATE_A | KEXTRA_CONJUGATE_B | KEXTRA_BETA_ZERO;
+}
+
+/*
+ * Fill in the TRMM descriptor: name, implementation variable name,
+ * memory patterns, default pattern index and kernel flag masks.
+ */
+void
+initTrmm(BlasFunctionInfo* bFunc)
+{
+    bFunc->name = "TRMM";
+    bFunc->envImplementation = "AMD_CLBLAS_TRMM_IMPLEMENTATION";
+
+    bFunc->numPatterns = initTrmmMemPatterns(bFunc->pattern);
+    // the last registered pattern is the default one
+    bFunc->defaultPattern = bFunc->numPatterns - 1;
+
+    bFunc->maskForTuningsKernel =
+        KEXTRA_TRANS_A | KEXTRA_UPPER_TRIANG
+        | KEXTRA_SIDE_RIGHT | KEXTRA_COLUMN_MAJOR;
+
+    // the unique-kernel mask is the tuning mask plus two more flags
+    bFunc->maskForUniqueKernels =
+        bFunc->maskForTuningsKernel
+        | KEXTRA_CONJUGATE_A | KEXTRA_UNIT_DIAGONAL;
+}
+
+/*
+ * Fill in the TRSM descriptor: name, implementation variable name,
+ * memory patterns, default pattern index and kernel flag masks.
+ */
+void
+initTrsm(BlasFunctionInfo* bFunc)
+{
+    bFunc->name = "TRSM";
+    bFunc->envImplementation = "AMD_CLBLAS_TRSM_IMPLEMENTATION";
+
+    bFunc->numPatterns = initTrsmMemPatterns(bFunc->pattern);
+    // FIXME Correct, when adding a new pattern will not lead to corrupt it.
+    // Don't create a partition for the new TRSM pattern: when three
+    // patterns are reported only the first two are kept.
+    if (bFunc->numPatterns == 3) {
+        bFunc->numPatterns = 2;
+    }
+    // the last retained pattern is the default one
+    bFunc->defaultPattern = bFunc->numPatterns - 1;
+
+    bFunc->maskForTuningsKernel =
+        KEXTRA_TRANS_A | KEXTRA_UPPER_TRIANG
+        | KEXTRA_SIDE_RIGHT | KEXTRA_COLUMN_MAJOR;
+
+    // the unique-kernel mask is the tuning mask plus two more flags
+    bFunc->maskForUniqueKernels =
+        bFunc->maskForTuningsKernel
+        | KEXTRA_CONJUGATE_A | KEXTRA_UNIT_DIAGONAL;
+}
+
+/*
+ * Fill in the GEMV descriptor: name, memory patterns, default pattern
+ * index and kernel flag masks. No implementation variable is set.
+ */
+void
+initGemv(BlasFunctionInfo* bFunc)
+{
+    bFunc->name = "GEMV";
+    bFunc->envImplementation = NULL;
+
+    bFunc->numPatterns = initGemvMemPatterns(bFunc->pattern);
+    // the last registered pattern is the default one
+    bFunc->defaultPattern = bFunc->numPatterns - 1;
+
+    bFunc->maskForTuningsKernel =
+        KEXTRA_TRANS_A | KEXTRA_COLUMN_MAJOR | KEXTRA_UPPER_TRIANG;
+
+    // the unique-kernel mask is the tuning mask plus three more flags
+    bFunc->maskForUniqueKernels =
+        bFunc->maskForTuningsKernel
+        | KEXTRA_BETA_ZERO | KEXTRA_INCX_ONE | KEXTRA_INCY_ONE;
+}
+
+/*
+ * Fill in the SYMV descriptor: name, memory patterns, default pattern
+ * index and kernel flag masks. No implementation variable is set.
+ */
+void
+initSymv(BlasFunctionInfo* bFunc)
+{
+    bFunc->name = "SYMV";
+    bFunc->envImplementation = NULL;
+
+    bFunc->numPatterns = initSymvMemPatterns(bFunc->pattern);
+    // the last registered pattern is the default one
+    bFunc->defaultPattern = bFunc->numPatterns - 1;
+
+    bFunc->maskForTuningsKernel = KEXTRA_COLUMN_MAJOR | KEXTRA_UPPER_TRIANG;
+
+    // the unique-kernel mask is the tuning mask plus three more flags
+    bFunc->maskForUniqueKernels =
+        bFunc->maskForTuningsKernel
+        | KEXTRA_BETA_ZERO | KEXTRA_INCX_ONE | KEXTRA_INCY_ONE;
+}
+
+/*
+ * Fill in the SYR2K descriptor: name, memory patterns, default pattern
+ * index and kernel flag masks. No implementation variable is set.
+ */
+void
+initSyr2k(BlasFunctionInfo* bFunc)
+{
+    bFunc->name = "SYR2K";
+    bFunc->envImplementation = NULL;
+
+    bFunc->numPatterns = initSyr2kMemPatterns(bFunc->pattern);
+    // the last registered pattern is the default one
+    bFunc->defaultPattern = bFunc->numPatterns - 1;
+
+    // Only these two flags take part in the mask; the conjugate, TRANS_B,
+    // UPPER_TRIANG, SIDE_RIGHT, TAILS_*, BETA_ZERO and NO_COPY_VEC_* flags
+    // are not included.
+    bFunc->maskForTuningsKernel = KEXTRA_TRANS_A | KEXTRA_COLUMN_MAJOR;
+
+    // both masks are identical for this function
+    bFunc->maskForUniqueKernels = bFunc->maskForTuningsKernel;
+}
+
+/*
+ * Fill in the SYRK descriptor: name, memory patterns, default pattern
+ * index and kernel flag masks. No implementation variable is set.
+ */
+void
+initSyrk(BlasFunctionInfo* bFunc)
+{
+    bFunc->name = "SYRK";
+    bFunc->envImplementation = NULL;
+
+    bFunc->numPatterns = initSyrkMemPatterns(bFunc->pattern);
+    // the last registered pattern is the default one
+    bFunc->defaultPattern = bFunc->numPatterns - 1;
+
+    // Only these two flags take part in the mask; the conjugate, TRANS_B,
+    // UPPER_TRIANG, SIDE_RIGHT, TAILS_*, BETA_ZERO and NO_COPY_VEC_* flags
+    // are not included.
+    bFunc->maskForTuningsKernel = KEXTRA_TRANS_A | KEXTRA_COLUMN_MAJOR;
+
+    // both masks are identical for this function
+    bFunc->maskForUniqueKernels = bFunc->maskForTuningsKernel;
+}
+
+/*
+ * Clear all BLAS_FUNCTIONS_NUMBER descriptors in 'fInfo' and register the
+ * initialization callback of every supported function.
+ */
+void
+initBlasFuncionData(BlasFunctionInfo* fInfo)
+{
+    memset(fInfo, 0, sizeof(*fInfo) * BLAS_FUNCTIONS_NUMBER);
+
+    fInfo[CLBLAS_GEMM].initFunctionInfo  = initGemm;
+    fInfo[CLBLAS_TRMM].initFunctionInfo  = initTrmm;
+    fInfo[CLBLAS_TRSM].initFunctionInfo  = initTrsm;
+    fInfo[CLBLAS_GEMV].initFunctionInfo  = initGemv;
+    fInfo[CLBLAS_SYMV].initFunctionInfo  = initSymv;
+    fInfo[CLBLAS_SYR2K].initFunctionInfo = initSyr2k;
+    fInfo[CLBLAS_SYRK].initFunctionInfo  = initSyrk;
+}
diff --git a/src/library/tools/tune/storage_io.c b/src/library/tools/tune/storage_io.c
new file mode 100644
index 0000000..4d9dd37
--- /dev/null
+++ b/src/library/tools/tune/storage_io.c
@@ -0,0 +1,751 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+#include <malloc.h>
+#include <string.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include "fileio.h"
+#include "storage_data.h"
+
+// Value stored in the file in place of SUBDIM_UNUSED for itemX/itemY;
+// loadParamData() maps any stored value >= this back to SUBDIM_UNUSED.
+#define  SUBDIM_UNUSED_FILE_VALUE 10000
+// Environment variable read by createFullPatch() to locate the storage file
+const char *ENV_FILE_PATH = "AMD_CLBLAS_STORAGE_PATH";
+// Identifier written at the start of the file header by saveHeader()
+const char *FileID  = "CBS";
+// Extensions of the storage file and of its temporary copy
+const char *FileExt = "kdb";
+const char *FileExtTmp = "kdb.tmp";
+// Version number written into the file header by saveHeader()
+const int fileVersion = 3;
+
+/*
+ * Helper for findPattern(): the first 'matched' characters of 'name' have
+ * just been matched and the next input byte 'c' differs from name[matched].
+ * Returns the length of the longest prefix of 'name' that is a suffix of
+ * the matched text followed by 'c', i.e. the state the search resumes from.
+ */
+static int
+findPatternResume(const char* name, int matched, char c)
+{
+    int k;
+
+    for (k = matched; k > 0; --k) {
+        if (name[k - 1] == c &&
+            memcmp(name, name + matched - k + 1, (size_t)(k - 1)) == 0) {
+            return k;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Scan the file, starting at position 1, for the byte sequence 'name'.
+ *
+ * On success the file is positioned sizeof(unsigned int) bytes before the
+ * match, which is where savePatternHeader() stores the name length, and a
+ * non-zero value is returned. Returns 0 when the name is not found or when
+ * 'name' is NULL or empty.
+ *
+ * A match may span two consecutive buffers: the matcher state is kept
+ * across reads and the position is computed relative to the start of the
+ * buffer in which the match ended.
+ */
+POSFILE
+findPattern(HfInfo* file, const char* name)
+{
+    enum { FIND_BUF_SIZE = 1024 * 64 };
+    char  buffer[FIND_BUF_SIZE];
+    POSFILE fpos = 0;
+    int ib;
+    int in = 0;
+    int bufRead;
+    int nameLen;
+
+    if (name == NULL || name[0] == '\0') {
+        return 0;
+    }
+    nameLen = (int)strlen(name);
+
+    hfJump(file, 1);
+
+    do {
+        hfGetCurentPosition(file, &fpos);
+        bufRead = hfReadWithoutCRC(file, buffer, FIND_BUF_SIZE);
+        for (ib = 0; ib < bufRead; ++ib) {
+            if (name[in] == buffer[ib]) {
+                in++;
+            }
+            else {
+                // Do not simply restart from zero: the current byte may
+                // continue a shorter prefix of the name (e.g. searching
+                // "aab" in "aaab").
+                in = findPatternResume(name, in, buffer[ib]);
+            }
+
+            if (in >= nameLen) {
+                // All terms are converted to POSFILE so that a match
+                // starting in the previous buffer (negative relative
+                // offset) is computed correctly regardless of the widths
+                // of int and size_t.
+                fpos = fpos + (POSFILE)ib - (POSFILE)nameLen + 1
+                       - (POSFILE)sizeof(unsigned int);
+                hfJump(file, fpos);
+                return 1;
+            }
+        }
+    } while (bufRead == FIND_BUF_SIZE);
+
+    return 0;
+}
+
+/*
+ * Returns true when 'status' is FILE_OK and the current position of
+ * 'file' equals 'pos2'; false otherwise.
+ */
+bool
+checkFile(HfInfo* file, size_t pos2, int status)
+{
+    POSFILE current;
+
+    hfGetCurentPosition(file, &current);
+    return (status == FILE_OK) && (current == (POSFILE)pos2);
+}
+
+// PATTERN
+/*
+ * Record the position and size of a pattern header and advance '*offset'
+ * past it. The header consists of the name length, the name including its
+ * terminating zero, the extra count and the CRC.
+ */
+void
+calcPatternOffset(BlasPatternInfo*  bPatt, POSFILE* offset)
+{
+    const unsigned int nameBytes = (unsigned int)strlen(bPatt->name) + 1;
+
+    bPatt->size = sizeof(nameBytes) + nameBytes
+                + sizeof(bPatt->numExtra) + sizeof(TYPECRC);
+
+    bPatt->offset = (OFFSET)*offset;
+    *offset += (POSFILE)bPatt->size;
+}
+
+// PARAM
+/*
+ * Record the position and size of a parameter record and advance '*offset'
+ * past it. The record holds five unsigned ints per subdimension, the
+ * granularity, the kernel positions, the kernel sizes, the time and the CRC.
+ */
+void
+calcParamOffset(BlasParamInfo* bParam, POSFILE* offset)
+{
+    bParam->size = sizeof(unsigned int) * 5 * MAX_SUBDIMS
+                 + sizeof(PGranularity)
+                 + sizeof(POSFILE) * MAX_CLBLAS_KERNELS_PER_STEP
+                 + sizeof(bParam->kSize)
+                 + sizeof(double)
+                 + sizeof(TYPECRC);
+
+    bParam->offset = (OFFSET)*offset;
+    *offset += (POSFILE)bParam->size;
+}
+
+/*
+ * Read one parameter record from the current file position into 'bParam'.
+ *
+ * Returns 0 when the record was read and accepted, 1 when the record is
+ * treated as holding no actual data, and -1 when it is treated as
+ * corrupted. For any non-zero result the subdimensions, granularity,
+ * kernel positions and kernel sizes are zeroed and the time is set to a
+ * very large value.
+ */
+int
+loadParamData(HfInfo* file, BlasParamInfo* bParam)
+{
+    int status = 0;
+    int i = 0;
+    int ret = 0;
+    bool dimExist = true;
+
+    // Subdimensions are stored as five unsigned ints each, in the same
+    // order in which saveParamData() writes them.
+    for (i =0; i < MAX_SUBDIMS; i++){
+        unsigned int temp;
+        status+= hfRead(file, &temp, 1, sizeof(temp));
+        bParam->sDim[i].x = (size_t)temp;
+        status+= hfRead(file, &temp, 1, sizeof(temp));
+        bParam->sDim[i].y = (size_t)temp;
+        status+= hfRead(file, &temp, 1, sizeof(temp));
+        // stored values >= SUBDIM_UNUSED_FILE_VALUE stand for SUBDIM_UNUSED
+        bParam->sDim[i].itemX = (temp >= SUBDIM_UNUSED_FILE_VALUE)
+            ? SUBDIM_UNUSED
+            : (size_t)temp;
+        status+= hfRead(file, &temp, 1, sizeof(temp));
+        bParam->sDim[i].itemY = (temp >= SUBDIM_UNUSED_FILE_VALUE)
+            ? SUBDIM_UNUSED
+            : (size_t)temp;
+        status+= hfRead(file, &temp, 1, sizeof(temp));
+        bParam->sDim[i].bwidth = (size_t)temp;
+
+    }
+
+    status += hfRead(file, &bParam->pGran, 1, sizeof(PGranularity));
+    status += hfRead(file, bParam->kernel, 1, sizeof(POSFILE) * MAX_CLBLAS_KERNELS_PER_STEP);
+    status += hfRead(file, bParam->kSize,  1, sizeof(bParam->kSize));
+    status += hfRead(file, &bParam->time,  1, sizeof(double) );
+
+    // All reads succeeded but the first subdimension has y == 0: the
+    // record is considered not to contain dimensions.
+    if ((status == FILE_OK) && (bParam->sDim[0].y == 0)) {
+        dimExist = false;
+    }
+
+    status += hfCheckCRC(file);
+
+    if (!dimExist && (status == FILE_ERROR_CRC)) {
+        ret = 1;    // file is valid but doesn't have actual data
+    }
+    // the record must end exactly at offset + size with no accumulated error
+    else if (!checkFile(file, (size_t)bParam->offset + bParam->size, status)) {
+        ret = -1;   // file is corrupted
+    }
+    // NOTE(review): times above 10000.0 are rejected like an empty record;
+    // presumably such values mark untuned entries - confirm
+    else if (bParam->time > 10000.0) {
+        ret = 1;
+    }
+
+    if (ret) {
+        memset(bParam->sDim, 0, sizeof(SubproblemDim) * MAX_SUBDIMS);
+        memset(&bParam->pGran, 0, sizeof(PGranularity) );
+        memset(bParam->kernel, 0, sizeof(POSFILE) * MAX_CLBLAS_KERNELS_PER_STEP );
+        memset(bParam->kSize, 0, sizeof(unsigned int) * MAX_CLBLAS_KERNELS_PER_STEP );
+
+        bParam->time = 1e50; // any large number;
+    }
+
+    return ret;
+}
+
+// EXTRA DATA
+
+/*
+ * Record the position and size of an extra-data header and advance
+ * '*offset' past it. The header holds three unsigned ints (data type,
+ * flags, parameter count, as written by saveExtraHeader()) and the CRC.
+ */
+void
+calcExtraOffset(BlasExtraInfo* bExtra, POSFILE* offset)
+{
+    bExtra->size  = 3 * sizeof(unsigned int);
+    bExtra->size += sizeof(TYPECRC);
+    bExtra->offset = (OFFSET)*offset;
+    // '*offset' is a POSFILE: convert the size to that type, as
+    // calcPatternOffset() and calcParamOffset() do.
+    *offset += (POSFILE)bExtra->size;
+}
+
+/*
+ * Read 'numParam' parameter records from the file into bExtra->param.
+ *
+ * A parameter is marked SS_CORRECT_DATA only while no record read so far
+ * has failed; 'bExtra' itself is marked SS_CORRECT_DATA when every record
+ * was read successfully.
+ *
+ * Returns true when all records were read successfully. Returns false when
+ * bExtra->param is NULL, when 'numParam' exceeds bExtra->numParam (nothing
+ * is read in that case), or when any record failed to load.
+ */
+bool
+readExtraData(
+    HfInfo* file,
+    BlasExtraInfo*  bExtra,
+    int numParam)
+{
+    int param;
+    bool failed = false;
+
+    if (bExtra->param == NULL)
+        return false;
+
+    // 'numParam' comes from the file: never index past the number of
+    // parameter records the in-memory structure is iterated with.
+    if (numParam < 0 || (unsigned int)numParam > bExtra->numParam) {
+        return false;
+    }
+
+    for (param = 0; param < numParam; ++param) {
+        BlasParamInfo* bpi = &bExtra->param[param];
+
+        // loadParamData() returns 1 or -1 on failure; track failure with
+        // a flag so that opposite codes cannot cancel each other out.
+        if (loadParamData(file, bpi) != 0) {
+            failed = true;
+        }
+        if (!failed) {
+            bpi->sstatus = SS_CORRECT_DATA;
+        }
+    }
+
+    if (!failed) {
+        bExtra->sstatus = SS_CORRECT_DATA;
+    }
+    return !failed;
+}
+
+/*
+ * Read a pattern header (name length, name, extra count, CRC) from the
+ * current file position.
+ *
+ * On return '*name' is either NULL or a buffer allocated with malloc()
+ * that the caller must free, whatever the return value. The previous
+ * value of '*name' is overwritten, not freed. When the length cannot be
+ * read or the buffer cannot be allocated, '*name' is NULL and '*len' and
+ * '*numExtra' are set to 0.
+ *
+ * Returns true when all reads and the CRC check succeeded.
+ */
+bool
+loadPatternDataFromFile(
+    HfInfo * file,
+    char** name,
+    unsigned int* len,
+    unsigned int* numExtra)
+{
+    int  status;
+
+    *name = NULL;
+
+    // Do not size the allocation from a length that was not actually read.
+    status = hfRead(file, len, 1, sizeof(*len));
+    if (status != FILE_OK) {
+        *len = 0;
+        *numExtra = 0;
+        return false;
+    }
+
+    *name = malloc((*len) * sizeof(char));
+    if (*name == NULL) {
+        *len = 0;
+        *numExtra = 0;
+        return false;
+    }
+
+    status += hfRead(file, *name, 1, *len);
+    status += hfRead(file, numExtra, 1, sizeof(unsigned int));
+    status += hfCheckCRC (file);
+
+    return status == FILE_OK;
+}
+
+
+/*
+ * Read an extra-data header (data type, flags, parameter count) followed
+ * by its CRC. Returns the sum of the status codes of the four operations.
+ */
+int
+readExtaDataHeader (
+    HfInfo * file,
+    unsigned int* dtype,
+    unsigned int* flags,
+    unsigned int* numParam)
+{
+    int rc = hfRead(file, dtype, 1, sizeof(unsigned int));
+
+    rc += hfRead(file, flags, 1, sizeof(unsigned int));
+    rc += hfRead(file, numParam, 1, sizeof(unsigned int));
+    rc += hfCheckCRC(file);
+
+    return rc;
+}
+
+/*
+ * Read the extra-data records of one pattern from the file.
+ *
+ * 'numExtra' is the number of extra records announced by the pattern
+ * header in the file. Each header is read in turn; when its data type and
+ * flags equal those of the in-memory entry at the same index, its
+ * parameter records are loaded through readExtraData().
+ *
+ * Always returns true.
+ *
+ * NOTE(review): 'ied' indexes bPatt->extra using the count from the file
+ * and is not checked against bPatt->numExtra - confirm the two always agree.
+ * NOTE(review): when data type or flags differ, the parameter records of
+ * that extra are not consumed here - confirm this is intended.
+ */
+bool
+readPatternData(
+		HfInfo* file,
+		BlasPatternInfo*  bPatt,
+		int numExtra)
+{
+    unsigned int dtype;
+    unsigned int flags;
+    unsigned int numParam;
+	int  ief = 0;
+	int  ied = 0;
+	int ret;
+	POSFILE extraSize = 0;
+
+	// Distance between two consecutive extra records, used to skip a
+	// record whose header cannot be read. It stays 0 unless more than two
+	// extras are announced.
+	// NOTE(review): with numExtra <= 2 the skip below jumps back to the
+	// failed header itself - confirm whether '>= 2' was meant.
+	if (numExtra > 2) {
+		extraSize = bPatt->extra[1].offset - bPatt->extra[0].offset;
+	}
+
+	for (ief = 0; ief < numExtra; ++ief) {
+		BlasExtraInfo* bExtra = &bPatt->extra[ied];
+		POSFILE curPos;
+
+		ied++;
+		hfGetCurentPosition(file, &curPos);
+		ret = readExtaDataHeader(file, &dtype, &flags, &numParam);
+		if (ret != FILE_OK) {
+			// unreadable header: move on by one record size
+			hfJump(file, curPos + extraSize);
+			continue;
+		}
+		bExtra->sstatus	= SS_CORRECT_DATA;
+		if ((bExtra->dtype == dtype) &&
+			(bExtra->flags == flags)) {
+			readExtraData(file, bExtra, numParam);
+		}
+		else {
+
+		}
+	}
+
+
+	return true;
+}
+
+/*
+ * Read the file header: file identifier, version, function count, a file
+ * position and the CRC. Returns the version read from the file, or 0 when
+ * any of the operations reported an error.
+ */
+int
+loadHeader(HfInfo* file)
+{
+    int version;
+    unsigned blasFunctionNumber;
+    POSFILE posFile;
+    int rc;
+
+    rc  = hfReadConst(file, FileID, strlen(FileID));
+    rc += hfRead(file, &version, 1, sizeof(version));
+    rc += hfRead(file, &blasFunctionNumber, 1, sizeof(blasFunctionNumber));
+    rc += hfRead(file, &posFile, 1, sizeof(posFile));
+    rc += hfCheckCRC(file);
+
+    if (rc != 0) {
+        return 0;
+    }
+    return version;
+}
+
+/*
+ * Write the file header: file identifier, file version, function count,
+ * the 'binData' position and the CRC. The status codes of the individual
+ * writes are not reported to the caller.
+ */
+void
+saveHeader(HfInfo* file, unsigned int blasFunctionNumber, POSFILE binData)
+{
+    (void)hfWrite(file, FileID, strlen(FileID));
+    (void)hfWrite(file, &fileVersion, sizeof(fileVersion));
+    (void)hfWrite(file, &blasFunctionNumber, sizeof(blasFunctionNumber));
+    (void)hfWrite(file, &binData, sizeof(binData));
+    (void)hfWriteCRC(file);
+}
+
+/*
+ * Walk every pattern, extra and parameter record of all functions and
+ * report whether any of them has a zero offset.
+ * Returns true when at least one zero offset was found.
+ */
+bool
+checkOffset(BlasFunctionInfo* functionInfo)
+{
+    unsigned int f;
+    unsigned int p;
+    unsigned int e;
+    unsigned int q;
+    bool zeroFound = false;
+
+    for (f = 0; f < BLAS_FUNCTIONS_NUMBER; ++f) {
+        BlasFunctionInfo* bFunc = &functionInfo[f];
+
+        for (p = 0; p < bFunc->numPatterns; ++p) {
+            BlasPatternInfo* bPatt = &bFunc->pattInfo[p];
+
+            if (bPatt->offset == 0) {
+                zeroFound = true;
+            }
+            for (e = 0; e < bPatt->numExtra; ++e) {
+                BlasExtraInfo* bExtra = &bPatt->extra[e];
+
+                if (bExtra->offset == 0) {
+                    zeroFound = true;
+                }
+                for (q = 0; q < bExtra->numParam; ++q) {
+                    if (bExtra->param[q].offset == 0) {
+                        zeroFound = true;
+                    }
+                }
+            }
+        }
+    }
+    return zeroFound;
+}
+
+/*
+ * Helper for loadDataFromFile(): true when the name read from the file
+ * ('fileLen' bytes including the terminating zero, as written by
+ * savePatternHeader()) equals 'pattName'. The lengths are compared first
+ * so that no more than strlen(pattName) + 1 bytes of 'pattName' are read.
+ */
+static bool
+storedNameMatches(const char* fileName, unsigned int fileLen,
+                  const char* pattName)
+{
+    if (fileName == NULL || pattName == NULL) {
+        return false;
+    }
+    if ((size_t)fileLen != strlen(pattName) + 1) {
+        return false;
+    }
+    return memcmp(fileName, pattName, fileLen) == 0;
+}
+
+/*
+ * Load the stored pattern data from cache->fpath into cache->functionInfo.
+ *
+ * When the file is not found, cache->isPopulate is set to false and
+ * nothing else happens. Otherwise the file is read in two passes:
+ *  1. pattern headers are read sequentially and matched by name against
+ *     the in-memory patterns, starting from the last matched one;
+ *  2. every pattern still marked SS_NOLOAD is searched for by name in the
+ *     whole file and loaded if found.
+ * Finally cache->isPopulate is set to true.
+ */
+void
+loadDataFromFile(StorageCacheImpl* cache)
+{
+    bool structIsCorrect;
+    char* name = NULL;
+    unsigned int nameLen = 0;
+    unsigned int numExtra = 0;
+    unsigned int curFunc = 0;
+    unsigned int curPatt = 0;
+    unsigned int func;
+    unsigned int patt;
+    HfInfo file;
+
+    if ( hfOpenRead(&file, cache->fpath) == FILE_NOT_FOUND ) {
+        cache->isPopulate = false;
+        return;
+    }
+
+    // Read file Header
+    loadHeader(&file);
+
+    // Read pattern header
+    structIsCorrect = loadPatternDataFromFile(&file, &name, &nameLen,
+            &numExtra);
+
+    while (structIsCorrect)
+    {
+        unsigned int f = curFunc;
+        unsigned int p = curPatt;
+        BlasPatternInfo* bPatt = getPatternInfo(cache, f, p);
+
+        // look for the in-memory pattern carrying the name just read
+        while (bPatt != NULL && !storedNameMatches(name, nameLen, bPatt->name)) {
+            nextPattern(cache, &f, &p);
+            bPatt = getPatternInfo(cache, f, p);
+        }
+
+        if (bPatt != NULL) {
+            bool ret;
+
+            bPatt->sstatus = SS_CORRECT_DATA;
+
+            // Read pattern data
+            ret = readPatternData(&file, bPatt, numExtra);
+
+            // go to next pattern
+            nextPattern(cache, &f, &p);
+            // if the pattern was read with an error or not completely,
+            // continue from the recorded position of the next pattern
+            if (!ret) {
+                bPatt = getPatternInfo(cache, f, p);
+                if (bPatt != NULL) {
+                    hfJump(&file, bPatt->offset);
+                }
+            }
+
+            curFunc = f;
+            curPatt = p;
+        }
+        free(name);
+        name = NULL;
+        structIsCorrect = loadPatternDataFromFile(&file, &name, &nameLen,
+                &numExtra);
+    }
+
+    for (func = 0; func < BLAS_FUNCTIONS_NUMBER; ++func) {
+        BlasFunctionInfo* bFunc = &cache->functionInfo[func];
+
+        for (patt = 0; patt < bFunc->numPatterns; ++patt) {
+            BlasPatternInfo* bPatt = &bFunc->pattInfo[patt];
+
+            if (bPatt->sstatus != SS_NOLOAD) {
+                continue;
+            }
+            if (findPattern(&file, bPatt->name) == 0) {
+                continue;
+            }
+
+            // release the buffer of the previous header before the
+            // pointer is overwritten
+            free(name);
+            name = NULL;
+            // only read the pattern data when its header was read
+            // correctly, so that 'numExtra' is meaningful
+            if (loadPatternDataFromFile(&file, &name, &nameLen, &numExtra)) {
+                readPatternData(&file, bPatt, numExtra);
+            }
+        }
+    }
+
+    free(name);
+    cache->isPopulate = true;
+    hfClose(&file);
+    checkOffset(cache->functionInfo);
+}
+
+/*
+ * Build the full path of the storage file 'name' inside the directory
+ * given by the AMD_CLBLAS_STORAGE_PATH environment variable. 'tmp' selects
+ * the temporary file extension instead of the regular one.
+ * Returns NULL when the environment variable is not set.
+ */
+char *
+createFullPatch(const char * name, bool tmp)
+{
+    const char * ext;
+    char* dir = getenv(ENV_FILE_PATH);
+
+    if (tmp) {
+        ext = FileExtTmp;
+    }
+    else {
+        ext = FileExt;
+    }
+
+    if (dir != NULL) {
+        return hfCreateFullPatch(dir, name, ext);
+    }
+    return NULL;
+}
+
+/*
+ * Lay out the storage file: assign offset and size to every pattern
+ * header, extra header and parameter record, in file order, starting
+ * right after the file header. Returns the position following the last
+ * record.
+ */
+OFFSET
+calcOffset(BlasFunctionInfo* functionInfo)
+{
+    unsigned int f;
+    unsigned int p;
+    unsigned int e;
+    unsigned int q;
+    POSFILE pos;
+
+    // size of the file header, field by field as written by saveHeader()
+    pos = (POSFILE)strlen(FileID)   // file identifier
+        + sizeof(int)               // version
+        + sizeof(unsigned int)      // function count
+        + sizeof(POSFILE)           // file position stored in the header
+        + sizeof(TYPECRC);
+
+    for (f = 0; f < BLAS_FUNCTIONS_NUMBER; ++f) {
+        BlasFunctionInfo* bFunc = &functionInfo[f];
+
+        for (p = 0; p < bFunc->numPatterns; ++p) {
+            BlasPatternInfo* bPatt = &bFunc->pattInfo[p];
+
+            calcPatternOffset(bPatt, &pos);
+            for (e = 0; e < bPatt->numExtra; ++e) {
+                BlasExtraInfo* bExtra = &bPatt->extra[e];
+
+                calcExtraOffset(bExtra, &pos);
+                for (q = 0; q < bExtra->numParam; ++q) {
+                    calcParamOffset(&bExtra->param[q], &pos);
+                }
+            }
+        }
+    }
+    return (OFFSET)pos;
+}
+
+
+/*
+ * Load the kernels referenced by 'bParam' from 'file'.
+ *
+ * For every kernel slot k (MAX_CLBLAS_KERNELS_PER_STEP slots), buffer[k]
+ * receives a malloc()'ed copy of the kernel and sizeBuffer[k] its size.
+ * A slot that is empty, cannot be allocated, or fails to read or to pass
+ * the CRC check gets buffer[k] == NULL and sizeBuffer[k] == 0. The caller
+ * owns the returned buffers and releases them with free().
+ */
+void
+loadKernelData(
+    HfInfo* file,
+    BlasParamInfo* bParam,
+    unsigned char** buffer,
+    size_t* sizeBuffer)
+{
+    int k;
+
+    for (k = 0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) {
+        // The status is evaluated per slot: a successful earlier slot
+        // must not make an empty later slot look loaded.
+        int status = FILE_ERROR_READ_DATA;
+
+        buffer[k] = NULL;
+        sizeBuffer[k] = bParam->kSize[k];
+
+        if (sizeBuffer[k] != 0 && bParam->kernel[k] != 0) {
+            buffer[k] = malloc(sizeBuffer[k]);
+
+            if (buffer[k] != NULL) {
+                hfJump(file, bParam->kernel[k]);
+                status = hfRead(file, buffer[k], 1, sizeBuffer[k]);
+                if (status == FILE_OK) {
+                    status = hfCheckCRC(file);
+                }
+            }
+        }
+
+        if (status != FILE_OK)
+        {
+            // free(NULL) is a no-op, so this also covers the empty slot
+            free(buffer[k]);
+            buffer[k] = NULL;
+            sizeBuffer[k] = 0;
+        }
+    }
+}
+
+/*
+ * Open the storage file cache->fpath and load the kernels referenced by
+ * 'bParam' into 'buffer' / 'sizeBuffer' (MAX_CLBLAS_KERNELS_PER_STEP
+ * entries each). When the file cannot be opened every entry is set to
+ * NULL / 0.
+ */
+void
+loadKernelsFromFile(
+    StorageCacheImpl* cache,
+    BlasParamInfo* bParam,
+    unsigned char** buffer,
+    size_t* sizeBuffer)
+{
+    HfInfo file;
+
+    if (hfOpenRead(&file, cache->fpath) != FILE_OK) {
+        int k;
+
+        // never read through a file that was not opened; hand back
+        // well-defined empty results instead
+        for (k = 0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) {
+            buffer[k] = NULL;
+            sizeBuffer[k] = 0;
+        }
+        return;
+    }
+
+    loadKernelData(&file, bParam, buffer, sizeBuffer);
+    hfClose(&file);
+}
+
+
+/*
+ * Append the kernels in 'buffer' to 'file'. Each kernel is written at
+ * cacheImpl->endFile as its size (a size_t), its bytes and a CRC;
+ * cacheImpl->endFile is moved to the resulting file position only when
+ * every operation for that kernel reported FILE_OK.
+ */
+void
+saveKernelData (
+    StorageCacheImpl* cacheImpl,
+    HfInfo* file,
+    unsigned char** buffer,
+    size_t* sizeBuffer)
+{
+    unsigned int k;
+
+    for (k = 0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) {
+        POSFILE pos = cacheImpl->endFile;
+        int rc = hfJump(file, pos);
+
+        rc += hfWrite(file, &sizeBuffer[k], sizeof(size_t));
+        rc += hfWrite(file, buffer[k], sizeBuffer[k]);
+        rc += hfWriteCRC(file);
+        rc += hfGetCurentPosition(file, &pos);
+
+        if (rc != FILE_OK) {
+            continue;
+        }
+        cacheImpl->endFile = (OFFSET)pos;
+    }
+}
+
+/*
+ * Copy the kernels referenced by 'bParam' from 'oldfile' to the end of
+ * 'newfile' (see saveKernelData()). Always returns false.
+ */
+bool
+copyKernalData(
+    StorageCacheImpl* cacheImpl,
+    HfInfo* oldfile,
+    HfInfo* newfile,
+    BlasParamInfo* bParam)
+{
+    int k;
+    // Start from NULL / 0 so that every entry handed to free() below is
+    // well defined even if the loader leaves a slot untouched.
+    unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP] = { NULL };
+    size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP] = { 0 };
+
+    loadKernelData(oldfile, bParam, buffer, sizeBuffer);
+    saveKernelData(cacheImpl, newfile, buffer, sizeBuffer);
+
+    for (k = 0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) {
+        free(buffer[k]);
+    }
+    return false;
+}
+
+/*
+ * Write the parameter record 'bParam' at bParam->offset.
+ *
+ * Each subdimension is stored as five unsigned ints (x, y, itemX, itemY,
+ * bwidth), with SUBDIM_UNUSED stored as SUBDIM_UNUSED_FILE_VALUE; the
+ * granularity, kernel positions, kernel sizes, time and CRC follow.
+ *
+ * Returns true when every operation reported FILE_OK and the file ends up
+ * exactly at bParam->offset + bParam->size.
+ */
+bool
+saveParamData (HfInfo* file, BlasParamInfo* bParam)
+{
+    int  status;
+    int  i;
+
+    status = hfJump(file, bParam->offset);
+    for (i =0; i < MAX_SUBDIMS; i++){
+        unsigned int temp;
+
+        temp = (unsigned int)bParam->sDim[i].x;
+        status+= hfWrite(file, &temp, sizeof(temp));
+
+        temp = (unsigned int)bParam->sDim[i].y;
+        status+= hfWrite(file, &temp, sizeof(temp));
+
+        temp = (bParam->sDim[i].itemX == SUBDIM_UNUSED)
+            ? SUBDIM_UNUSED_FILE_VALUE
+            : (unsigned int)bParam->sDim[i].itemX;
+        status+= hfWrite(file, &temp, sizeof(temp));
+
+        temp = (bParam->sDim[i].itemY == SUBDIM_UNUSED)
+            ? SUBDIM_UNUSED_FILE_VALUE
+            : (unsigned int)bParam->sDim[i].itemY;
+        status+= hfWrite(file, &temp, sizeof(temp));
+
+        temp = (unsigned int)bParam->sDim[i].bwidth;
+        status+= hfWrite(file, &temp, sizeof(temp));
+    }
+
+    status += hfWrite(file, &bParam->pGran, sizeof(PGranularity));
+    status += hfWrite(file, bParam->kernel,
+            sizeof(POSFILE)*MAX_CLBLAS_KERNELS_PER_STEP);
+    status += hfWrite(file, bParam->kSize,  sizeof(bParam->kSize));
+    status += hfWrite(file, &bParam->time,   sizeof(double));
+    status += hfWriteCRC(file);
+
+    // The expected end position is passed as size_t, like in
+    // saveExtraHeader() and savePatternHeader(); narrowing it to
+    // unsigned int would truncate large positions.
+    return checkFile(file, (size_t)bParam->offset + bParam->size, status);
+}
+
+/*
+ * Write the extra-data header of 'bExtra' (data type, flags, parameter
+ * count, CRC) at bExtra->offset. Returns true when every operation
+ * reported FILE_OK and the file ends up at bExtra->offset + bExtra->size.
+ */
+bool
+saveExtraHeader(HfInfo* file, BlasExtraInfo* bExtra)
+{
+    unsigned int typeValue = (unsigned int)bExtra->dtype;
+    unsigned int flagValue = (unsigned int)bExtra->flags;
+    int rc;
+
+    rc  = hfJump(file, bExtra->offset);
+    rc += hfWrite(file, &typeValue, sizeof(unsigned int));
+    rc += hfWrite(file, &flagValue, sizeof(unsigned int));
+    rc += hfWrite(file, &bExtra->numParam, sizeof(unsigned int));
+    rc += hfWriteCRC(file);
+
+    return checkFile(file, (size_t)bExtra->offset + bExtra->size, rc);
+}
+
+
+/*
+ * Write the pattern header of 'bPatt' at bPatt->offset: name length
+ * (including the terminating zero), the name, the extra count and the CRC.
+ * Returns true when every operation reported FILE_OK and the file ends up
+ * at bPatt->offset + bPatt->size.
+ */
+bool
+savePatternHeader(HfInfo* file, BlasPatternInfo*  bPatt)
+{
+    unsigned int nameBytes = (unsigned int)strlen(bPatt->name) + 1;
+    int rc;
+
+    rc  = hfJump(file, bPatt->offset);
+    rc += hfWrite(file, &nameBytes, sizeof(nameBytes));
+    rc += hfWrite(file, bPatt->name, nameBytes);
+    rc += hfWrite(file, &bPatt->numExtra, sizeof(bPatt->numExtra));
+    rc += hfWriteCRC(file);
+
+    return checkFile(file, (size_t)bPatt->offset + bPatt->size, rc);
+}
+
+/*
+ * Print a message for the file error code 'i' concerning 'filename' to
+ * stdout. Codes without a message print nothing; stdout is flushed in
+ * every case.
+ */
+static void
+printErrorMessage (int i, const char* filename)
+{
+    if (i == FILE_NOT_FOUND) {
+        printf("File \'%s\' not found\n", filename);
+    }
+    else if (i == FILE_ERROR_CRC || i == FILE_ERROR_INDALID_KERNAL_SIZE) {
+        printf("File \'%s\' is corrupted.\n", filename);
+    }
+    else if (i == FILE_ERROR_OPEN_FOR_WRITING) {
+        printf("Can't open file \'%s\' for writing.\n", filename);
+    }
+    else if (i == FILE_ERROR_BUFFER_MISMATCH) {
+        printf("Out of memory to read the file \'%s\'.\n", filename);
+    }
+    fflush(stdout);
+}
+
+/*
+ * Rewrite the storage file of the device 'tdev'.
+ *
+ * The file header and all pattern headers, extra headers and parameter
+ * records of the cache are written to the temporary file
+ * cache->fpath_tmp, which then replaces cache->fpath. Finally the storage
+ * cache is destroyed and initialized again.
+ *
+ * The process exits with code 2 when either file cannot be opened for
+ * writing.
+ */
+void
+writeStorageCache(TargetDevice* tdev)
+{
+	int func;
+	unsigned int patt;
+	unsigned int extra;
+	unsigned int param;
+	int fret;
+	HfInfo outfile;
+    HfInfo infile;
+
+    StorageCacheImpl* cache = getStorageCache(tdev, true);
+
+    // Open file for save
+    // NOTE(review): 'infile' is opened for writing on the existing storage
+    // path and only closed again below; nothing is read from or written
+    // to it here - confirm the intent.
+    fret = hfOpenWrite(&infile, cache->fpath);
+    if (fret) {
+        printErrorMessage(fret, cache->fpath);
+        exit(2);
+    }
+    fret = hfOpenWrite(&outfile, cache->fpath_tmp);
+    if (fret) {
+        printErrorMessage(fret, cache->fpath_tmp);
+        exit(2);
+    }
+
+    saveHeader(&outfile, BLAS_FUNCTIONS_NUMBER, 0);
+
+    // For each function
+    for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func) {
+        BlasFunctionInfo* bFunc = &cache->functionInfo[func];
+
+        // For each pattern
+        for (patt =0; patt < bFunc->numPatterns; ++ patt){
+            BlasPatternInfo* bPatt = &bFunc->pattInfo[patt];
+
+            // Save pattern header
+            savePatternHeader(&outfile, bPatt);
+
+            // For each extra record of the pattern
+            for (extra =0; extra < bPatt->numExtra; ++ extra){
+                BlasExtraInfo* bExtra = &bPatt->extra[extra];
+
+                saveExtraHeader(&outfile, bExtra);
+
+                // For each parameter record of the extra
+                for (param =0; param < bExtra->numParam; ++param){
+                    BlasParamInfo* bParam = &bExtra->param[param];
+
+                    saveParamData(&outfile, bParam);
+                }
+            }
+        }
+    }
+    hfClose(&infile);
+    hfClose(&outfile);
+
+    // Replace the storage file with the temporary one. The rename is
+    // attempted only when the old file was removed; the final value of
+    // 'fret' is not checked.
+    fret = remove(cache->fpath);
+    if (fret == 0) {
+        fret = rename(cache->fpath_tmp, cache->fpath);
+    }
+
+    // Re-init storage cache
+    destroyStorageCache ();
+    initStorageCache();
+}
+
+/*
+ * Save the best parameter found while tuning a subproblem dimension.
+ * The record is written in place, at the position reserved for it in
+ * advance (bParam->offset), and 'bParam' is marked SS_CORRECT_DATA when
+ * the storage file could be opened.
+ *
+ * Nothing is done when no storage cache is available for 'tdev' or when
+ * 'bParam' is NULL.
+ */
+void
+saveBestParam(TargetDevice* tdev, BlasParamInfo* bParam)
+{
+    HfInfo file;
+    int    status;
+    StorageCacheImpl* cache;
+
+    // The cache is requested without 'force', so there may be no cache
+    // object to work with.
+    cache = getStorageCache(tdev, false);
+    if (cache == NULL || bParam == NULL) {
+        return;
+    }
+
+    hfInit(&file);
+    status = hfOpenReWrite(&file, cache->fpath);
+    if (status == FILE_OK) {
+        POSFILE pos = bParam->offset;
+        hfJump(&file, pos);
+        saveParamData(&file, bParam);
+        bParam->sstatus = SS_CORRECT_DATA;
+    }
+    hfClose(&file);
+}
+
+
diff --git a/src/library/tools/tune/subdim.c b/src/library/tools/tune/subdim.c
new file mode 100644
index 0000000..37ead33
--- /dev/null
+++ b/src/library/tools/tune/subdim.c
@@ -0,0 +1,768 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>
+#include <assert.h>
+
+#include <blas_mempat.h>
+#include <blas_kgen.h>
+#include <clblas-internal.h>
+#include <kern_cache.h>
+#include <blas_funcs.h>
+
+#include "fileio.h"
+#include "toolslib.h"
+
+#include "tune.h"
+#include "subdim.h"
+#include <math.h>
+
+#if defined(_MSC_VER)
+#define fmin min
+#define fmax max
+#endif
+
+/*
+ * True when checkMatrixMemLevelSet() reports CLMEM_LEVEL_LDS for matrix A
+ * or for matrix B of the given pattern.
+ */
+#define isLdsUsed(pattern)                                          \
+    (checkMatrixMemLevelSet(pattern, MATRIX_A, CLMEM_LEVEL_LDS) ||  \
+     checkMatrixMemLevelSet(pattern, MATRIX_B, CLMEM_LEVEL_LDS))
+
+/*
+ * Element size used by the tuner for a data type: 4 for TYPE_FLOAT,
+ * 8 for TYPE_DOUBLE and TYPE_COMPLEX_FLOAT, 16 for TYPE_COMPLEX_DOUBLE.
+ * Any other value yields 0.
+ */
+int VISIBILITY_HIDDEN
+getDataTypeSize(DataType dataType)
+{
+    switch (dataType) {
+    case TYPE_FLOAT:
+        return 4;
+    case TYPE_DOUBLE:
+    case TYPE_COMPLEX_FLOAT:
+        return 8;
+    case TYPE_COMPLEX_DOUBLE:
+        return 16;
+    default:
+        return 0;
+    }
+}
+/*
+ * Check whether the decomposition currently held in sd (sd->sdim and
+ * sd->pgran) is acceptable.
+ *
+ * When the memory pattern provides sops->checkCalcDecomp, the decision is
+ * delegated to it entirely (called with PGRAN_CHECK). Otherwise the built-in
+ * checks below are applied, followed by the pattern's optional isFitToLDS
+ * check and the list of explicitly ignored combinations (sd->first).
+ *
+ * Returns true when the decomposition passes, false otherwise.
+ */
+bool VISIBILITY_HIDDEN
+isSubDimValid(SubDimInfo* sd)
+{
+    int j;
+    size_t wgX = sd->pgran.wgSize[0];
+    size_t wgY = sd->pgran.wgSize[1];
+    SubproblemDim l0 = sd->sdim[0];
+    SubproblemDim l1 = sd->sdim[1];
+    size_t dataTypeSize = getDataTypeSize(sd->dtype);
+    size_t dataFloatSize = getDataTypeSize(TYPE_FLOAT);
+    int maxRegistr = 64;
+    bool ret = true;
+    bool inv;
+    IgnoreItem* ii = sd->first;
+
+    // if pattern-based validation is available
+    if( NULL != sd->pattern->sops->checkCalcDecomp ){
+
+        return sd->pattern->sops->checkCalcDecomp(
+            &sd->pgran,
+            sd->sdim,
+            2,
+            sd->dtype,
+            PGRAN_CHECK );
+    }
+
+    // Lower bound on l1.y: 4 for TYPE_FLOAT, 2 for the 8-byte types,
+    // 1 for TYPE_COMPLEX_DOUBLE.
+    // NOTE(review): dataTypeSize is 0 for a dtype getDataTypeSize() does not
+    // list, which would make this a division by zero.
+    ret = ret && (l1.y >= 4*dataFloatSize/dataTypeSize);
+
+    if (sd->blasLevel == 3) {
+        if (!isMatrixAccessColMaj(sd->func, sd->flag, MATRIX_A) ||
+                !isMatrixAccessColMaj(sd->func, sd->flag, MATRIX_B)) {
+            /* Avoid small bwidth and big x0, y0 for cases other than
+             * column major access to both matrixes */
+            ret = ret && (l1.bwidth >= 4*dataFloatSize/dataTypeSize);
+            ret = ret && (l0.y < 128);
+            ret = ret && (l0.x < 128);
+        }
+    }
+
+    // A zero l1.bwidth is rejected outright; this also guards the modulo below.
+    if ( 0 == l1.bwidth ){
+        return false;
+    }
+    else{
+        ret = ret && ((l0.bwidth % l1.bwidth) == 0);
+        // The work group must contain exactly 64 work items.
+        ret = ret && (wgX*wgY == 64);
+    }
+    //ret = ret && (wgX*wgY < sd->workGroupSizes);
+    //ret = ret && (wgX*wgY > 16);
+    if (sd->blasLevel == 2) {
+        ret = ret && (l0.y > l1.y);
+    }
+    else {
+        ret = ret && (l0.x > l1.x);
+        ret = ret && (l0.y > l1.y);
+        ret = ret && (l1.x >= 4*dataFloatSize/dataTypeSize);
+    }
+    if (sd->is2D) {
+        // r remembers the verdict so far; return at once if one of the two
+        // work-group/tile consistency checks is what turned it to false.
+        bool r = ret;
+        ret = ret && (wgY * l1.itemX == l0.x);
+        ret = ret && (wgX * l1.itemY == l0.y);
+        if (r != ret) {
+            return ret;
+        }
+    }
+
+    if (ret && sd->isSquareBlock) {
+        ret = ret &&  (l0.x == l0.y && l0.x == l0.bwidth);
+    }
+
+    //if (!(isLdsUsed(sd->pattern) || (sd->isSquareBlock && sd->nrLevel == 2))) {
+    //    ret = ret &&  l0.bwidth == l1.bwidth;
+    //}
+
+    if (ret) {
+        // Estimate of the level-1 tiles (x*bwidth, y*bwidth and x*y elements)
+        // expressed in cl_float4 units; reject when above maxRegistr.
+        int r ;
+        r = (int)(l1.x*l1.bwidth + l1.y*l1.bwidth + l1.x*l1.y);
+
+        r = r * (int)dataTypeSize / sizeof(cl_float4);
+
+        if (r > maxRegistr) {
+            return false;
+        }
+    }
+
+    // Optional pattern-specific check that the decomposition fits into LDS.
+    if  (ret &&  sd->pattern->sops->isFitToLDS != NULL) {
+        bool isFitToLDS;
+        CLBlasKargs args;
+
+        convKExtraFlagToArg(sd->flag, &args);
+
+        isFitToLDS = sd->pattern->sops->isFitToLDS(sd->sdim, sd->dtype,
+                                               sd->ldsSize, &args);
+        if (!isFitToLDS)
+            return false;
+    }
+
+    // Reject combinations registered in the ignore list. An item matches when
+    // each of its entries is either -1 or equal to the current value of the
+    // corresponding variable.
+    for (;ii != NULL; ii = ii->next) {
+        inv = true;
+        for(j = 0; j < V_COUNT; ++j) {
+            int v1 = ii->var[j];
+            int v2 = get(&sd->var[j]);
+            if (v1 == -1) {
+                continue;
+            }
+            if (v1 == v2) {
+                continue;
+            }
+            inv = false;
+            break;
+        }
+        if (inv) {
+            ret = false;
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Register a sub-dimension combination that must be skipped.
+ *
+ * A new IgnoreItem holding the six given values is pushed on the front of
+ * sdi's ignore list. isSubDimValid() compares each entry with the current
+ * value of the corresponding variable and treats -1 as "matches anything".
+ *
+ * If the item cannot be allocated the list is left unchanged.
+ */
+void VISIBILITY_HIDDEN
+setInvalid(SubDimInfo* sdi, int l0x, int l0y, int l0w,
+                            int l1x, int l1y, int l1w)
+{
+    IgnoreItem* ii = malloc(sizeof(*ii));
+
+    if (ii == NULL) {
+        /* Out of memory: nothing to register, do not touch the list. */
+        return;
+    }
+
+    ii->var[V_L0_X]  = l0x;
+    ii->var[V_L0_Y]  = l0y;
+    ii->var[V_L0_BW] = l0w;
+    ii->var[V_L1_X]  = l1x;
+    ii->var[V_L1_Y]  = l1y;
+    ii->var[V_L1_BW] = l1w;
+    ii->next = sdi->first;
+    sdi->first = ii;
+}
+
+/*
+ * Default initialiser of the candidate lists: every variable receives a
+ * window (count, first index) into a table of powers of two. One set of
+ * windows is used for BLAS level 2, another for everything else.
+ */
+void VISIBILITY_HIDDEN
+initVector(SubDimInfo* sd)
+{
+    /* 2^0 .. 2^12 */
+    int pow2[] = {1,2,4,8,16,32,64,128,256,512,1024,2048, 4096};
+    /* {count, first index} per variable, in SubDimVariable order */
+    static const int window[2][V_COUNT][2] = {
+        /* blasLevel == 2 */
+        { {1, 0}, {6, 4}, {10, 0}, {1, 0}, {6, 1}, {6, 0} },
+        /* any other level */
+        { {4, 4}, {4, 4}, {6, 0},  {6, 0}, {6, 0}, {6, 0} }
+    };
+    const int row = (sd->blasLevel == 2) ? 0 : 1;
+    int v;
+
+    for (v = 0; v < V_COUNT; ++v) {
+        setVariable(sd, (SubDimVariable)v, window[row][v][0],
+                    &pow2[window[row][v][1]]);
+    }
+}
+
+/*
+ * Derive the K, N and M sizes from baseDim.
+ * For BLAS level 2: K = 1 and N = M = 2 * baseDim.
+ * Otherwise all three are baseDim.
+ */
+void VISIBILITY_HIDDEN
+initKNMVector(
+        SubDimInfo* sd,
+        unsigned int baseDim,
+        unsigned int* K,
+        unsigned int* N,
+        unsigned int* M
+        )
+{
+    const bool level2 = (sd->blasLevel == 2);
+
+    *K = level2 ? 1 : baseDim;
+    *N = level2 ? baseDim * 2 : baseDim;
+    *M = level2 ? baseDim * 2 : baseDim;
+}
+
+/* Value currently selected in the item: data[curId]. */
+int VISIBILITY_HIDDEN
+get(SubDimItem* sd)
+{
+    const int selected = sd->curId;
+
+    return sd->data[selected];
+}
+
+/*
+ * Fill sd->pgran for the current sd->sdim.
+ *
+ * wgDim is preset to 2 and wfSize to 64. If the pattern supplies
+ * sops->checkCalcDecomp it is called with PGRAN_CALC and decides the rest.
+ * Otherwise the work group sizes are the level-0 extents divided by the
+ * level-1 item sizes, collapsed to one dimension for non-2D patterns.
+ */
+void VISIBILITY_HIDDEN
+calcPGranularity (SubDimInfo* sd)
+{
+    PGranularity* pg = &sd->pgran;
+    SubproblemDim* lvl = sd->sdim;
+
+    pg->wgDim = 2;
+    pg->wfSize = 64;
+
+    /* the pattern's own calculation takes precedence */
+    if (sd->pattern->sops->checkCalcDecomp != NULL) {
+        sd->pattern->sops->checkCalcDecomp(pg, lvl, 2, sd->dtype, PGRAN_CALC);
+        return;
+    }
+
+    pg->wgSize[1] = (unsigned int)(lvl[0].x / lvl[1].itemX);
+    pg->wgSize[0] = (unsigned int)(lvl[0].y / lvl[1].itemY);
+
+    if (sd->is2D) {
+        return;
+    }
+
+    /* one-dimensional work space: fold both sizes into dimension 0 */
+    pg->wgDim = 1;
+    pg->wgSize[0] *= pg->wgSize[1];
+    pg->wgSize[1] = 1;
+}
+
+/*
+ * Rebuild sd->sdim from the currently selected value of every variable and
+ * then recompute the granularity (calcPGranularity).
+ *
+ * The level-1 bwidth is divided by (element size / float size). For BLAS
+ * level 2 the x extents are forced to 1 and, when the pattern has no
+ * checkCalcDecomp, level-0 bwidth becomes level-1 bwidth times the former
+ * x ratio.
+ */
+void VISIBILITY_HIDDEN
+calcParam(SubDimInfo* sd)
+{
+    SubproblemDim* lvl = sd->sdim;
+    int elemSize = getDataTypeSize(sd->dtype);
+    int floatSize;
+    int l0x, l0y, l1x, l1y;
+
+    memset(lvl, 0, sizeof(sd->sdim));
+
+    /* level 0 */
+    l0x = get(&sd->var[V_L0_X]);
+    l0y = get(&sd->var[V_L0_Y]);
+    lvl[0].x      = l0x;
+    lvl[0].itemX  = l0x;
+    lvl[0].y      = l0y;
+    lvl[0].itemY  = l0y;
+    lvl[0].bwidth = get(&sd->var[V_L0_BW]);
+
+    /* level 1 */
+    l1x = get(&sd->var[V_L1_X]);
+    l1y = get(&sd->var[V_L1_Y]);
+    lvl[1].x      = l1x;
+    lvl[1].itemX  = l1x;
+    lvl[1].y      = l1y;
+    lvl[1].itemY  = l1y;
+    floatSize = getDataTypeSize(TYPE_FLOAT);
+    lvl[1].bwidth = get(&sd->var[V_L1_BW]) / (elemSize / floatSize);
+
+    if (funcHasTriangMatrix((BlasFunctionID)sd->func) && !sd->is2D) {
+        lvl[0].itemY = SUBDIM_UNUSED;
+    }
+
+    if (sd->blasLevel == 2) {
+        /* ratio must be taken before the x extents are overwritten */
+        size_t ratioX = lvl[0].x / lvl[1].x;
+
+        lvl[0].x = 1;
+        lvl[1].itemX = 1;
+        lvl[1].x = 1;
+        if (sd->pattern->sops->checkCalcDecomp == NULL) {
+            lvl[0].bwidth = lvl[1].bwidth * ratioX;
+        }
+    }
+
+    calcPGranularity(sd);
+}
+
+/*
+ * Advance the combination of variables by one step, odometer style: the
+ * last variable is incremented first and a wrap to 0 carries into the
+ * previous one.
+ *
+ * Returns true only when every variable wrapped, i.e. the enumeration is
+ * back at the all-zero combination.
+ */
+bool VISIBILITY_HIDDEN
+next(SubDimItem var[V_COUNT])
+{
+    int pos;
+
+    for (pos = V_COUNT - 1; pos >= 0; --pos) {
+        var[pos].curId++;
+        if (var[pos].curId < var[pos].maxId) {
+            /* no carry needed */
+            return false;
+        }
+        var[pos].curId = 0;
+    }
+    return true;
+}
+
+/*
+ * Starting from the current combination, step forward until sd->isValid
+ * accepts one or the enumeration wraps around. sd->valid holds the result.
+ */
+void VISIBILITY_HIDDEN
+findValidSubdimInit(SubDimInfo* sd)
+{
+    for (;;) {
+        calcParam(sd);
+        sd->valid = sd->isValid(sd);
+        if (sd->valid) {
+            break;
+        }
+        /* rejected: try the following combination, stop on wrap-around */
+        if (next(sd->var)) {
+            break;
+        }
+    }
+}
+
+/*
+ * Move to the next valid combination of sub-dimension variables.
+ *
+ * Increments sd->count on every call. Returns false immediately when the
+ * current state is already invalid; otherwise (when sd->init is set) steps
+ * forward and searches for the next valid combination. Returns sd->valid.
+ *
+ * Debug guard: more than 500 calls since the last reset terminate the
+ * process. abort() is used instead of a write through a null pointer, which
+ * is undefined behaviour and may be removed by the compiler.
+ */
+bool
+nextSubdimElem(SubDimInfo* sd)
+{
+    // !!! DEBUG
+    if (sd->count > 500) {
+        abort();
+    }
+
+    sd->count ++;
+    if (sd->valid == false) {
+        return false;
+    }
+
+    if (sd->init != NULL) {
+        sd->valid = false;
+        /* next() returns true on wrap-around: nothing left to try then */
+        if (!next(sd->var)) {
+            findValidSubdimInit(sd);
+        }
+    }
+    return sd->valid;
+}
+
+/*
+ * Tell whether a variant belongs to a group: every group entry must be
+ * either -1 (any value) or equal to the variant's entry.
+ */
+bool
+isMemberOfGroup(GroupStatInfo* gsi,  Variant* vi)
+{
+    int j;
+
+    for (j = 0; j < V_COUNT; ++j) {
+        const int wanted = gsi->var[j];
+
+        if (wanted != -1 && vi->var[j] != wanted) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/*
+ * Estimate the minimum run time to expect inside a group, from the
+ * statistics gathered so far (gsi->minTime, gsi->allTime, gsi->count,
+ * gsi->allCount).
+ *
+ * Returns K_GLOBAL * kdelta * ki * gsi->minTime.
+ * The caller in this file only invokes it after incrementing gsi->count, so
+ * the division by i below sees i >= 1 there.
+ */
+
+double
+calcMinExpectedTimeForGroup(GroupStatInfo* gsi)
+{
+    /*
+     * K_INCREASE - expected range of time values in the group
+     * K_GLOBAL   - constant factor applied to the final estimate
+     */
+    const double K_INCREASE = 1.5;
+    const double K_GLOBAL = 0.97;
+
+    /* Number of variants in the group */
+    double m = gsi->allCount;
+    /* Number of variants in the group whose time has been measured */
+    double i = gsi->count;
+
+    /*
+     * ki - factor depending on how many members have been measured.
+     * NOTE(review): the original notes said ki is K_INCREASE for i == 1 and
+     * 1 for i == m. The expression does not reduce to those values exactly
+     * (for i == 1 it evaluates to 1/(0.5 + m/(m+1))) - confirm the intended
+     * formula.
+     */
+
+    double ki = 1/ ((K_INCREASE + K_INCREASE/(m+i) -1)/(i) + (m-K_INCREASE)/(m+1));
+    /* allTime is divided by the total member count m, not by i */
+    double averageTime = (gsi->allTime / m);
+
+    /*
+     * kdelta - factor depending on the spread of the measured values:
+     * 3*minTime / (2*minTime + averageTime)
+     */
+
+    double kdelta = (gsi->minTime*3)/((gsi->minTime*2) + averageTime);
+    double t = K_GLOBAL * kdelta * ki * gsi->minTime;
+
+    /*
+     * The estimate is returned as is; no comparison with other groups is
+     * made here.
+     */
+    return t;
+}
+
+/*
+ * Record the time measured for the current variant and select the next
+ * variant to try.
+ *
+ * sd       - tuner state
+ * maxParam - upper limit for sd->count; once reached the function returns
+ *            false
+ * time     - time measured for the variant selected by the previous call
+ *
+ * When sd->returnAll is set the variants are simply enumerated with
+ * nextSubdimElem(). Otherwise group statistics are rebuilt from all measured
+ * variants (time > 0), every unmeasured variant (time == 0) gets a weight,
+ * and the unmeasured variant with the greatest weight (at least 0.01) is
+ * selected: sd->curVar / sd->curVarID are updated, the variables are
+ * positioned on it and sd->sdim / sd->pgran are recomputed.
+ *
+ * Returns true when a new variant has been selected, false when the limit
+ * is reached or nothing is left to select.
+ */
+bool
+nextSubdim(SubDimInfo* sd, int maxParam, double time)
+{
+    int i;
+    int j;
+    double minW = -5000;
+    int vari = 0;
+    double midTime;
+    int iCount = 0;
+    double maxTime;
+    const int MAX_WEIGHT = 99;
+
+    Variant* v0 = sd->curVar;   // Current variant
+    Variant* varNext = NULL;    // Next Variant
+
+    if (sd->count >= maxParam) {
+        return false;
+    }
+
+    // Plain enumeration mode: no statistics, just step to the next element.
+    if (sd->returnAll) {
+        bool ret = nextSubdimElem (sd);
+        calcParam(sd);
+        sd->curVarID = sd->count;
+        return ret;
+    }
+
+    v0->time = time;
+    sd->sumTime += time;
+
+    // Mean of the times reported so far.
+    midTime = sd->sumTime/(sd->count + 1);
+
+    if (time > 0)  {
+        sd->minTime = fmin(sd->minTime, (float)time);
+    }
+
+    // Cap applied to each measurement when it is added to a group's allTime.
+    maxTime = fmax(2.1*midTime - sd->minTime,  sd->minTime*5);
+
+    /* Initialize all groups */
+    for (j = 0; j < sd->infoCount; j++ ) {
+        GroupStatInfo* si = &sd->info[j];
+        si->allTime = 0;
+        si->count   = 0;
+        si->minTime = 1e9;
+    }
+
+    /* Calculate an estimate for the groups */
+    for (i = 0; i < sd->varCount; ++i) {
+        Variant* vi = &sd->allVariant[i];
+        /* If time for variant is measured*/
+        if (vi->time > 0) {
+            for (j = 0; j < sd->infoCount; j++ ) {
+                GroupStatInfo* gsi = &sd->info[j];
+                // For each group this variant is a member of.
+                // Note that gsi->minTime is replaced by the estimate right
+                // away, so the next member is compared with the estimate.
+                if (isMemberOfGroup(gsi, vi)) {
+                    gsi->minTime = fmin(gsi->minTime, vi->time);
+                    gsi->allTime += fmin(vi->time, maxTime);
+                    gsi->count ++;
+                    gsi->minTime = calcMinExpectedTimeForGroup(gsi);
+                }
+           }
+        }
+        vi->minTime = 0;
+        vi->maxTime = 5000;
+        vi->weight  = MAX_WEIGHT;
+    }
+
+    /*
+     * Estimate the run time and weight of every unmeasured variant
+     */
+    for (i = 0; i < sd->varCount; ++i) {
+        Variant* vi = &sd->allVariant[i];
+
+        vi->weight = MAX_WEIGHT;
+        if (vi->time == 0) {
+            double kgroup = 1.0;
+
+            for (j = 0; j < sd->infoCount; j++ ) {
+                GroupStatInfo* gsi = &sd->info[j];
+                // if the variant is a member of the group
+                if (isMemberOfGroup(gsi, vi)) {
+                    if (gsi->count > 0) {
+                        // Largest group estimate wins; the weight is the
+                        // best measured time relative to that estimate.
+                        vi->minTime = fmax(vi->minTime, gsi->minTime);
+                        vi->weight  = sd->minTime/vi->minTime;
+                    }
+                    else {
+                        /*
+                         * The group has no measured member yet:
+                         * raise the weight and lower the estimated time
+                         */
+                        kgroup *= 1.1;
+                    }
+                }
+            }
+            vi->weight *= kgroup;
+            vi->minTime /= kgroup;
+        }
+    }
+
+    /* Find the unmeasured variant with the greatest weight */
+
+    for (i = 0; i < sd->varCount; ++i)
+    {
+        Variant* vi = &sd->allVariant[i];
+        if (vi->time == 0 && vi->weight >= 0.01 ) {
+            iCount ++;
+
+            if (minW < vi->weight) {
+                minW = vi->weight;
+                varNext = vi;
+                vari = i;
+            }
+        }
+    }
+
+    // No candidate left
+    if (varNext == NULL) {
+        return false;
+    }
+
+    sd->curVar =  varNext;
+    sd->curVarID = vari;
+#ifdef TEST_LOG
+    printf ("%4d %6.2f [%6.2f:%5.2f ]",iCount, sd->minTime,
+            sd->curVar->minTime, sd->curVar->weight);
+#endif
+
+    // Variant::var holds indices (curId values), not dimension values.
+    for(j = 0; j < V_COUNT; ++j) {
+        sd->var[j].curId = varNext->var[j];
+    }
+
+    calcParam(sd);
+    sd->count++;
+    return true;
+}
+
+/*
+ * Rewind the enumeration: all variables back to index 0, step counter to 0.
+ * When an init callback is set it is run and the first valid combination is
+ * searched for; one is asserted to exist.
+ */
+void
+resetSubdim(SubDimInfo* sd)
+{
+    int v;
+
+    for (v = 0; v < V_COUNT; ++v) {
+        sd->var[v].curId = 0;
+    }
+    sd->count = 0;
+    sd->valid = false;
+
+    if (sd->init == NULL) {
+        return;
+    }
+
+    sd->init(sd);
+    findValidSubdimInit(sd);
+
+    assert(sd->valid);
+}
+
+/*
+ * Build one set of groups and distribute all variants over it.
+ *
+ * Each of l0x .. l1w selects whether the corresponding variable takes part
+ * in the grouping (non-zero) or is ignored (0). Variants that agree on all
+ * participating variables fall into the same group.
+ *
+ * Example: l0x = 1 and all other selectors 0. If l0x takes the values
+ * 16, 32 and 64 across the variants, three groups are created and every
+ * variant is counted in exactly one of them.
+ *
+ * Within one set every variant belongs to exactly one group; a variant
+ * belongs to one group of every set. A set may consist of a single group.
+ *
+ * The group array grows when it is full. If it cannot be grown the function
+ * returns early and the remaining variants are not counted in this set.
+ *
+ * pg is not used.
+ */
+
+void setGroup(SubDimInfo* sd,
+         int l0x, int l0y, int l0w,
+         int l1x, int l1y, int l1w,
+         int pg)
+{
+    int i, j, k;
+    int use[V_COUNT];
+    int start = sd->infoCount;
+    int end   = sd->infoCount;
+
+    (void) pg;
+
+    use[V_L0_X]  = l0x;
+    use[V_L0_Y]  = l0y;
+    use[V_L0_BW] = l0w;
+    use[V_L1_X]  = l1x;
+    use[V_L1_Y]  = l1y;
+    use[V_L1_BW] = l1w;
+
+    // For each variant
+    for (i = 0; i < sd->varCount; ++i) {
+        Variant* vi = &sd->allVariant[i];
+        int  id = -1;
+
+        // Look for a matching group among those created by this call
+        for (j = start; j < end && id == -1; j++ ) {
+            bool match = true;
+
+            for (k = 0; k < V_COUNT; ++k) {
+                if (use[k] != 0 && vi->var[k] != sd->info[j].var[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            if (match) {
+                id = j;
+            }
+        }
+
+        if (id != -1) {
+            sd->info[id].allCount++;
+            continue;
+        }
+
+        /*
+         * The variant does not belong to any group yet: create a new one.
+         * Make sure there is room for it first.
+         */
+        if (sd->info == NULL || end >= sd->infoMaxCount) {
+            int newMax = (sd->infoMaxCount > 0) ? sd->infoMaxCount * 2 : 16;
+            GroupStatInfo* grown =
+                realloc(sd->info, (size_t)newMax * sizeof(GroupStatInfo));
+
+            if (grown == NULL) {
+                return;
+            }
+            sd->info = grown;
+            sd->infoMaxCount = newMax;
+        }
+
+        for (k = 0; k < V_COUNT; ++k) {
+            sd->info[end].var[k] = (use[k] == 1) ? vi->var[k] : -1;
+        }
+        sd->info[end].pg = 0;
+
+        sd->info[end].allTime = 0;
+        sd->info[end].allCount = 1;
+
+        end++;
+        sd->infoCount++;
+    }
+}
+
+/*
+ * Initialise the tuner state for one function / pattern / data type / flag
+ * combination.
+ *
+ * The structure is zeroed, the properties derived from the memory pattern
+ * and the device are stored, all valid combinations of the variables are
+ * enumerated into sd->allVariant, and four sets of groups are created with
+ * setGroup().
+ *
+ * NOTE(review): neither malloc() result below is checked, and the function
+ * has no way to report a failure to its caller.
+ */
+void
+initSubDimInfo(SubDimInfo* sd,
+               MemoryPattern* mempatt,
+               DeviceInfo* devinfo,
+               unsigned int func,
+               unsigned int patt,
+               DataType dtype,
+               KernelExtraFlags flag)
+{
+    int i = 0;
+
+    memset(sd, 0, sizeof(SubDimInfo));
+
+    sd->func = func;
+    sd->patt = patt;
+    sd->dtype = dtype;
+    sd->flag  = flag;
+    sd->pattern = mempatt;
+    sd->first = NULL;
+
+    sd->is2D  = (sd->pattern->sops->getFlags() & SF_WSPACE_2D)?true:false;
+    sd->isSquareBlock = ((sd->pattern->sops->getFlags() &
+                          SF_TOP_INPUT_SQUARE_BLOCKS) != 0);
+    sd->blasLevel = funcBlasLevel(sd->func);
+    sd->nrLevel   = sd->pattern->nrLevels;
+
+    sd->ldsSize = devinfo->ldsSize;
+    sd->workGroupSizes = devinfo->workGroupSizes;
+
+    // Callbacks used by resetSubdim() and findValidSubdimInit()
+    sd->isValid = isSubDimValid;
+    sd->init = initVector;
+
+    resetSubdim(sd);
+
+    // First pass: count the valid combinations
+    i = 0;
+    do {
+        i++;
+    } while (nextSubdimElem(sd));
+    sd->allVariant = malloc(i* sizeof(Variant));
+
+    resetSubdim(sd);
+    sd->varCount = i;
+
+    // Second pass: record every valid combination as a variant
+    for (i = 0; i < sd->varCount; ++i) {
+        int j;
+        int gpx;
+        int gpy;
+
+        // The variant stores the index of each variable, not its value
+        for(j = 0; j < V_COUNT; ++j) {
+            sd->allVariant[i].var[j]  = sd->var[j].curId;
+        }
+
+        sd->allVariant[i].minTime = 0.0;
+        sd->allVariant[i].probableTime = 0.0;
+        sd->allVariant[i].maxTime = 5000.0;
+        sd->allVariant[i].weight = 10;
+        sd->allVariant[i].time = 0;
+
+        // pg packs the level-0 / level-1 ratios in x and y into one number
+        gpx = get(&sd->var[V_L0_X])/ get(&sd->var[V_L1_X]);
+        gpy = get(&sd->var[V_L0_Y])/ get(&sd->var[V_L1_Y]);
+        sd->allVariant[i].pg =  gpx * 1000 + gpy;
+
+        nextSubdimElem(sd);
+    }
+    resetSubdim(sd);
+
+    sd->minTime = 9999;
+    sd->curVar = &sd->allVariant[0];
+    sd->curVarID = 0;
+
+
+    // Initializing group
+    sd->infoMaxCount = 5000;
+    sd->infoCount  = 0;
+    sd->info = malloc(sd->infoMaxCount * sizeof(GroupStatInfo) );
+
+    //           L0       L1       PG
+    //           x  y  w  x  y  w
+    setGroup(sd, 1, 1, 0, 0, 0, 0, 0);
+    setGroup(sd, 1, 1, 1, 0, 0, 0, 0);
+    setGroup(sd, 0, 0, 0, 1, 1, 1, 0);
+    setGroup(sd, 1, 1, 0, 1, 1, 0, 0);
+}
+
+/*
+ * Give variable `var` a new list of candidate values: dcount ints are
+ * copied from dim into a freshly allocated array, any previous list is
+ * released, and the selection is reset to index 0.
+ */
+void
+setVariable(struct SubDimInfo* sdi, SubDimVariable var, int dcount, int* dim)
+{
+    SubDimItem* item = &sdi->var[var];
+    size_t nbytes = dcount*sizeof(int);
+
+    item->curId = 0;
+    item->maxId = dcount;
+
+    /* free(NULL) is a no-op, so no guard is needed */
+    free(item->data);
+    item->data = malloc(nbytes);
+    memcpy(item->data, dim, nbytes);
+}
+
+
diff --git a/src/library/tools/tune/subdim.h b/src/library/tools/tune/subdim.h
new file mode 100644
index 0000000..932906f
--- /dev/null
+++ b/src/library/tools/tune/subdim.h
@@ -0,0 +1,143 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef SUBDIM_H__
+#define SUBDIM_H__
+
+//#define TEST_LOG
+
+/* Candidate values of one tuning variable plus the current selection. */
+typedef struct SubDimItem
+{
+    int curId;  /* index of the selected value in data */
+    int maxId;  /* number of values in data */
+    int* data;  /* heap array of candidate values (see setVariable) */
+}SubDimItem;
+
+int get(SubDimItem * sdi);
+
+///////////////////////////////////////////////////////////////////////////////
+/*
+ * NOTE(review): presumably the named form of the -1 "any value" marker that
+ * IgnoreItem and GroupStatInfo entries use as a literal - confirm.
+ */
+enum
+{
+    V_NONE = -1,
+};
+/*
+ * Indices of the tuning variables: x, y and bwidth for level 0 and level 1.
+ * V_COUNT is the number of variables and sizes the var[] arrays.
+ */
+typedef enum SubDimVariable
+{
+    V_L0_X,
+    V_L0_Y,
+    V_L0_BW,
+    V_L1_X,
+    V_L1_Y,
+    V_L1_BW,
+    V_COUNT,
+}SubDimVariable;
+
+/*
+ * One entry of the singly linked list of combinations to skip
+ * (see setInvalid).
+ */
+typedef struct IgnoreItem
+{
+    int var[V_COUNT];           /* value per variable; -1 matches any value */
+    struct IgnoreItem* next;    /* next entry or NULL */
+}IgnoreItem;
+
+/* Statistics of one group of variants (groups are created by setGroup). */
+typedef struct GroupStatInfo
+{
+    int var[V_COUNT];   /* per variable: index shared by all members, or -1 */
+    int pg;             /* set to 0 when the group is created */
+
+    double minTime;     /* minimum / estimated minimum time of the group */
+    double allTime;     /* sum of the (capped) measured times of members */
+    int count;          /* members whose time has been measured */
+    int allCount;       /* all members of the group */
+}GroupStatInfo;
+
+/* One valid combination of the tuning variables and its timing data. */
+typedef struct Variant
+{
+    // index (SubDimItem curId) selected for each variable
+    int var[V_COUNT];
+    // (L0 x / L1 x) * 1000 + (L0 y / L1 y), see initSubDimInfo
+    int pg;
+    // Estimated time performance
+    double minTime;      // lower bound
+    double probableTime; // only initialised to 0 in subdim.c
+    double maxTime;      // upper bound
+
+    double weight;       // selection weight; the greatest one is tried next
+    double time;         // measured time, 0 while not measured
+}Variant;
+
+///////////////////////////////////////////////////////////////////////////////
+
+/* Complete state of the sub-dimension tuner (see initSubDimInfo). */
+typedef struct SubDimInfo
+{
+    // dynamic array for statistics
+    GroupStatInfo * info;
+    int infoCount;      // groups in use
+    int infoMaxCount;   // allocated capacity of info
+
+    // all valid combinations, varCount entries
+    Variant* allVariant;
+
+    // candidate values and current selection of every variable
+    SubDimItem var[V_COUNT];
+
+    // decomposition computed by calcParam() for the current selection
+    PGranularity    pgran;
+    SubproblemDim   sdim[MAX_SUBDIMS];
+
+    MemoryPattern * pattern;
+    // result of the last isValid() call
+    bool valid;
+
+    DataType            dtype;
+    KernelExtraFlags    flag;
+
+    unsigned int func;
+    unsigned int patt;
+
+    // pattern flags contain SF_WSPACE_2D
+    bool is2D;
+
+    int  blasLevel;     // funcBlasLevel(func)
+    int  nrLevel;       // pattern->nrLevels
+    bool isSquareBlock; // pattern flags contain SF_TOP_INPUT_SQUARE_BLOCKS
+    unsigned long ldsSize;      // copied from DeviceInfo
+    size_t workGroupSizes;      // copied from DeviceInfo
+
+    // head of the list of combinations to skip
+    IgnoreItem * first;
+
+    int count;          // steps taken since the last reset
+    double sumTime;     // sum of the times passed to nextSubdim()
+
+    Variant* curVar;    // variant currently being measured
+    int curVarID;
+    int varCount;       // number of entries in allVariant
+    float minTime;      // smallest positive time reported so far
+
+    // callbacks: fill the variables / validate the current decomposition
+    void (*init)(struct SubDimInfo* sdi);
+    bool (*isValid)(struct SubDimInfo* sdi);
+
+//#ifdef TEST_LOG
+    // when set, nextSubdim() enumerates every variant in order
+    bool returnAll;
+//#endif
+
+}SubDimInfo;
+
+void setVariable(struct SubDimInfo* sdi, SubDimVariable var, int dcount, int* dim);
+void setInvalid (struct SubDimInfo* sdi, int l0x, int l0y, int l0w,
+        int l1x, int l1y, int l1w);
+
+bool nextSubdim(SubDimInfo* sd, int maxParam, double time);
+void resetSubdim(SubDimInfo* sd);
+void initSubDimInfo(SubDimInfo* sd, MemoryPattern* mempatt,
+               DeviceInfo* devinfo, unsigned int func, unsigned int patt,
+               DataType dtype, KernelExtraFlags flag);
+
+void destroySubdim(SubDimInfo* sd);
+void convKExtraFlagToArg(KernelExtraFlags flags, CLBlasKargs* args);
+#endif /* SUBDIM_H__ */
diff --git a/src/library/tools/tune/toolslib.c b/src/library/tools/tune/toolslib.c
new file mode 100644
index 0000000..680a219
--- /dev/null
+++ b/src/library/tools/tune/toolslib.c
@@ -0,0 +1,540 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <malloc.h>
+#include <string.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include "storage_data.h"
+#include "toolslib.h"
+#include "devinfo.h"
+#include "assert.h"
+#include "clblas_stddef.h"
+#include "mutex.h"
+
+
+// The array size is the total number devices on all platforms
+static StorageCacheImpl* storageCacheArray = NULL;
+// Number of items in storage cache array
+// is the number of unique devices.
+static unsigned int    storageCacheArrayCount = 0;
+
+static mutex_t *storageCacheLock = NULL;
+
+
+/* Set numPatterns to 0 in each of the BLAS_FUNCTIONS_NUMBER entries. */
+static void
+clearPatternsNumber(BlasFunctionInfo *funcInfo)
+{
+    int idx = BLAS_FUNCTIONS_NUMBER;
+
+    while (idx-- > 0) {
+        funcInfo[idx].numPatterns = 0;
+    }
+}
+
+/*
+ * Return the CL_DEVICE_NAME string of the device in a newly allocated
+ * buffer.
+ *
+ * The caller owns the returned buffer and must free() it.
+ * Returns NULL when the name cannot be queried or memory cannot be
+ * allocated.
+ */
+char*
+getDevName(TargetDevice* tdev)
+{
+    size_t size = 0;
+    char* name;
+    cl_int err;
+
+    /* first call: ask only for the required buffer size */
+    err = clGetDeviceInfo(tdev->id, CL_DEVICE_NAME, 0, NULL, &size);
+    if (err != CL_SUCCESS || size == 0) {
+        return NULL;
+    }
+
+    name = malloc(size * sizeof(char));
+    if (name == NULL) {
+        return NULL;
+    }
+
+    err = clGetDeviceInfo(tdev->id, CL_DEVICE_NAME, size, name, NULL);
+    if (err != CL_SUCCESS) {
+        free(name);
+        return NULL;
+    }
+
+    return name;
+}
+
+void
+initCLDeviceInfoRec(TargetDevice* tdev, DeviceInfo *devInfo)
+{
+    cl_int status = 0;
+    cl_uint bDouble;
+    cl_device_id devID = tdev->id;
+    devInfo->tdev = tdev;
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_MAX_COMPUTE_UNITS,
+        sizeof(cl_uint),
+        &(devInfo->nrComputeUnits),
+        NULL);
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_GLOBAL_MEM_SIZE,
+        sizeof(cl_ulong),
+        &(devInfo->globalSize),
+        NULL);
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_LOCAL_MEM_SIZE,
+        sizeof(cl_ulong),
+        &(devInfo->ldsSize),
+        NULL);
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+        sizeof(cl_ulong),
+        &(devInfo->maxMemAllocSize),
+        NULL);
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE,
+        sizeof(cl_uint),
+        &(devInfo->alignment),
+        NULL);
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+        sizeof(cl_uint),
+        &(devInfo->workItemSizesDim),
+        NULL);
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_MAX_WORK_ITEM_SIZES,
+        sizeof(size_t) * devInfo->workItemSizesDim,
+        &(devInfo->workItemSizes),
+        NULL);
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_MAX_WORK_GROUP_SIZE,
+        sizeof(size_t) ,
+        &(devInfo->workGroupSizes),
+        NULL);
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_ADDRESS_BITS,
+        sizeof(cl_uint),
+        &(devInfo->addressBits),
+        NULL);
+
+
+
+    status = clGetDeviceInfo(devID,
+        CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
+        sizeof(cl_uint),
+        &bDouble,
+        NULL);
+    devInfo->nativeDouble = deviceHasNativeDouble(devID, &status);
+
+    // Values are put randomly.
+    //TODO  To use the correct data
+    devInfo->nrStreamCores = 1;     /* Number of stream cores per Compute Unit */
+    devInfo->wavefront = 64;        /* Number of work-items executed in parallel on hardware */
+    devInfo->nativeComplex = true;  /* Specifies whether device supports complex float */
+}
+
+/*
+ * Prepare a storage cache entry for reading: build the function tables for
+ * the device, compute the end-of-file offset and derive the two file paths
+ * from the device name.
+ *
+ * Returns false when the main path (fpath) could not be created.
+ */
+bool
+initReadingData(StorageCacheImpl* cacheImpl, TargetDevice* tdev )
+{
+    DeviceInfo devInfo;
+    char* name;
+
+    initCLDeviceInfoRec(tdev, &devInfo);
+    initBlasFuncionData(cacheImpl->functionInfo);
+    initCacheData(cacheImpl->functionInfo, &devInfo);
+
+    cacheImpl->endFile = calcOffset(cacheImpl->functionInfo);
+
+    name = getDevName(tdev);
+    cacheImpl->fpath = createFullPatch(name, false);
+    cacheImpl->fpath_tmp = createFullPatch(name, true);
+    free(name);
+
+    return cacheImpl->fpath != NULL;
+}
+
+/*
+ * Look up the tuned decomposition stored for a pattern / data type / flags /
+ * dimension combination.
+ *
+ * On success (a record exists and its status is SS_CORRECT_DATA) the
+ * subproblem dimensions, the granularity and the time are copied to the
+ * output arguments and GF_SUCCESS is returned. In every other case the
+ * outputs are left untouched and GF_ERROR is returned.
+ *
+ * NOTE(review): the previous code also had a branch returning
+ * GF_CORRUPT_FILE, but it tested the result of a `!=` comparison against -1
+ * and could never be taken. It has been removed; if corrupt records must be
+ * reported, test the specific sstatus value for that here.
+ */
+int
+getGranularityInfo(
+    // In
+    TargetDevice* tdev,
+    const char* pattName,
+    const DataType dt,
+    const KernelExtraFlags kflag,
+    int dim,  //
+    // Out
+    SubproblemDim *sdim,
+    PGranularity *pgran,
+    double *time)
+{
+    BlasParamInfo* bParam;
+    StorageCacheImpl* cache = getStorageCache(tdev, false);
+
+    if (cache == NULL) {
+        return GF_ERROR;
+    }
+
+    bParam = findParam(cache, pattName, dt, kflag, dim);
+    if (bParam == NULL || bParam->sstatus != SS_CORRECT_DATA) {
+        return GF_ERROR;
+    }
+
+    memcpy(sdim, bParam->sDim, sizeof(SubproblemDim)* MAX_SUBDIMS);
+    memcpy(pgran,&bParam->pGran, sizeof(PGranularity));
+    *time = bParam->time;
+
+    return GF_SUCCESS;
+}
+
+/*
+ * Load the kernels stored for a pattern / data type / flags / dimension
+ * combination into buffer[] and sizeBuffer[].
+ *
+ * Both arrays are cleared for MAX_CLBLAS_KERNELS_PER_STEP entries once a
+ * cache has been found. Returns GF_SUCCESS only when the cache is
+ * populated, a record exists and buffer[0] is NULL after loading; GF_ERROR
+ * otherwise.
+ * NOTE(review): success being tied to buffer[0] == NULL looks inverted -
+ * confirm against loadKernelsFromFile() and the callers.
+ */
+int
+getKernelInfo(
+    TargetDevice* devID,
+    const char* pattName,
+    const DataType dt,
+    const KernelExtraFlags kflag,
+    int dim,
+    unsigned char** buffer,
+    size_t* sizeBuffer)
+{
+    BlasParamInfo* bParam;
+    StorageCacheImpl* cache = getStorageCache(devID, false);
+
+    if (cache == NULL) {
+        return GF_ERROR;
+    }
+
+    memset(buffer, 0, sizeof(char*) * MAX_CLBLAS_KERNELS_PER_STEP);
+    memset(sizeBuffer, 0, sizeof(size_t) * MAX_CLBLAS_KERNELS_PER_STEP);
+
+    if (!cache->isPopulate) {
+        return GF_ERROR;
+    }
+
+    bParam = findParam(cache, pattName, dt, kflag, dim);
+    if (bParam == NULL) {
+        return GF_ERROR;
+    }
+
+    loadKernelsFromFile(cache, bParam, buffer, sizeBuffer);
+
+    return (buffer[0] == NULL) ? GF_SUCCESS : GF_ERROR;
+}
+/******************************************************************************/
+
+/*
+ * Release the whole storage cache: per-entry data and path strings, the
+ * lock and the array itself. Does nothing when no array is allocated.
+ */
+void
+destroyStorageCache(void)
+{
+    unsigned int idx;
+
+    if (storageCacheArray == NULL) {
+        return;
+    }
+
+    for (idx = 0; idx < storageCacheArrayCount; idx++) {
+        StorageCacheImpl* entry = &storageCacheArray[idx];
+
+        destroyData(entry->functionInfo);
+
+        /* free(NULL) is a no-op, so unset paths need no guard */
+        free(entry->fpath);
+        free(entry->fpath_tmp);
+
+        entry->isPopulate = false;
+    }
+
+    storageCacheArrayCount = 0;
+
+    mutexDestroy(storageCacheLock);
+    storageCacheLock = NULL;
+
+    free(storageCacheArray);
+    storageCacheArray = NULL;
+}
+
+/*
+ * Return the cached info record of BLAS function `func` for the device, or
+ * NULL when there is no cache for the device or func is outside
+ * [0, BLAS_FUNCTIONS_NUMBER).
+ */
+BlasFunctionInfo*
+getBlasFunctionInfo(TargetDevice* tdev, int func)
+{
+    StorageCacheImpl* cache = getStorageCache(tdev, false);
+
+    if (cache == NULL) {
+        return NULL;
+    }
+    if (func < 0 || func >= BLAS_FUNCTIONS_NUMBER) {
+        return NULL;
+    }
+    return &cache->functionInfo[func];
+}
+
+
+#define CHECK_(X) \
+        res = X; \
+        if (!res) { \
+            printf("ERROR %s\n", #X); \
+            /*raise(SIGTRAP);*/ \
+        }
+
+/*
+ * Debug helper: compare, field by field, the function/pattern/extra/param
+ * tables held in the storage cache of 'tdev' against the reference table
+ * 'fiArr' (indexed by function number, BLAS_FUNCTIONS_NUMBER entries).
+ * Every mismatch is reported on stdout through CHECK_; nothing is returned
+ * and nothing is modified. Does nothing when the device has no populated
+ * cache.
+ *
+ * NOTE(review): all inner loops are bounded by the counts of the
+ * reference table (fi/pi/ei) while indexing the cached one as well; if the
+ * counts differ (which is itself reported) the cached arrays may be read
+ * past their end - confirm how they are allocated.
+ */
+void checkFILE(TargetDevice* tdev, BlasFunctionInfo* fiArr)
+{
+    StorageCacheImpl*  impl;
+    bool res;
+    int func;
+    unsigned int patt;
+    unsigned int extra;
+    unsigned int param;
+
+    impl = getStorageCache(tdev, false);
+    if (impl == NULL) {
+        return;
+    }
+
+    for (func = 0; func < BLAS_FUNCTIONS_NUMBER; func++) {
+        /* 'c'-prefixed pointers refer to the cached copy, the others to the reference */
+        BlasFunctionInfo* cfi = &impl->functionInfo[func];
+        BlasFunctionInfo* fi = &fiArr[func];
+
+        CHECK_(cfi->funcNo == fi->funcNo);
+        CHECK_(cfi->numPatterns == fi->numPatterns);
+        CHECK_(cfi->maskForTuningsKernel == fi->maskForTuningsKernel);
+        CHECK_(cfi->maskForUniqueKernels == fi->maskForUniqueKernels);
+        CHECK_(cfi->defaultPattern == fi->defaultPattern);
+        /* NOTE(review): exact duplicate of the previous check; possibly another field was intended */
+        CHECK_(cfi->defaultPattern == fi->defaultPattern);
+        CHECK_(strcmp(cfi->name, fi->name) == 0);
+        //CHECK_(cfi-> == fi->)
+        for (patt = 0; patt < fi->numPatterns; ++patt) {
+            BlasPatternInfo* cpi = &cfi->pattInfo[patt];
+            BlasPatternInfo* pi = &fi->pattInfo[patt];
+            MemoryPattern*  cmp = &cfi->pattern[patt];
+            MemoryPattern*  mp = &fi->pattern[patt];
+
+            /* pattern bookkeeping record */
+            CHECK_(cpi->numExtra == pi->numExtra );
+            CHECK_(cpi->numTuneExtra == pi->numTuneExtra);
+            CHECK_(cpi->offset == pi->offset);
+            CHECK_(cpi->size == pi->size);
+            //CHECK_(cpi->sstatus == pi->sstatus);
+            CHECK_(cpi->pattNo == pi->pattNo);
+            CHECK_(strcmp(cpi->name, pi->name) == 0);
+
+            /* memory pattern descriptor; sops and extra are compared by pointer value */
+            CHECK_(cmp->nrLevels == mp->nrLevels );
+            CHECK_(cmp->cuLevel == mp->cuLevel );
+            CHECK_(cmp->thLevel == mp->thLevel );
+            CHECK_(cmp->sops == mp->sops );
+            CHECK_(cmp->extra == mp->extra );
+            CHECK_(strcmp(cmp->name, mp->name) == 0);
+
+            for (extra = 0; extra < pi->numExtra; ++extra) {
+                BlasExtraInfo* cei = &cpi->extra[extra];
+                BlasExtraInfo* ei = &pi->extra[extra];
+
+                CHECK_(cei->numParam == ei->numParam);
+                CHECK_(cei->dtype == ei->dtype);
+                CHECK_(cei->flags == ei->flags);
+                CHECK_(cei->vecLen == ei->vecLen);
+                CHECK_(cei->isUseForTunning == ei->isUseForTunning);
+
+                CHECK_(cei->offset == ei->offset);
+                CHECK_(cei->size == ei->size);
+                CHECK_(cei->sstatus == ei->sstatus);
+
+
+                for (param = 0; param < ei->numParam; ++param) {
+                    BlasParamInfo* cpri = &cei->param[param];
+                    BlasParamInfo* pri = &ei->param[param];
+
+                    /* granularity and the three subproblem dimension levels */
+                    CHECK_(cpri->dim == pri->dim);
+                    CHECK_(cpri->pGran.wfSize == pri->pGran.wfSize);
+                    CHECK_(cpri->pGran.wgDim == pri->pGran.wgDim);
+                    CHECK_(cpri->pGran.wgSize[0] == pri->pGran.wgSize[0]);
+                    CHECK_(cpri->pGran.wgSize[1] == pri->pGran.wgSize[1]);
+                    CHECK_(cpri->sDim[0].bwidth == pri->sDim[0].bwidth);
+                    CHECK_(cpri->sDim[0].itemX== pri->sDim[0].itemX);
+                    CHECK_(cpri->sDim[0].itemY== pri->sDim[0].itemY);
+                    CHECK_(cpri->sDim[0].x == pri->sDim[0].x);
+                    CHECK_(cpri->sDim[0].y == pri->sDim[0].y);
+                    CHECK_(cpri->sDim[1].bwidth == pri->sDim[1].bwidth);
+                    CHECK_(cpri->sDim[1].itemX== pri->sDim[1].itemX);
+                    CHECK_(cpri->sDim[1].itemY== pri->sDim[1].itemY);
+                    CHECK_(cpri->sDim[1].x == pri->sDim[1].x);
+                    CHECK_(cpri->sDim[1].y == pri->sDim[1].y);
+                    CHECK_(cpri->sDim[2].bwidth == pri->sDim[2].bwidth);
+                    CHECK_(cpri->sDim[2].itemX== pri->sDim[2].itemX);
+                    CHECK_(cpri->sDim[2].itemY== pri->sDim[2].itemY);
+                    CHECK_(cpri->sDim[2].x == pri->sDim[2].x);
+                    CHECK_(cpri->sDim[2].y == pri->sDim[2].y);
+                    CHECK_(cpri->time == pri->time);
+                    CHECK_(cpri->offset == pri->offset);
+                    CHECK_(cpri->size == pri->size);
+                    CHECK_(cpri->sstatus == pri->sstatus);
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Two device identities are considered equal when chip, family and vendor
+ * all match.
+ */
+bool
+isDeviceEQ(DeviceIdent* dev1, DeviceIdent* dev2)
+{
+    return (dev1->chip == dev2->chip) &&
+           (dev1->family == dev2->family) &&
+           (dev1->vendor == dev2->vendor);
+}
+
+/*
+ * Find the storage cache entry matching the identity of 'tdev' and make
+ * sure its data has been read from file (done once, under
+ * storageCacheLock).
+ *
+ * Returns the entry when it holds data, or when 'force' is true even if
+ * it is empty. Returns NULL when the cache is empty and 'force' is false,
+ * or when no entry matches the device (the latter also trips an assert in
+ * debug builds).
+ *
+ * initStorageCache() must have been called before.
+ */
+StorageCacheImpl*
+getStorageCache(TargetDevice* tdev, bool force)
+{
+    unsigned int k;
+    StorageCacheImpl* curCache = NULL;
+
+    assert(storageCacheArray != NULL);
+    assert(storageCacheLock != NULL);
+
+    for (k = 0; k < storageCacheArrayCount; ++k) {
+        if (isDeviceEQ(&tdev->ident, &storageCacheArray[k].devIdent) ) {
+            curCache  = &storageCacheArray[k];
+        }
+    }
+
+    assert (curCache != NULL);
+    if (curCache == NULL) {
+        /*
+         * With NDEBUG the assert above is compiled out; fail gracefully
+         * instead of dereferencing a NULL pointer below.
+         */
+        return NULL;
+    }
+
+    // Only one thread may read the data from file;
+    // once loaded, all threads can use the cached data in parallel
+    if (!curCache->isInit) {
+        mutexLock(storageCacheLock);                // LOCK
+
+        /* re-check under the lock: another thread may have done the load */
+        if (!curCache->isInit) {
+            curCache->isPopulate = false;
+
+            if (initReadingData(curCache, tdev)) {
+                loadDataFromFile(curCache);
+            }
+
+            curCache->isInit = true;
+        }
+        mutexUnlock(storageCacheLock);              // UNLOCK
+    }
+
+    // if the storage cache is empty then return NULL (unless forced)
+    if (!(curCache->isPopulate || force)) {
+        curCache = NULL;
+    }
+
+    return curCache;
+}
+
+/*
+ * Query all available OpenCL platforms.
+ *
+ * On success stores a calloc()-ed array of platform IDs in *platforms
+ * (owned by the caller, release with free()) and returns its length.
+ * Returns 0 when no platform is available or on any failure; in that case
+ * no memory is handed to the caller.
+ */
+unsigned int
+getPlatforms(cl_platform_id **platforms)
+{
+    cl_int ret;
+    cl_uint numberPlatform = 0;
+    cl_platform_id *list;
+
+    ret = clGetPlatformIDs(0, NULL, &numberPlatform);
+
+    if (ret != CL_SUCCESS || numberPlatform == 0) {
+        return  0;
+    }
+
+    list = calloc(numberPlatform, sizeof(*list));
+
+    if (list == NULL) {
+        *platforms = NULL;
+        return 0;
+    }
+
+    ret = clGetPlatformIDs(numberPlatform, list, NULL);
+    if (ret != CL_SUCCESS) {
+        /* do not report a zero-filled list as valid platforms */
+        free(list);
+        *platforms = NULL;
+        return 0;
+    }
+
+    *platforms = list;
+    return numberPlatform;
+}
+
+/*
+ * Create the global storage cache: one entry per distinct device identity
+ * (chip/family/vendor) found over all devices of all OpenCL platforms.
+ * Entries are only registered here; their data is loaded lazily by
+ * getStorageCache().
+ *
+ * Must be called while the cache is not initialized (asserted). When no
+ * platform is found, or the cache array cannot be allocated, only the lock
+ * is created and the cache array stays NULL. A platform whose device list
+ * cannot be allocated or queried is skipped.
+ */
+void
+initStorageCache(void)
+{
+    cl_uint numberPlatform = 0;
+    cl_platform_id *platforms = NULL;
+    cl_device_id *devices = NULL;
+    StorageCacheImpl* cur = NULL;
+    cl_int ret;
+
+    unsigned int deviceCount = 0;
+    unsigned int i, j, k;
+
+    assert (storageCacheLock == NULL);
+    assert (storageCacheArray == NULL);
+    assert (storageCacheArrayCount == 0);
+
+    storageCacheLock = mutexInit();
+    numberPlatform = getPlatforms(&platforms);
+
+    if (numberPlatform ==0) {
+        return;
+    }
+
+    /* first pass: upper bound for the number of cache entries */
+    for (i =0; i < numberPlatform; ++i) {
+        cl_uint dc = 0;
+
+        ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dc);
+        if (ret == CL_SUCCESS) {
+            deviceCount += dc;
+        }
+    }
+
+    storageCacheArray = calloc(deviceCount, sizeof(*storageCacheArray));
+    if (storageCacheArray == NULL) {
+        free (platforms);
+        return;
+    }
+
+    /* second pass: register every distinct device identity */
+    for (i =0; i < numberPlatform; ++i) {
+        cl_uint dc = 0;
+
+        ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dc);
+        if (ret != CL_SUCCESS || dc == 0) {
+            continue;
+        }
+
+        devices = calloc(dc, sizeof(*devices));
+        if (devices == NULL) {
+            continue;
+        }
+
+        ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, dc, devices, NULL);
+        if (ret != CL_SUCCESS) {
+            free(devices);
+            continue;
+        }
+
+        for (j = 0; j < dc; ++ j) {
+            TargetDevice td;
+            bool isUnique = true;
+
+            td.id = devices[j];
+            identifyDevice(&td);
+
+            for (k = 0; k < storageCacheArrayCount; ++k) {
+                if (isDeviceEQ(&td.ident, &storageCacheArray[k].devIdent) ) {
+                    isUnique = false;
+                }
+            }
+
+            /*
+             * The capacity test guards against a device list that grew
+             * between the two passes.
+             */
+            if (isUnique && storageCacheArrayCount < deviceCount) {
+                cur = &storageCacheArray[storageCacheArrayCount];
+
+                clearPatternsNumber(cur->functionInfo);
+                cur->isInit = false;
+                cur->devIdent.chip = td.ident.chip;
+                cur->devIdent.family = td.ident.family;
+                cur->devIdent.vendor = td.ident.vendor;
+
+                storageCacheArrayCount++;
+            }
+        }
+        free(devices);
+    }
+    free (platforms);
+}
diff --git a/src/library/tools/tune/toolslib.h b/src/library/tools/tune/toolslib.h
new file mode 100644
index 0000000..48c27e6
--- /dev/null
+++ b/src/library/tools/tune/toolslib.h
@@ -0,0 +1,87 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TOOLSLIB_H__
+#define TOOLSLIB_H__
+
+#include <CL/cl.h>
+
+#include <defbool.h>
+#include <devinfo.h>
+#include <cltypes.h>
+
+#include <granulation.h>
+#include <kernel_extra.h>
+
+// Interface for accessing the saved (tuned) data
+
+/* Result codes returned by the get...Info() functions below */
+#define GF_SUCCESS          0
+#define GF_ERROR            1
+#define GF_INVALID_CACHE    2
+#define GF_CORRUPT_FILE     3
+#define GF_KERNEL_NOT_FOUND 4
+
+
+/*
+ * FIXME: This is a kludge for dedicated processing of the case when the
+ *        matrix leading dimension is aligned on the bank size
+ */
+#define BANK_ALIGNED_CASE_RECORD_IDX 5
+
+
+/* Index of a problem-size record; presumably a value from getDimensionID() */
+typedef int dimension;
+
+/*
+ * Create the global storage cache (one entry per distinct device found on
+ * the available OpenCL platforms). Must be paired with
+ * destroyStorageCache().
+ */
+void
+initStorageCache(void);
+
+/* Release everything held by the global storage cache. */
+void
+destroyStorageCache(void);
+
+/*
+ * Fetch the stored decomposition for a pattern / data type / flag set /
+ * dimension record. Outputs go to 'sdim', 'pgran' and 'time'.
+ * Returns one of the GF_* codes.
+ * NOTE(review): the required length of 'sdim' is not visible from this
+ * header - confirm against the implementation.
+ */
+int
+getGranularityInfo (
+    TargetDevice* tdev,
+    const char* pattName,
+    const DataType dt,
+    const KernelExtraFlags kflag,
+    dimension dim,
+    SubproblemDim* sdim,
+    PGranularity*
+    pgran,
+    double* time);
+
+/*
+ * Fetch the stored kernels for a pattern / data type / flag set /
+ * dimension record. Both output arrays are cleared over
+ * MAX_CLBLAS_KERNELS_PER_STEP entries by the implementation, so they must
+ * have at least that many elements. Returns GF_SUCCESS or GF_ERROR.
+ */
+int
+getKernelInfo (
+    TargetDevice* tdev,
+    const char* pattName,
+    const DataType dt,
+    const KernelExtraFlags kflag,
+    dimension dim,
+    unsigned char** bufer,
+    size_t* sizeBufer);
+
+/* Presumably the number of dimension records stored for function 'func' - verify. */
+int getDimensionCount(TargetDevice* tdev, int func);
+
+/*
+ * Map a problem size (M, N, K) of function 'func' to a dimension record
+ * index. NOTE(review): the exact mapping rule is defined in the
+ * implementation, which is not visible here.
+ */
+dimension
+getDimensionID (
+    TargetDevice* tdev,
+    int func,
+    size_t M,
+    size_t N,
+    size_t K);
+
+#endif /* TOOLSLIB_H__ */
+
diff --git a/src/library/tools/tune/tune.c b/src/library/tools/tune/tune.c
new file mode 100644
index 0000000..d41e45a
--- /dev/null
+++ b/src/library/tools/tune/tune.c
@@ -0,0 +1,2646 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <malloc.h>
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <CL/cl.h>
+
+// #include "fileio.h"
+#include "toolslib.h"
+#include "tune.h"
+#include "devinfo.h"
+#include "assert.h"
+#include "solution_seq.h"
+#include "matrix_dims.h"
+
+#include "subdim.h"
+
+#if defined(_MSC_VER)
+#include "Windows.h"
+#else
+#include "time.h"
+#endif
+
+/* Process exit codes of the tuning tool */
+#define EXIT_COD_OK                         0x0000
+#define EXIT_COD_CL_ERROR                   0x0100
+#define EXIT_COD_UNKNOWN_DATATYPE           0x0101
+#define EXIT_COD_NO_DATA                    0x0102
+#define EXIT_COD_NO_ENVIRONMENT_VARIABLE    0x0103
+#define EXIT_COD_BAD_ENVIRONMENT_VARIABLE   0x0104
+
+/* Number of data types selectable for tuning (size of genInfo.aDType) */
+#define TYPE_NUMBER 4
+/* NOTE(review): presumably the number of timing runs per kernel - verify at the use site */
+#define MAX_RUN_KERNEL 3
+
+/* Presumably a bit set built from the MASK_KERNEL_* values - verify */
+typedef  int KMASK;
+
+//////////////////////////////////////////////////////////////////
+#if defined(_MSC_VER)
+
+typedef unsigned long long nano_time_t;
+#define NANOTIME_MAX (~0ULL - 1)
+
+#define fmin min
+#define fmax max
+
+/*
+ * Convert a tick count obtained from getCurrentTime() (or a difference of
+ * two such counts) to nanoseconds. Returns 0 when the performance counter
+ * frequency is unavailable.
+ */
+nano_time_t
+conv2nanosec(nano_time_t t)
+{
+    LARGE_INTEGER freq;
+    nano_time_t ticksPerSec;
+
+    if (QueryPerformanceFrequency(&freq) == FALSE || freq.QuadPart <= 0) {
+        return 0;
+    }
+    ticksPerSec = (nano_time_t)freq.QuadPart;
+
+    /*
+     * Convert whole seconds and the remainder separately: multiplying the
+     * raw tick count first can overflow 64 bits for large counter values,
+     * and dividing first would lose sub-microsecond precision.
+     */
+    return (t / ticksPerSec) * 1000000000ULL +
+           ((t % ticksPerSec) * 1000000000ULL) / ticksPerSec;
+}
+
+/*
+ * Current value of the high resolution performance counter, in ticks
+ * (convert with conv2nanosec()). Returns 0 on failure.
+ */
+nano_time_t
+getCurrentTime(void)
+{
+    LARGE_INTEGER count;
+
+    if (QueryPerformanceCounter(&count) == FALSE) {
+        return 0;
+    }
+    return (nano_time_t)count.QuadPart;
+}
+#else /* defined(_MSC_VER) */
+
+/*
+ * NOTE: where 'unsigned long' is 32 bits wide a nanosecond count wraps
+ * after about 4.29 seconds.
+ */
+typedef unsigned long nano_time_t;
+#define NANOTIME_MAX (~0UL - 1)
+
+/* Identity: getCurrentTime() already reports nanoseconds here. */
+nano_time_t
+conv2nanosec(nano_time_t t)
+{
+    /* clock_... functions measure time in nanoseconds */
+    return t;
+}
+
+/*
+ * Current time in nanoseconds, intended for measuring intervals.
+ * The monotonic clock is preferred so that wall clock adjustments cannot
+ * distort (or turn negative) a measured interval. Returns 0 on failure.
+ */
+nano_time_t
+getCurrentTime(void)
+{
+    int err;
+    struct timespec t;
+
+#if defined(CLOCK_MONOTONIC)
+    err = clock_gettime(CLOCK_MONOTONIC, &t);
+#else
+    err = clock_gettime(CLOCK_REALTIME, &t);
+#endif
+    if (err == 0) {
+        return ((nano_time_t)t.tv_sec * 1000000000UL + (nano_time_t)t.tv_nsec);
+    }
+    return 0;
+}
+
+#endif  /* defined(_MSC_VER) */
+//////////////////////////////////////////////////////////////////
+
+/*
+ * Block until all commands in 'commandQueues' have finished and, when an
+ * event is supplied, report how its command terminated.
+ *
+ * Returns CL_SUCCESS when the queue finished and the event's command (if
+ * any) completed normally; otherwise the OpenCL error code from
+ * clFinish(), from clGetEventInfo(), or the (negative) error code with
+ * which the command was abnormally terminated.
+ */
+cl_int
+waitForSuccessfulFinish(
+    cl_command_queue commandQueues,
+    cl_event *event)
+{
+    cl_int err, status;
+
+    err = clFinish(commandQueues);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    if (event == NULL || *event == NULL) {
+        return CL_SUCCESS;
+    }
+
+    status = CL_COMPLETE;
+    err = clGetEventInfo(*event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+        sizeof(status), &status, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    if (status < 0) {
+        /*
+         * A negative execution status already is the OpenCL error code;
+         * pass it on unchanged so that callers can recognize it.
+         */
+        return status;
+    }
+    return CL_SUCCESS;
+}
+
+/*
+ * Issue all queued commands of 'commandQueue' to the device.
+ * Returns the result of clFlush().
+ */
+cl_int
+flushAll(cl_command_queue commandQueue)
+{
+    return clFlush(commandQueue);
+}
+
+
+/* Bit flags naming the kernels of a step (compute kernel, and kernels tied to A and B) */
+enum {
+    MASK_KERNEL_COMP = 0x01,
+    MASK_KERNEL_A    = 0x02,
+    MASK_KERNEL_B    = 0x04
+};
+
+/* NOTE(review): presumably the path of the storage file; set elsewhere */
+const char *FILE_PATH = NULL;
+
+/* Stream receiving the diagnostics of this tool (see checkErrorFunc()) */
+FILE* logStream;
+
+int globalDim = 0;
+/* Maximum length of a device name */
+enum {
+    DEVNAME_MAXLEN = 64
+};
+
+#ifdef TEST_LOG
+#include <time.h>
+/*
+ * nano_time_t and getCurrentTime() are already defined unconditionally
+ * earlier in this file; defining them again here broke the build whenever
+ * TEST_LOG was set. Only the accumulators used for test logging remain.
+ */
+double globalTime = 0;
+double globalFastTime = 0;
+
+#endif
+
+/* Functions implemented in other translation units of the tuning tool */
+extern int getDataTypeSize(DataType dataType);
+extern void writeStorageCache(TargetDevice* devID);
+extern BlasFunctionInfo* getBlasFunctionInfo(TargetDevice* devID, int func);
+extern void checkFILE(TargetDevice* devID, BlasFunctionInfo* fiArr);
+/* The returned string is released with free() in destroyGenInfo() */
+extern char* getDevName(TargetDevice* tdev);
+
+/* Sentinel: all bits set */
+const unsigned int uiNONE = (unsigned int)-1;
+
+// float types based unified pointer
+/*
+ * One pointer viewed as any of the supported element types; all members
+ * alias the same address.
+ */
+typedef union FPtr {
+    void *v;            /* untyped view, used for allocation and transfers */
+    cl_float *f;        /* single precision real */
+    cl_double *d;       /* double precision real */
+    cl_float2 *f2;      /* single precision complex */
+    cl_double2 *d2;     /* double precision complex */
+} FPtr;
+
+
+
+/*
+ * One candidate decomposition under evaluation, together with the
+ * kernel(s) built for it.
+ */
+typedef struct GParam {
+
+    int count;
+    char            name[65];           /* printable form, see createGParamName() */
+    SubproblemDim   dims[MAX_SUBDIMS];  /* subproblem dimensions per level */
+    PGranularity    pgran;              /* work group shape */
+    unsigned int    vecLen;
+
+    cl_ulong        time;               /* NOTE(review): presumably the measured run time - verify units */
+    // For each kernel the binaries are created
+    Kernel*         kernel;             /* compute kernel, built by genKernel() */
+    //cl_kernel       clkern;
+    size_t          binary_sizes;       /* size in bytes of 'binaries' */
+    char*           binaries;           /* program binary, malloc()-ed in genKernel() */
+
+    /* optional kernels tied to matrices A and B, released in destroyKernels() */
+    Kernel          *kernelPrepA;
+    Kernel          *kernelPrepB;
+
+    size_t          binary_sizesA;
+    char*           binariesA;
+
+    size_t          binary_sizesB;
+    char*           binariesB;
+
+//    POSFILE         fbin[MAX_CLBLAS_KERNELS_PER_STEP];
+} GParam;
+
+/*
+ * Test problem for one dimension record: sizes, host side matrices and
+ * the OpenCL memory objects created for them.
+ */
+typedef struct MatrixInfo {
+    DataType        dtype;      /* element type */
+    unsigned int    sizeDType;  /* size of one element in bytes */
+
+    /* problem sizes */
+    unsigned int M;
+    unsigned int N;
+    unsigned int K;
+
+    /* device buffers, created in initCLBlasKArgDim() */
+    cl_mem clA;
+    cl_mem clB;
+    cl_mem clC;
+
+    /* host buffers, allocated in subInitMatrixInfo() */
+    FPtr A;
+    FPtr B;
+    FPtr C;
+
+    /* image objects handed to the kernels as scratch images, and their host data */
+    cl_mem clImgA;
+    cl_mem clImgB;
+    void *imgA;
+    void *imgB;
+}MatrixInfo;
+
+/*
+ * Operating mode of the tool, stored in genInfo.aCommand.
+ * NOTE(review): the meaning of each mode is inferred from its name only.
+ */
+enum Command {
+    C_DEFAULT,      /* mode selected by initGeneratorInfoRec() */
+    C_REBUILD,
+    C_GENKERNEL,
+    C_ADD,
+};
+
+/*
+ * Global state of the tuning tool: the OpenCL objects in use and the
+ * option fields (prefixed with 'a'), whose defaults are set in
+ * initGeneratorInfoRec().
+ */
+struct GeneratorInfoRec {
+
+    cl_platform_id      platform;       // ID of the selected platform
+    cl_device_type      devType;        // device type to enumerate (GPU by default)
+    cl_context          ctx;            // created in getContext()
+    cl_command_queue    queue;          // profiling-enabled queue, created in getContext()
+    //
+    unsigned int        numDevices;     // number of devices of devType on the platform
+
+    TargetDevice 		targetDevice;	// device being tuned, set in initDevice()
+    DeviceInfo          deviceInfos;    // TODO: delete this member. Use TargetDevice.
+    char                *deviceName;    // from getDevName(); freed in destroyGenInfo()
+
+    bool       aFunc[BLAS_FUNCTIONS_NUMBER];   // per-function flags, all false by default
+    int        aPattern;                       // -1 by default
+    bool       aDType[TYPE_NUMBER];            // per-data-type flags, all false by default
+    int        aFlag;                          // -1 by default
+    int        aCommand;                       // enum Command value
+    bool       aIsKernel;
+    int        aMaxparam;                      // 5000 by default
+    bool       aExtendedOutput;
+    bool       aAll;
+
+    double next;
+    double last;
+    const char* patternName;
+} genInfo;
+
+/*
+ * Write one right-aligned field of width 'w' followed by a space into
+ * 'name': the decimal value of 'data', or the marker "SU" when 'data' is
+ * (size_t)-1.
+ *
+ * Returns the position just past the field, i.e. name + w + 1. The caller
+ * must provide room for at least w + 2 characters; a value needing more
+ * than 'w' digits is printed in full and will be partly overwritten by the
+ * next field written at the returned position.
+ */
+char *
+genParamStr(char* name, int w, size_t data)
+{
+    /*
+     * The field width is passed with '*' instead of being printed into a
+     * small intermediate format buffer, which overflowed for w >= 10.
+     */
+    if (data != (size_t)-1) {
+        sprintf(name, "%*u ", w, (unsigned)data);
+    }
+    else {
+        sprintf(name, "%*s ", w, "SU");
+    }
+    return name + w + 1;
+}
+
+/*
+ * Append the x, y and bwidth fields of 'dim' (each 'w' wide, see
+ * genParamStr()) to 'name', followed by a ':' separator.
+ * Returns the position just past the ':'.
+ */
+char *
+genParamsStr(SubproblemDim* dim, char* name, int w)
+{
+    char* cursor;
+
+    cursor = genParamStr(name, w, dim->x);
+    cursor = genParamStr(cursor, w, dim->y);
+    cursor = genParamStr(cursor, w, dim->bwidth);
+
+    cursor[0] = ':';
+    cursor[1] = '\0';
+    return cursor + 1;
+}
+
+/*
+ * Build the printable name of a decomposition in param->name: the
+ * subproblem dimensions of two levels (three when dims[2].itemX is
+ * positive) followed by the work group size as "<wgSize0>x<wgSize1>".
+ */
+void
+createGParamName(GParam* param)
+{
+    char* pos = genParamsStr(&param->dims[0], param->name, 3);
+
+    if (param->dims[2].itemX > 0) {
+        pos = genParamsStr(&param->dims[1], pos, 3);
+        pos = genParamsStr(&param->dims[2], pos, 2);
+    }
+    else {
+        pos = genParamsStr(&param->dims[1], pos, 2);
+    }
+
+    sprintf(pos,"%3dx%-2d", param->pgran.wgSize[0],
+                            param->pgran.wgSize[1]);
+}
+
+/*
+ * Returns 1 when the pattern's extra information says that matrix A or
+ * matrix B is held in an image object, 0 otherwise (also when the pattern
+ * carries no extra information).
+ */
+static int
+patternUseImages(MemoryPattern *pattern)
+{
+    const CLBLASMpatExtra *mpatExtra = (const CLBLASMpatExtra*)pattern->extra;
+
+    return (mpatExtra != NULL) &&
+           ((mpatExtra->mobjA == CLMEM_IMAGE) ||
+            (mpatExtra->mobjB == CLMEM_IMAGE));
+}
+
+
+/*
+ * Reset the global genInfo record to its defaults: everything zeroed, GPU
+ * devices, default command, no function / data type selected, no pattern
+ * or flag filter (-1), at most 5000 parameters, plain output.
+ */
+void
+initGeneratorInfoRec(void)
+{
+    int idx;
+
+    memset(&genInfo, 0, sizeof(genInfo));
+
+    genInfo.devType = CL_DEVICE_TYPE_GPU;
+    genInfo.aCommand = C_DEFAULT;
+    genInfo.aFlag = -1;
+    genInfo.aPattern = -1;
+    genInfo.aIsKernel = false;
+    genInfo.aMaxparam = 5000;
+    genInfo.aExtendedOutput = false;
+
+    idx = 0;
+    while (idx < TYPE_NUMBER) {
+        genInfo.aDType[idx] = false;
+        ++idx;
+    }
+
+    idx = 0;
+    while (idx < BLAS_FUNCTIONS_NUMBER) {
+        genInfo.aFunc[idx] = false;
+        ++idx;
+    }
+}
+
+/*
+ * Hand the compute kernel and the two optional A / B kernels of 'param'
+ * back through putKernel() and clear the pointers. Slots that are already
+ * NULL are skipped.
+ */
+void
+destroyKernels(GParam *param)
+{
+    Kernel **slots[3];
+    unsigned int s;
+
+    slots[0] = &param->kernel;
+    slots[1] = &param->kernelPrepA;
+    slots[2] = &param->kernelPrepB;
+
+    for (s = 0; s < 3; ++s) {
+        if (*slots[s] == NULL) {
+            continue;
+        }
+        putKernel(NULL, *slots[s]);
+        *slots[s] = NULL;
+    }
+}
+
+/*
+ * Release the resources owned by the global genInfo record: the device
+ * name string, the command queue and the context.
+ *
+ * Safe to call before the OpenCL objects exist (checkErrorFunc() calls it
+ * on every failure, including failures that precede getContext()) and safe
+ * to call more than once: handles are only released when set and are
+ * cleared afterwards.
+ */
+void
+destroyGenInfo(void)
+{
+    free (genInfo.deviceName);
+    genInfo.deviceName = NULL;
+
+    if (genInfo.queue != NULL) {
+        clReleaseCommandQueue(genInfo.queue);
+        genInfo.queue = NULL;
+    }
+    if (genInfo.ctx != NULL) {
+        clReleaseContext(genInfo.ctx);
+        genInfo.ctx = NULL;
+    }
+//    destroyData(genInfo.functionInfo);
+}
+
+/*
+ * Fatal error check for OpenCL calls. Returns silently when 'status' is
+ * CL_SUCCESS. Otherwise logs "<funcName>() failed with <code>(<name>)" to
+ * logStream, releases the global state and terminates the process with
+ * EXIT_COD_CL_ERROR. Codes without a known name are logged as "UNKNOWN".
+ */
+void
+checkErrorFunc(char* funcName, cl_int status)
+{
+    const char *errName = "UNKNOWN";
+
+    if (status == CL_SUCCESS) {
+        return;
+    }
+
+    switch (status) {
+    case CL_OUT_OF_RESOURCES:           // -5
+        errName = "CL_OUT_OF_RESOURCES";
+        break;
+    case CL_BUILD_PROGRAM_FAILURE:      // -11
+        errName = "CL_BUILD_PROGRAM_FAILURE";
+        break;
+    case CL_INVALID_VALUE:              // -30
+        errName = "CL_INVALID_VALUE";
+        break;
+    case CL_INVALID_KERNEL_ARGS:        // -52
+        errName = "CL_INVALID_KERNEL_ARGS";
+        break;
+    case CL_INVALID_WORK_GROUP_SIZE:    // -54
+        errName = "CL_INVALID_WORK_GROUP_SIZE";
+        break;
+    case CL_INVALID_WORK_ITEM_SIZE:     // -55
+        errName = "CL_INVALID_WORK_ITEM_SIZE";
+        break;
+    case CL_INVALID_BUFFER_SIZE:        // -61
+        errName = "CL_INVALID_BUFFER_SIZE";
+        break;
+    default:
+        break;
+    }
+
+    fprintf(logStream, "%s() failed with %d(%s)\n", funcName, status, errName);
+    fflush(logStream);
+    destroyGenInfo();
+    exit(EXIT_COD_CL_ERROR);
+}
+
+/*
+ * Select the OpenCL platform to work on and count its devices of type
+ * genInfo.devType.
+ *
+ * The platform whose vendor string is "Advanced Micro Devices, Inc." is
+ * preferred; when there is none, the last enumerated platform is kept.
+ * Results go to genInfo.platform and genInfo.numDevices. Any failure
+ * (including running out of host memory) terminates the process through
+ * checkErrorFunc().
+ */
+void
+initOpenCl(void)
+{
+    cl_int status = 0;
+    cl_uint numPlatforms;
+    status = clGetPlatformIDs(0, NULL, &numPlatforms);
+    checkErrorFunc("clGetPlatformIDs", status);
+
+    if (numPlatforms > 0) {
+        unsigned int i;
+        cl_platform_id* platforms =
+                (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id));
+
+        if (platforms == NULL) {
+            /* does not return */
+            checkErrorFunc("malloc", CL_OUT_OF_HOST_MEMORY);
+            return;
+        }
+
+        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+        checkErrorFunc("clGetPlatformIDs", status);
+
+        for(i=0; i < numPlatforms; ++i) {
+            char pbuff[100];
+            status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
+                    sizeof(pbuff), pbuff, NULL);
+
+            checkErrorFunc("clGetPlatformInfo", status);
+            /* remember every platform so that the last one is the fallback */
+            genInfo.platform = platforms[i];
+            if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
+                break;
+            }
+        }
+        free(platforms);
+    }
+
+    // Init Device count
+    status = clGetDeviceIDs(genInfo.platform, genInfo.devType, 0, 0,
+            (cl_uint*)&genInfo.numDevices);
+    checkErrorFunc("clGetDeviceIDs", status);
+}
+
+/*
+ * Select device number 'dev' (index into the devices of genInfo.devType on
+ * genInfo.platform) as the target device and fill in its identity, name
+ * and device information in genInfo.
+ *
+ * initOpenCl() must have been called before. An index outside
+ * [0, genInfo.numDevices), an allocation failure or an OpenCL error
+ * terminates the process through checkErrorFunc().
+ */
+void
+initDevice(int dev)
+{
+    cl_int status = 0;
+    cl_uint num_devices = 0;
+    cl_device_id* deviceIDs;
+
+    if (dev < 0 || (unsigned int)dev >= genInfo.numDevices) {
+        /* does not return */
+        checkErrorFunc("initDevice", CL_INVALID_DEVICE);
+        return;
+    }
+
+    deviceIDs =
+           (cl_device_id *)calloc(genInfo.numDevices, sizeof(cl_device_id));
+    if (deviceIDs == NULL) {
+        /* does not return */
+        checkErrorFunc("calloc", CL_OUT_OF_HOST_MEMORY);
+        return;
+    }
+
+    status = clGetDeviceIDs(genInfo.platform, genInfo.devType,
+        genInfo.numDevices,  deviceIDs, &num_devices);
+    if (status == CL_SUCCESS && (cl_uint)dev >= num_devices) {
+        /* fewer devices than counted earlier */
+        status = CL_INVALID_DEVICE;
+    }
+    if (status == CL_SUCCESS) {
+        genInfo.targetDevice.id = deviceIDs[dev];
+    }
+    /* the list is only needed to pick one ID */
+    free(deviceIDs);
+    checkErrorFunc("clGetDeviceIDs", status);
+
+    identifyDevice(&genInfo.targetDevice);
+    genInfo.deviceName = getDevName(&genInfo.targetDevice);
+    initCLDeviceInfoRec(&genInfo.targetDevice, &genInfo.deviceInfos);
+}
+
+/*
+ * Create the OpenCL context for the target device on the selected
+ * platform and a profiling-enabled command queue on it; both are stored in
+ * genInfo. Failures terminate the process through checkErrorFunc().
+ */
+void
+getContext(void)
+{
+    cl_int rc = 0;
+    cl_device_id dev = genInfo.targetDevice.id;
+    cl_context_properties ctxProps[3];
+
+    /* zero-terminated property list: { CL_CONTEXT_PLATFORM, <platform>, 0 } */
+    ctxProps[0] = CL_CONTEXT_PLATFORM;
+    ctxProps[1] = (cl_context_properties)genInfo.platform;
+    ctxProps[2] = 0;
+
+    genInfo.ctx = clCreateContext(ctxProps, 1, &dev, NULL, NULL, &rc);
+    checkErrorFunc("clCreateContext", rc);
+
+    genInfo.queue = clCreateCommandQueue(genInfo.ctx, dev,
+                                         CL_QUEUE_PROFILING_ENABLE, &rc);
+    checkErrorFunc("clCreateCommandQueue",rc);
+}
+
+/*
+ * Despite its name this does not return the number of set bits: it
+ * returns 2 raised to the number of set bits of 'n' (1 for n == 0).
+ */
+int
+bitcount (unsigned int n)  {
+    int result = 1;
+
+    /* n &= n - 1 clears the lowest set bit, so the loop runs once per set bit */
+    for (; n != 0; n &= (n - 1)) {
+        result *= 2;
+    }
+    return result;
+}
+
+/*
+ * Build the compute kernel of 'pattern' for the decomposition described
+ * by 'param' and fetch the resulting program binary.
+ *
+ * On success param->kernel holds the kernel, and param->binaries /
+ * param->binary_sizes hold a malloc()-ed copy of the program binary and
+ * its size (owned by the caller). Returns false when the kernel could not
+ * be made. OpenCL errors and allocation failure terminate the process
+ * through checkErrorFunc().
+ *
+ * Only a single binary is queried (the program is built for one device).
+ */
+bool
+genKernel(GParam *param, CLBLASKernExtra* extra, MemoryPattern *pattern)
+{
+    cl_int status;
+    SolverKgen genKernel;
+    cl_device_id device;
+    char bopts[BUILD_OPTS_MAXLEN];
+
+    genKernel = pattern->sops->genKernel;
+    device = genInfo.targetDevice.id;
+
+    setupBuildOpts(bopts, device, pattern);
+    param->kernel = makeKernel(device, genInfo.ctx, genKernel,
+                               param->dims, &param->pgran, extra, bopts, NULL);
+    if (param->kernel == NULL) {
+        return false;
+    }
+
+    status = clGetProgramInfo(param->kernel->program, CL_PROGRAM_BINARY_SIZES,
+                              sizeof(size_t), &param->binary_sizes, NULL);
+    checkErrorFunc("clGetProgramInfo", status);
+
+    param->binaries = (char *)malloc(sizeof(char)*param->binary_sizes);
+    if (param->binaries == NULL && param->binary_sizes != 0) {
+        /*
+         * A NULL entry would make clGetProgramInfo() silently skip the
+         * binary; treat running out of memory as the fatal error it is.
+         */
+        checkErrorFunc("malloc", CL_OUT_OF_HOST_MEMORY);
+        return false;
+    }
+
+    /* CL_PROGRAM_BINARIES takes an array of pointers; here an array of one */
+    status = clGetProgramInfo(param->kernel->program,
+            CL_PROGRAM_BINARIES,
+            sizeof(char *),
+            &param->binaries,
+            NULL);
+    checkErrorFunc("clGetProgramInfo", status);
+
+    return true;
+}
+
+/*
+ * Translate kernel extra flags into the order / side / uplo / transA /
+ * transB / diag fields of 'args'.
+ *
+ * args->dtype is read here: the conjugate flags only take effect for
+ * complex data types, so dtype has to be valid before the call.
+ */
+void
+convKExtraFlagToArg(KernelExtraFlags flags, CLBlasKargs* args)
+{
+    if (flags & KEXTRA_COLUMN_MAJOR) {
+        args->order = clblasColumnMajor;
+    }
+    else {
+        args->order = clblasRowMajor;
+    }
+
+    if (flags & KEXTRA_SIDE_RIGHT) {
+        args->side = clblasRight;
+    }
+    else {
+        args->side = clblasLeft;
+    }
+
+    if (flags & KEXTRA_UPPER_TRIANG) {
+        args->uplo = clblasUpper;
+    }
+    else {
+        args->uplo = clblasLower;
+    }
+
+    if (flags & KEXTRA_TRANS_A) {
+        args->transA = clblasTrans;
+    }
+    else {
+        args->transA = clblasNoTrans;
+    }
+
+    if (flags & KEXTRA_TRANS_B) {
+        args->transB = clblasTrans;
+    }
+    else {
+        args->transB = clblasNoTrans;
+    }
+
+    /* conjugation overrides the plain transposition, complex types only */
+    if (isComplexType(args->dtype)) {
+        if (flags & KEXTRA_CONJUGATE_A) {
+            args->transA = clblasConjTrans;
+        }
+        if (flags & KEXTRA_CONJUGATE_B) {
+            args->transB = clblasConjTrans;
+        }
+    }
+
+    if (flags & KEXTRA_UNIT_DIAGONAL) {
+        args->diag = clblasUnit;
+    }
+    else {
+        args->diag = clblasNonUnit;
+    }
+}
+
+/*
+ * Fill 'args' for a test run on the problem described by 'mi' with the
+ * given extra flags, and create the device buffers for A, B and C (also
+ * recorded in mi->clA / clB / clC). The host contents of A and B are
+ * uploaded with blocking writes. alpha is 1; beta is 0 when
+ * KEXTRA_BETA_ZERO is set and 1 otherwise.
+ *
+ * OpenCL failures terminate the process through checkErrorFunc().
+ *
+ * NOTE(review): the buffer sizes (A: N*M, B: K*N, C: M*K elements) mirror
+ * the host allocations of subInitMatrixInfo(); they only describe the
+ * usual A(MxK), B(KxN), C(MxN) shapes when M == N == K - confirm that
+ * initKNM callbacks never produce other sizes.
+ */
+void
+initCLBlasKArgDim(CLBlasKargs *args, MatrixInfo* mi, KernelExtraFlags extra)
+{
+    cl_int status;
+    float beta = ((extra & KEXTRA_BETA_ZERO) != 0)? 0.0f : 1.0f;
+
+    memset( args, 0, sizeof(CLBlasKargs) );
+    /*
+     * The data type must be set before the flags are converted:
+     * convKExtraFlagToArg() consults it to decide whether the conjugate
+     * flags apply.
+     */
+    args->dtype = mi->dtype;
+    convKExtraFlagToArg( extra, args );
+
+    switch (mi->dtype)
+    {
+    case TYPE_FLOAT:
+        args->alpha.argFloat = 1.0;
+        args->beta.argFloat = beta;
+        break;
+    case TYPE_DOUBLE:
+        args->alpha.argDouble = 1.0;
+        /* beta must be stored through the double member for double kernels */
+        args->beta.argDouble = beta;
+        break;
+    case TYPE_COMPLEX_FLOAT:
+        args->alpha.argFloatComplex.s[0] = 1.0;
+        args->alpha.argFloatComplex.s[1] = 0.0;
+        args->beta.argFloatComplex.s[0] = beta;
+        args->beta.argFloatComplex.s[1] = 0.0;
+        break;
+
+    case TYPE_COMPLEX_DOUBLE:
+        args->alpha.argDoubleComplex.s[0] = 1.0;
+        args->alpha.argDoubleComplex.s[1] = 0.0;
+        args->beta.argDoubleComplex.s[0] = beta;
+        args->beta.argDoubleComplex.s[1] = 0.0;
+        break;
+    }
+    args->M = mi->M;
+    args->N = mi->N;
+    args->K = mi->K;
+
+    args->A  = clCreateBuffer(genInfo.ctx, CL_MEM_READ_ONLY,
+        args->N * args->M * mi->sizeDType, NULL, &status);
+    checkErrorFunc("clCreateBuffer",status);
+    mi->clA = args->A;
+
+    status = clEnqueueWriteBuffer(genInfo.queue, args->A, CL_TRUE, 0,
+        args->N * args->M * mi->sizeDType, mi->A.v, 0, NULL, NULL);
+    checkErrorFunc("clEnqueueWriteBuffer",status);
+
+    args->lda.matrix = args->K;
+    args->ldb.matrix = args->K;
+    args->ldc.matrix = args->M;
+
+    args->B = clCreateBuffer(genInfo.ctx, CL_MEM_READ_ONLY ,
+        args->K * args->N * mi->sizeDType, NULL, &status);
+    checkErrorFunc("clCreateBuffer",status);
+    mi->clB = args->B;
+
+    status = clEnqueueWriteBuffer(genInfo.queue, args->B, CL_TRUE, 0,
+        args->K * args->N * mi->sizeDType, mi->B.v, 0, NULL, NULL);
+    checkErrorFunc("clEnqueueWriteBuffer",status);
+
+
+    /* C is only written by the kernels, so nothing is uploaded */
+    args->C = clCreateBuffer(genInfo.ctx, CL_MEM_WRITE_ONLY ,
+        args->M * args->K * mi->sizeDType, NULL, &status);
+    checkErrorFunc("clCreateBuffer",status);
+
+    mi->clC = args->C;
+    args->addrBits = genInfo.deviceInfos.addressBits;
+    args->offsetM = 0;
+    args->offsetN = 0;
+    args->offA = 0;
+    args->offBX = 0;
+    args->offCY = 0;
+    args->scimage[0] = mi->clImgA;
+    args->scimage[1] = mi->clImgB;
+
+}
+
+/*
+ * Set the arguments of 'kernel': the pattern's assignKargs() callback
+ * translates 'args' (a by-value copy, tagged with 'kernType') into a
+ * KernelArg list, which is then applied with clSetKernelArg() in order.
+ *
+ * Errors are not reported: setting stops silently at the first failing
+ * OpenCL call.
+ * NOTE(review): the kernel's argument count is assumed not to exceed
+ * MAX_KERNEL_ARGS.
+ */
+void
+initKernelArg(
+    MemoryPattern *pattern,
+    CLBlasKargs args,
+    cl_kernel kernel,
+    CLBlasKernelType kernType,
+    const CLBLASKernExtra *kextra)
+{
+    KernelArg kargList[MAX_KERNEL_ARGS];
+    unsigned int argCount;
+    unsigned int pos;
+    cl_int rc;
+
+    memset(kargList, 0, sizeof(kargList));
+
+    args.kernType = kernType;
+    pattern->sops->assignKargs(kargList, &args, kextra);
+
+    rc = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS,
+                         sizeof(argCount), &argCount, NULL);
+    if (rc != CL_SUCCESS) {
+        return;
+    }
+
+    for (pos = 0; pos < argCount; pos++) {
+        rc = clSetKernelArg(kernel, pos, kargList[pos].typeSize,
+                            kargList[pos].arg.data);
+        if (rc != CL_SUCCESS) {
+            break;
+        }
+    }
+}
+
+/*
+ * Enqueue 'kernel' once on genInfo.queue with the work sizes derived from
+ * the decomposition in 'param' and measure how long it takes to finish.
+ *
+ * The global work size comes from the pattern's calcThreads() callback
+ * when there is one, otherwise from calcGlobalThreads() on the problem
+ * dimensions taken from 'args'. The local work size is the work group
+ * size of param->pgran.
+ *
+ * The kernel object is released here right after being enqueued, so the
+ * caller must not use it afterwards.
+ *
+ * Returns the host-side wall time between flushing the queue and its
+ * completion, in milliseconds. Any OpenCL failure terminates the process
+ * through checkErrorFunc().
+ */
+double
+runKernel(
+    cl_kernel kernel,
+    cl_device_id device,
+    MemoryPattern *pattern,
+    const GParam *param,
+    //unsigned int dim,
+    CLBlasKargs *args,
+    const void *extra,
+    unsigned int funcID)
+{
+    unsigned int nrComputeUnits;
+    size_t globalWorkSize[2];
+    size_t localWorkSize[3];
+    cl_event evt = NULL;
+    cl_int  status;
+    double ret;
+
+    /* the queried value is not used further in this function */
+    status = clGetDeviceInfo(device,
+        CL_DEVICE_MAX_COMPUTE_UNITS,
+        sizeof(cl_uint),
+        (void*)&nrComputeUnits,
+        NULL);
+    checkErrorFunc("clGetDeviceInfo",status);
+
+    //////////////////////////////////////////////////////////////////////////
+    //calcWorkGroups();
+
+    if (pattern->sops->calcThreads) {
+        pattern->sops->calcThreads(globalWorkSize, param->dims,
+                                   &param->pgran, args, extra);
+    }
+    else {
+        /////
+        SubproblemDim globDim;
+        SubproblemDim sd[MAX_SUBDIMS];
+
+        kargsToProbDims(&globDim, funcID, args, false);
+        /* work on copies so that param->dims stays untouched by the swap below */
+        sd[0] = param->dims[0];
+        sd[1] = param->dims[1];
+
+        if ((param->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) {
+            if (pattern->sops->innerDecompositionAxis(args) ==
+                DECOMP_AXIS_X) {
+
+                /*
+                 * these dimensions are not used anywhere else afterwards,
+                 * so we can just swap them
+                 */
+                swapDimXY(&(sd[0]));
+                swapDimXY(&(sd[1]));
+                swapDimXY(&globDim);
+            }
+        }
+        calcGlobalThreads(globalWorkSize, &(sd[0]),
+                          &param->pgran, globDim.y, globDim.x);
+    }
+
+    localWorkSize[0] = param->pgran.wgSize[0];
+    localWorkSize[1] = param->pgran.wgSize[1];
+    localWorkSize[2] = 0;
+
+    fflush(stdout);
+    status = clEnqueueNDRangeKernel(genInfo.queue, kernel, param->pgran.wgDim,
+                                    NULL, globalWorkSize, localWorkSize,
+                                    0, NULL, &evt);
+    /* the kernel is released before the enqueue status is checked */
+    clReleaseKernel(kernel);
+    checkErrorFunc("clEnqueueNDRangeKernel",status);
+
+#if 0
+    /* disabled alternative: device-side timing through event profiling */
+    {
+        cl_ulong start, end;
+
+        status = clFinish(genInfo.queue);
+
+        checkErrorFunc("clFinish", status);
+        status = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,
+                                        sizeof(cl_ulong), &start, NULL);
+
+        checkErrorFunc("clGetEventProfilingInfo",status);
+        status = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,
+                                        sizeof(cl_ulong), &end, NULL);
+        checkErrorFunc("clGetEventProfilingInfo",status);
+
+        end -= start;
+        end /= 1000;
+        ret = (double)end/1000;
+    }
+#else
+    {
+        nano_time_t time;
+
+        /* submit the work first so that only execution is timed */
+        status = flushAll(genInfo.queue);
+        checkErrorFunc("flushAll", status);
+
+        time = getCurrentTime();
+        status = waitForSuccessfulFinish(genInfo.queue, &evt);
+        checkErrorFunc("waitForSuccessfulFinish", status);
+        time = getCurrentTime() - time;
+
+        /* nanoseconds -> microseconds -> milliseconds */
+        ret = (double)conv2nanosec(time)/1000;
+        ret /= 1000;
+    }
+
+#endif
+    clReleaseEvent(evt);
+    return ret;
+}
+
+/*
+ * Store the data type and element size in matrixInfo and allocate the host
+ * buffers for A, B and C.
+ *
+ * The N, M and K fields of matrixInfo are read to size the buffers, so they
+ * must be set before the call. The allocations are neither checked nor
+ * initialized here.
+ */
+void
+subInitMatrixInfo(
+                  MatrixInfo *matrixInfo,
+                  DataType dt,
+                  unsigned int sizeType)
+{
+    MatrixInfo *info = matrixInfo;
+
+    info->A.v = malloc(info->N * info->M * sizeType);
+    info->B.v = malloc(info->N * info->K * sizeType);
+    info->C.v = malloc(info->M * info->K * sizeType);
+
+    info->sizeDType = sizeType;
+    info->dtype = dt;
+}
+
+/*
+ * Set the first maxi single precision elements of the buffer m to 1.0.
+ * Nothing is written when maxi is zero or negative.
+ */
+void
+initMatrixFloat(FPtr* m, int maxi)
+{
+    int idx = 0;
+
+    while (idx < maxi) {
+        m->f[idx] = 1.0;
+        ++idx;
+    }
+}
+
+
+/*
+ * Prepare bExtra->numParam consecutive MatrixInfo entries starting at mi.
+ *
+ * For every entry the problem sizes K, N and M are chosen, the host buffers
+ * are allocated through subInitMatrixInfo() and the A and B buffers are
+ * filled with ones (1 + 0i for the complex types). All OpenCL handles and
+ * image pointers of the entry are reset to NULL. The C buffer is allocated
+ * but not filled.
+ *
+ * The process exits with EXIT_COD_UNKNOWN_DATATYPE for an unknown dt.
+ */
+void
+initMatrixInfo(
+    MatrixInfo *mi,
+    DataType dt,
+    DeviceInfo* di,
+    BlasExtraInfo* bExtra
+    )
+{
+    unsigned int nDim;
+    BlasFunctionInfo* bFunc = bExtra->parent->parent;
+
+    for (nDim = 0; nDim < bExtra->numParam; ++nDim, mi++) {
+        unsigned int i;
+        /*
+         * NOTE(review): bFunc is dereferenced here, before the NULL test
+         * below, so that test cannot protect against a NULL bFunc.
+         */
+        unsigned int dimension = getDimension(nDim, dt, di, bFunc->funcNo);
+
+        /* a function may provide its own K/N/M setup; otherwise use a cube */
+        if (bFunc != NULL && bFunc->initKNM != NULL) {
+            bFunc->initKNM(mi, dimension);
+        }
+        else {
+            mi->K = dimension;
+            mi->N = dimension;
+            mi->M = dimension;
+        }
+
+
+        /*
+         * NOTE(review): A is filled with K * M elements while
+         * subInitMatrixInfo() allocates N * M elements for it; the two agree
+         * only when N == K. Confirm against the initKNM implementations.
+         */
+        switch (dt)
+        {
+        case TYPE_FLOAT:
+            subInitMatrixInfo(mi, dt, sizeof(cl_float));
+            initMatrixFloat(&mi->A, mi->K * mi->M);
+            initMatrixFloat(&mi->B, mi->N * mi->K);
+            break;
+        case TYPE_DOUBLE:
+            subInitMatrixInfo(mi, dt, sizeof(cl_double));
+            for (i = 0; i < mi->K * mi->M; ++i) {
+                mi->A.d[i] = 1.0;
+            }
+            for (i = 0; i < mi->N * mi->K; ++i) {
+                mi->B.d[i] = 1.0;
+            }
+            break;
+        case TYPE_COMPLEX_FLOAT:
+            subInitMatrixInfo(mi, dt, sizeof(cl_float2));
+            /* s[0] is set to 1 and s[1] to 0 for every element */
+            for (i = 0; i < mi->K * mi->M; ++i) {
+                mi->A.f2[i].s[0] = 1.0;
+                mi->A.f2[i].s[1] = 0.0;
+            }
+            for (i = 0; i < mi->N * mi->K; ++i) {
+                mi->B.f2[i].s[0] = 1.0;
+                mi->B.f2[i].s[1] = 0.0;
+            }
+            break;
+        case TYPE_COMPLEX_DOUBLE:
+            subInitMatrixInfo(mi, dt, sizeof(cl_double2));
+            for (i = 0; i < mi->K * mi->M; ++i) {
+                mi->A.d2[i].s[0] = 1.0;
+                mi->A.d2[i].s[1] = 0.0;
+            }
+            for (i = 0; i < mi->N * mi->K; ++i) {
+                mi->B.d2[i].s[0] = 1.0;
+                mi->B.d2[i].s[1] = 0.0;
+            }
+            break;
+        default:
+            exit (EXIT_COD_UNKNOWN_DATATYPE);
+        }
+
+        /* no device objects exist yet for this entry */
+        mi->clA = NULL;
+        mi->clB = NULL;
+        mi->clC = NULL;
+
+        mi->clImgA = NULL;
+        mi->clImgB = NULL;
+        mi->imgA = NULL;
+        mi->imgB = NULL;
+
+    }
+}
+
+/*
+ * Release the three OpenCL buffer objects of one MatrixInfo entry and reset
+ * all of its device handles and image pointers to NULL.
+ *
+ * The image objects themselves are not released here; only the references
+ * stored in the entry are cleared.
+ */
+void
+releaseMemObjOne(MatrixInfo * mi)
+{
+    clReleaseMemObject(mi->clA);
+    mi->clA = NULL;
+
+    clReleaseMemObject(mi->clB);
+    mi->clB = NULL;
+
+    clReleaseMemObject(mi->clC);
+    mi->clC = NULL;
+
+    mi->imgA = NULL;
+    mi->clImgA = NULL;
+    mi->imgB = NULL;
+    mi->clImgB = NULL;
+}
+
+/*
+ * Apply releaseMemObjOne() to each of the bExtra->numParam entries of the
+ * array starting at mi.
+ */
+void
+releaseMemObjAll(MatrixInfo * mi, BlasExtraInfo* bExtra)
+{
+    unsigned int idx;
+
+    for (idx = 0; idx < bExtra->numParam; idx++) {
+        releaseMemObjOne(&mi[idx]);
+    }
+}
+
+/*
+ * Free the host A, B and C buffers of each of the bExtra->numParam entries
+ * of the array starting at mi. The pointers are not reset afterwards.
+ */
+void
+destroyMatrixInfo(MatrixInfo* mi, BlasExtraInfo* bExtra)
+{
+    unsigned int idx;
+
+    for (idx = 0; idx < bExtra->numParam; idx++) {
+        MatrixInfo *entry = &mi[idx];
+
+        free(entry->A.v);
+        free(entry->B.v);
+        free(entry->C.v);
+    }
+}
+
+/*
+ * Write one line to the log with the best parameter index, the parameter
+ * set name and the best time recorded for problem size number nDim.
+ *
+ * bestParam and bestTime are arrays indexed by nDim.
+ */
+void
+logBest(
+        unsigned int * bestParam,
+        unsigned int nDim,
+        GParam * gp,
+        double * bestTime)
+{
+    /* bestParam holds unsigned values, so print them with %u */
+    fprintf(logStream,  "        %u  %s = %f\n",bestParam[nDim],
+            gp->name,  bestTime[nDim]);
+    fflush(logStream);
+}
+/*
+ * Log that no stored data was found for problem size dim.
+ */
+void
+logCheckError(int dim)
+{
+    const int size = dim;
+
+    fprintf(logStream,  " [%5d]:  NOT FOUND\n", size);
+}
+/*
+ * Log the result of checking stored tuning data for problem size dim.
+ *
+ * Nothing is printed unless extended output is enabled. When oldt is zero
+ * the parameter set name and the time t are printed first and t becomes
+ * the reference time. A marker follows: "* " or "+ " (depending on kern)
+ * when t is within 0.0001 of the reference time, "- " otherwise.
+ */
+void
+logCheck(
+         int dim,
+         SubproblemDim* sdim,
+         PGranularity* pgran,
+         double t,
+         double oldt,
+         bool kern)
+{
+    GParam gp;
+    int level;
+
+    /* build a temporary parameter set only to obtain its printable name */
+    for (level = 0; level < 3; level++) {
+        gp.dims[level] = sdim[level];
+    }
+    gp.pgran = *pgran;
+    createGParamName(&gp);
+
+    if (genInfo.aExtendedOutput) {
+        const char *mark = "- ";
+
+        if (oldt == 0) {
+            fprintf(logStream,  " [%5d]:  %s  - %7g ",dim, gp.name, t);
+            oldt = t;
+        }
+        if (fabs(t - oldt) < 0.0001) {
+            mark = kern ? "* " : "+ ";
+        }
+        fprintf(logStream, "%s", mark);
+    }
+    fflush(logStream);
+}
+
+
+/*
+ * Report progress for parameter set number cur out of max.
+ *
+ * With extended output the counter and the parameter set name are printed.
+ * Otherwise a percentage interpolated between genInfo.last and
+ * genInfo.next is printed, preceded by seven backspaces when cur > 0 so
+ * that it overwrites the previously printed value.
+ */
+void
+logParamName(GParam * params, int cur, int max)
+{
+    if (!genInfo.aExtendedOutput) {
+        if (cur > 0) {
+            fprintf(logStream, "\b\b\b\b\b\b\b");
+        }
+        fprintf(logStream, "%5.2f%% ", genInfo.last
+                + (genInfo.next - genInfo.last)*cur/max);
+        fflush(logStream);
+        return;
+    }
+
+    fprintf(logStream, "%3i/%-3i, %s :", cur, max, params->name);
+    fflush(logStream);
+
+    /*  For Debug GEMM, Memmory pattern #4
+
+        fprintf(logStream,
+                "%3i/%-3i; wg: %dx%d; iB: %lux%lu; gB: %lux%lu; bw: %lu",
+                cur,
+                max,
+                params->pgran.wgSize[1],
+                params->pgran.wgSize[0],
+                params->dims[1].x,
+                params->dims[1].y,
+                params->dims[0].x,
+                params->dims[0].y,
+                params->dims[0].bwidth);
+    */
+}
+
+/*
+ * Print a measured time to the log; does nothing unless extended output
+ * is enabled.
+ */
+void
+logTime(double time)
+{
+    if (!genInfo.aExtendedOutput) {
+        return;
+    }
+
+    fprintf(logStream, " %7.2f", time);
+    fflush(logStream);
+}
+
+/*
+ * Print the " *" marker to the log; does nothing unless extended output
+ * is enabled.
+ */
+void
+logKernalGen(void)
+{
+    if (!genInfo.aExtendedOutput) {
+        return;
+    }
+
+    fprintf(logStream, " *");
+    fflush(logStream);
+}
+
+/*
+ * Announce that the pattern called patternName is being tuned.
+ *
+ * With extended output the announcement is always printed, followed by a
+ * newline. Otherwise it is printed only when patternName is a different
+ * pointer than the one remembered in genInfo.patternName, and it is
+ * followed by seven spaces. The pointer (not the string contents) is
+ * compared and remembered.
+ */
+void
+logPattern(const char * patternName)
+{
+    const bool samePattern = (genInfo.patternName == patternName);
+
+    if (!genInfo.aExtendedOutput && samePattern) {
+        return;
+    }
+
+    fprintf(logStream, "%s is being tuned, progress: ", patternName);
+    fprintf(logStream, "%s", genInfo.aExtendedOutput ? "\n" : "       ");
+    fflush(logStream);
+    genInfo.patternName = patternName;
+}
+/*
+ * Terminate the current log line; does nothing unless extended output is
+ * enabled.
+ */
+void
+logEndString(void)
+{
+    if (!genInfo.aExtendedOutput) {
+        return;
+    }
+
+    fprintf(logStream, "\n");
+    fflush(logStream);
+}
+
+/*
+ * Print trueName when (flags & flag) is greater than zero and falseName
+ * otherwise.
+ */
+void
+logExtraFlag(
+        KernelExtraFlags flags,
+        KernelExtraFlags flag,
+        const char * trueName,
+        const char * falseName
+        )
+{
+    const char *label = ((flags & flag) > 0) ? trueName : falseName;
+
+    fprintf(logStream, "%s", label);
+}
+/*
+ * Log the end of tuning for function number func and pattern number patt.
+ *
+ * Nothing is printed when the function is not enabled in genInfo.aFunc or
+ * when a pattern was selected (genInfo.aPattern != -1) and it is not patt.
+ * In the compact output mode the progress text is erased with backspaces
+ * and replaced by a completion message.
+ */
+void
+logEndPattern(unsigned int func, unsigned int patt)
+{
+    if (!genInfo.aFunc[func]) {
+        return;
+    }
+    if (genInfo.aPattern != -1 && genInfo.aPattern != (int)patt) {
+        return;
+    }
+
+    if (!genInfo.aExtendedOutput) {
+        int pass;
+
+        /* the same run of backspaces is emitted twice */
+        for (pass = 0; pass < 2; pass++) {
+            fprintf(logStream, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b");
+        }
+        fprintf(logStream, " tuning is complete.                     \n");
+    }
+    fprintf(logStream, "\n");
+    fflush(logStream);
+}
+
+/*
+ * Log the flag set and data type described by bExtra.
+ *
+ * In the compact output mode only the percentage genInfo.last is
+ * reprinted. With extended output the numeric flag value, the decoded
+ * transposition, order, triangle and side flags and the data type name are
+ * printed. The stream is flushed and logEndString() is called at the end.
+ */
+void
+logExtra(BlasExtraInfo* bExtra)
+{
+    KernelExtraFlags flags = bExtra->flags;
+
+    if (genInfo.aExtendedOutput) {
+        const char *typeName = "";
+
+        fprintf(logStream, "   Flag (%d):(clblas*)", flags);
+
+        /* conjugation takes precedence over plain transposition */
+        logExtraFlag(flags, KEXTRA_CONJUGATE_A, " ConjTrans",
+                     (flags & KEXTRA_TRANS_A)? " Trans": " NoTrans");
+        fprintf(logStream, "(A)");
+        logExtraFlag(flags, KEXTRA_CONJUGATE_B, " ConjTrans",
+                     (flags & KEXTRA_TRANS_B)? " Trans": " NoTrans");
+        fprintf(logStream, "(B)");
+        logExtraFlag(flags, KEXTRA_COLUMN_MAJOR, " ColumnMajor", " RowMajor");
+        logExtraFlag(flags, KEXTRA_UPPER_TRIANG, " Upper", " Lower");
+        logExtraFlag(flags, KEXTRA_SIDE_RIGHT, " Right", " Left");
+
+        fprintf(logStream, " \n");
+
+        switch (bExtra->dtype)
+        {
+        case TYPE_FLOAT:
+            typeName = "FLOAT";
+            break;
+        case TYPE_DOUBLE:
+            typeName = "DOUBLE";
+            break;
+        case TYPE_COMPLEX_FLOAT:
+            typeName = "COMPLEX_FLOAT";
+            break;
+        case TYPE_COMPLEX_DOUBLE:
+            typeName = "COMPLEX_DOUBLE";
+            break;
+        }
+        fprintf(logStream, "   TYPE = %s:", typeName);
+    }
+    else {
+        fprintf(logStream, "\b\b\b\b\b\b\b\b %5.2f%% ", genInfo.last);
+    }
+
+    fflush(logStream);
+    logEndString();
+}
+
+/*
+ * Report a kernel build failure in the log.
+ */
+void
+logError(void)
+{
+    static const char message[] =
+        " An internal kernel build error occurred!\n";
+
+    fprintf(logStream, "%s", message);
+    fflush(logStream);
+}
+
+/*
+ * Release a scratch image and the host buffer backing it.
+ *
+ * Nothing happens when *clImg is NULL. Otherwise the image object is
+ * released, the host buffer is freed and both pointers are reset to NULL.
+ */
+static void
+releaseSCImage(void** buf, cl_mem* clImg)
+{
+    if (*clImg == NULL) {
+        return;
+    }
+
+    clReleaseMemObject(*clImg);
+    *clImg = NULL;
+
+    free(*buf);
+    *buf = NULL;
+}
+
+/*
+ * Create a read-only RGBA/float scratch image backed by a zero-filled host
+ * buffer.
+ *
+ * The image size is derived from half of the maximum allocation size of
+ * the first device of genInfo.ctx and is clamped to the maximum 2D image
+ * width and height of that device.
+ *
+ * On success *buf receives the host buffer, *image the image object and
+ * CL_SUCCESS is returned. On failure an OpenCL error code is returned,
+ * CL_OUT_OF_HOST_MEMORY when the host buffer cannot be allocated; after a
+ * failed image creation *buf is freed and reset to NULL.
+ */
+static cl_int
+createSCImage(
+    void **buf,
+    cl_mem *image)
+{
+    cl_image_format format = { CL_RGBA, CL_FLOAT };
+    size_t width, height, maxWidth, maxHeight;
+    cl_int status;
+    cl_ulong memSize;
+    cl_device_id device;
+    cl_int err;
+
+    err = clGetContextInfo(genInfo.ctx, CL_CONTEXT_DEVICES,
+                           sizeof(device), &device, NULL);
+
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+            sizeof(memSize), &memSize, NULL);
+
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+            sizeof(maxWidth), &maxWidth, NULL);
+
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+            sizeof(maxHeight), &maxHeight, NULL);
+
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    // some functions need 2 scratch images
+    memSize /= 2;
+
+    /* the width is derived from the unclamped height, as before */
+    height = (size_t)sqrt((double)memSize / sizeof(cl_float));
+    width = height / 4;
+    if (height > maxHeight) {
+        height = maxHeight;
+    }
+
+    if (width > maxWidth) {
+        width = maxWidth;
+    }
+
+    *buf = calloc(width * height, 4 * sizeof(cl_float));
+    /* test the allocation itself; buf is the caller's out-parameter */
+    if (*buf == NULL) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+    *image = clCreateImage2D(genInfo.ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        &format, width, height, 0, *buf, &status);
+    if (*image == NULL) {
+        free(*buf);
+        *buf = NULL;
+        return status;
+    }
+    return CL_SUCCESS;
+}
+
+/*
+ * Build the preparation kernel of type kernType for the given parameter
+ * set and store the kernel, its binary and the binary size in param.
+ *
+ * The results go to the *A fields of param for CLBLAS_PREP_A_KERNEL and
+ * to the *B fields for CLBLAS_PREP_B_KERNEL. extra->kernType is switched
+ * to kernType for the duration of the call and restored before returning.
+ *
+ * When the kernel cannot be created the failure is logged with logError()
+ * and param is left untouched. When the host buffer for the binary cannot
+ * be allocated, a NULL binary with size 0 is stored.
+ *
+ * args is not used.
+ */
+static void
+generatePrepKernel(
+    cl_device_id device,
+    MemoryPattern *pattern,
+    GParam * param,
+    CLBlasKargs *args,
+    CLBLASKernExtra *extra,
+    CLBlasKernelType kernType)
+{
+    PGranularity pgran;
+    Kernel *k = NULL;
+    size_t bSize;
+    char*  bin;
+    cl_int status;
+    CLBlasKernelType kernTypeOld = extra->kernType;
+
+    DUMMY_ARG_USAGE(args);
+
+    extra->kernType = kernType;
+
+    /* pass a copy so that the granularity stored in param is not modified */
+    pgran = param->pgran;
+
+    k = makeKernel(
+        device,
+        genInfo.ctx,
+        pattern->sops->genKernel,
+        param->dims,
+        &pgran,
+        extra,
+        NULL,
+        NULL);
+
+    if (k == NULL) {
+        /* nothing to query or store without a kernel */
+        logError();
+        extra->kernType = kernTypeOld;
+        return;
+    }
+
+    /* a single size_t is requested: one binary size */
+    status = clGetProgramInfo(k->program, CL_PROGRAM_BINARY_SIZES,
+            sizeof(size_t), &bSize, NULL);
+    checkErrorFunc("clGetProgramInfo", status);
+
+    bin = malloc(bSize);
+    if (bin != NULL) {
+        status = clGetProgramInfo(k->program, CL_PROGRAM_BINARIES,
+                sizeof(char*), &bin, NULL);
+        checkErrorFunc("clGetProgramInfo", status);
+    }
+    else {
+        /* keep the stored size consistent with the missing binary */
+        bSize = 0;
+    }
+
+    if (kernType == CLBLAS_PREP_A_KERNEL) {
+        param->binariesA = bin;
+        param->binary_sizesA = bSize;
+        param->kernelPrepA = k;
+    }
+
+    if (kernType == CLBLAS_PREP_B_KERNEL) {
+        param->binariesB = bin;
+        param->binary_sizesB = bSize;
+        param->kernelPrepB = k;
+    }
+
+    extra->kernType = kernTypeOld;
+}
+
+/*
+ * Drop one reference to gp; NULL is accepted and ignored.
+ *
+ * When the reference counter reaches zero the kernels, the three stored
+ * binaries and the structure itself are released. The caller's pointer is
+ * not modified, so it must not be used after the last reference is gone.
+ */
+void
+delGParam(GParam * gp)
+{
+    if (gp == NULL) {
+        return;
+    }
+
+    gp->count--;
+    if (gp->count != 0) {
+        return;
+    }
+
+    destroyKernels(gp);
+    free(gp->binaries);
+    free(gp->binariesA);
+    free(gp->binariesB);
+    free(gp);
+}
+
+/*
+ * Adjust extra->flags according to the target device.
+ *
+ * A temporary SolutionStep is filled from args, extra, parCur, func and
+ * patt, and the target device is identified. For an AMD device the
+ * KEXTRA_VENDOR_AMD and KEXTRA_ENABLE_MAD flags are added to extra->flags.
+ * Finally selectVectorization() is called with the step and extra.
+ *
+ * Only the step fields assigned below are initialized.
+ */
+void
+setFlagsDependentOnDevice(
+        CLBlasKargs* args,
+        CLBLASKernExtra* extra,
+        GParam* parCur,
+        unsigned int func,
+        unsigned int patt
+        )
+{
+    SolutionStep step;
+    cl_int status;
+
+    step.args = *args;
+    step.cmdQueue = genInfo.queue;
+    step.extraFlags = extra->flags;
+    step.funcID = func;
+    step.kernels[0] = NULL;
+    step.kernels[1] = NULL;
+    step.kernels[2] = NULL;
+    //step.node = NULL;
+    step.numEventsInWaitList = 0;
+    step.patternID = patt;
+    step.pgran = parCur->pgran;
+    step.subdims[0] = parCur->dims[0];
+    step.subdims[1] = parCur->dims[1];
+    step.subdims[2] = parCur->dims[2];
+
+    step.device.id = genInfo.targetDevice.id;
+    status = identifyDevice(&step.device);
+    checkErrorFunc("identifyDevice", status);
+
+    if (step.device.ident.vendor == VENDOR_AMD) {
+        extra->flags |= (KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD);
+    }
+    /* extraFlags in step was copied before the vendor flags were added */
+    selectVectorization(&step, extra);
+}
+
+/*
+ * Generate the computing kernel for parCur and, for patterns that use
+ * images, both preparation kernels as well.
+ *
+ * extra is received by value, so the flag changes made here stay local to
+ * this call. The work group sizes in parCur may be swapped and args may be
+ * changed by the pattern's fixupArgs callback.
+ *
+ * Returns the result of genKernel() for the computing kernel.
+ */
+bool
+genAllKernel(
+        CLBlasKargs* args,
+        CLBLASKernExtra extra,
+        GParam* parCur,
+        MemoryPattern * pattern,
+        unsigned int func,
+        unsigned int patt
+        )
+{
+    bool ret;
+    cl_device_id device = genInfo.targetDevice.id;
+
+    if (func == (unsigned int)CLBLAS_SYRK ||
+            func == (unsigned int)CLBLAS_SYR2K) {
+        extra.flags |= KEXTRA_SYRK_SEPARATE_DIAGONAL;
+    }
+
+    setFlagsDependentOnDevice(args, &extra, parCur, func, patt);
+
+    // fixup work group size in respect with desired work dispatch order
+    if ((parCur->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) {
+        if (pattern->sops->innerDecompositionAxis(args) == DECOMP_AXIS_X) {
+            unsigned int u;
+
+            /* the swap is stored in parCur and so outlives this call */
+            u = parCur->pgran.wgSize[0];
+            parCur->pgran.wgSize[0] = parCur->pgran.wgSize[1];
+            parCur->pgran.wgSize[1] = u;
+        }
+    }
+
+    if (pattern->sops->fixupArgs) {
+        pattern->sops->fixupArgs(args, parCur->dims, &extra);
+    }
+
+    ret = genKernel(parCur, &extra, pattern);
+
+    /* the preparation kernels are generated whatever the value of ret */
+    if (patternUseImages(pattern)) {
+        generatePrepKernel(device, pattern, parCur, args, &extra,
+                CLBLAS_PREP_A_KERNEL);
+
+        generatePrepKernel(device, pattern, parCur, args, &extra,
+                CLBLAS_PREP_B_KERNEL);
+    }
+    return ret;
+}
+
+/*
+ * Run the kernels of the parameter set parCur and return the minimal
+ * execution time of the computing kernel.
+ *
+ * For patterns that use images both preparation kernels are run once
+ * first; their times are not part of the result. The computing kernel is
+ * then run up to MAX_RUN_KERNEL times (7 more for BLAS level 2
+ * functions). The loop stops early once at least half of the runs are
+ * done and the minimal time is both greater than 2 and more than twice
+ * bestTime.
+ *
+ * args->kernType is modified while the preparation kernels are run and is
+ * left as CLBLAS_COMPUTING_KERNEL afterwards.
+ */
+double
+runAllKernel(
+             MemoryPattern * pattern,
+             CLBlasKargs *args,
+             GParam* parCur,
+             unsigned int funcId,
+             double bestTime)
+{
+    double time;
+    double minTime = 1e30;
+    int i;
+    cl_device_id device = genInfo.targetDevice.id;
+    int max_run_kernel = MAX_RUN_KERNEL + (funcBlasLevel(funcId) == 2 ? 7 : 0);
+
+    cl_int status;
+    cl_kernel kernel;
+
+    if (patternUseImages(pattern)) {
+        cl_kernel kPrepA;
+        cl_kernel kPrepB;
+
+        /////////////// A //////////////
+        status = clCreateKernelsInProgram(
+                parCur->kernelPrepA->program, 1, &kPrepA, NULL);
+        checkErrorFunc("clCreateKernelsInProgram", status);
+
+        initKernelArg(pattern, *args, kPrepA, CLBLAS_PREP_A_KERNEL,
+                      parCur->kernelPrepA->extra);
+
+        args->kernType = CLBLAS_PREP_A_KERNEL;
+        /* the time of a preparation kernel is not used */
+        (void)runKernel(kPrepA, device, pattern, parCur,
+                args, parCur->kernelPrepA->extra, funcId);
+
+        /////////////// B //////////////
+        status = clCreateKernelsInProgram(
+                parCur->kernelPrepB->program, 1, &kPrepB, NULL);
+        checkErrorFunc("clCreateKernelsInProgram", status);
+
+        initKernelArg(pattern, *args, kPrepB, CLBLAS_PREP_B_KERNEL,
+                      parCur->kernelPrepB->extra);
+
+        args->kernType = CLBLAS_PREP_B_KERNEL;
+        (void)runKernel(kPrepB, device, pattern, parCur,
+                args, parCur->kernelPrepB->extra, funcId);
+        args->kernType = CLBLAS_COMPUTING_KERNEL;
+    }
+
+    for (i = 0; i < max_run_kernel; ++i) {
+        /* a new kernel object is created for every run */
+        status = clCreateKernelsInProgram(parCur->kernel->program, 1, &kernel, NULL);
+        checkErrorFunc("clCreateKernelsInProgram", status);
+
+        initKernelArg(pattern, *args, kernel, CLBLAS_COMPUTING_KERNEL,
+                      parCur->kernel->extra);
+
+        time = runKernel(kernel, device, pattern, parCur, args,
+                         parCur->kernel->extra, funcId);
+        minTime = fmin(time, minTime);
+        /* give up early on a parameter set that is clearly too slow */
+        if (minTime > bestTime*2 && i >= max_run_kernel/2 && minTime > 2) {
+            break;
+        }
+    }
+    return minTime;
+}
+
+/*
+ * Allocate a zeroed GParam holding one reference and copy the three
+ * subproblem dimensions and the granularity from sdi into it. The name of
+ * the parameter set is generated with createGParamName().
+ *
+ * The calloc() result is not checked.
+ */
+GParam*
+createParCur(SubDimInfo *sdi)
+{
+    GParam *created = calloc(1, sizeof(GParam));
+    int level;
+
+    created->count ++;
+
+    for (level = 0; level < 3; level++) {
+        created->dims[level] = sdi->sdim[level];
+    }
+    created->pgran = sdi->pgran;
+
+    createGParamName(created);
+    return created;
+}
+
+/*
+ * Allocate a GParam holding one reference and copy the dimensions, the
+ * granularity, the vector length and the time from sdi into it. The
+ * binaries and their sizes are not copied; they start out empty. The name
+ * of the parameter set is generated with createGParamName().
+ *
+ * The calloc() result is not checked.
+ */
+GParam*
+createParCur2(GParam* sdi)
+{
+    GParam *copy = calloc(1, sizeof(GParam));
+    int level;
+
+    copy->count = 1;
+
+    for (level = 0; level < 3; level++) {
+        copy->dims[level] = sdi->dims[level];
+    }
+    copy->pgran   = sdi->pgran;
+    copy->vecLen  = sdi->vecLen;
+    copy->time    = sdi->time;
+
+    copy->binaries  = NULL;
+    copy->binary_sizes = 0;
+    copy->binariesA = NULL;
+    copy->binary_sizesA = 0;
+    copy->binariesB = NULL;
+    copy->binary_sizesB = 0;
+
+    createGParamName(copy);
+    return copy;
+}
+
+/*
+ * Store the time, the granularity and the subproblem dimensions of parCur
+ * in bParam. The three kernel binary sizes are stored as well when
+ * genInfo.aIsKernel is set; otherwise they are set to zero.
+ */
+static void
+setParam(BlasParamInfo* bParam, double time, GParam* parCur)
+{
+    int level;
+
+    bParam->time = time;
+    bParam->pGran = parCur->pgran;
+    for (level = 0; level < 3; level++) {
+        bParam->sDim[level] = parCur->dims[level];
+    }
+
+    if (!genInfo.aIsKernel) {
+        bParam->kSize[0] = 0;
+        bParam->kSize[1] = 0;
+        bParam->kSize[2] = 0;
+        return;
+    }
+
+    bParam->kSize[0] = (unsigned int)parCur->binary_sizes;
+    bParam->kSize[1] = (unsigned int)parCur->binary_sizesA;
+    bParam->kSize[2] = (unsigned int)parCur->binary_sizesB;
+}
+
+/*
+ * Three-way comparison of two doubles for sorting in descending order.
+ *
+ * Returns a positive value when *i < *j, a negative value when *i > *j
+ * and zero when neither holds, which is the contract expected from a
+ * qsort() style comparison function.
+ */
+int VISIBILITY_HIDDEN
+comp(const void *i, const void *j)
+{
+    const double lhs = *(const double *)i;
+    const double rhs = *(const double *)j;
+
+    return (lhs < rhs) - (lhs > rhs);
+}
+
+/*
+ * Clear extra and copy the data type and the flags from bExtra into it.
+ * bExtra must not be NULL.
+ */
+void VISIBILITY_HIDDEN
+initCLBLASExtra(CLBLASKernExtra* extra, BlasExtraInfo* bExtra)
+{
+    memset(extra, 0, sizeof(*extra));
+
+    extra->flags = bExtra->flags;
+    extra->dtype = bExtra->dtype;
+}
+#ifdef TEST_LOG
+
+/*
+ * State of the test log file used when TEST_LOG is defined.
+ */
+typedef struct LOG_FILE
+{
+    FILE* f;            /* stream of the log file */
+    bool readElem;      /* true when the last lookup found a matching record */
+    double t1, t2, t3;  /* the three times of the last matching record */
+    double tall;        /* the fourth, overall time of that record */
+}LOG_FILE;
+
+/*
+ * Counter and time statistics for the test log.
+ * NOTE(review): no user of this type is visible in this part of the file.
+ */
+typedef struct LOG_STAT
+{
+    int count;
+    double minTime;
+    double maxTime;
+    double midleTime;
+}LOG_STAT;
+
+/*
+ * Open fileName for reading and appending and store the stream in lf->f.
+ * lf->f is NULL when the file cannot be opened; no error is reported.
+ */
+void
+openLogFile(LOG_FILE* lf, char* fileName)
+{
+    lf->f = fopen(fileName, "a+");
+}
+/*
+ * Close the log file if it is open and mark it as closed.
+ * A log file that could not be opened (lf->f == NULL) is ignored.
+ */
+void
+closeLogFile(LOG_FILE* lf)
+{
+    if (lf->f != NULL) {
+        fclose(lf->f);
+        lf->f = NULL;
+    }
+}
+/*
+ * Read the next record of the log file and compare its key with the
+ * subproblem dimensions sd[0], sd[1] and the vector length vecLen.
+ *
+ * Returns true and stores the four times of the record in lf when the key
+ * matches. Returns false when the key differs, at end of file or when the
+ * record is malformed; in the last case the rest of the line is skipped
+ * so that repeated calls always advance through the file.
+ */
+bool
+readElemLogFile(LOG_FILE* lf, SubproblemDim* sd, unsigned int vecLen)
+{
+    unsigned int l0x, l0y, l0w, l1x, l1y, l1w, vl;
+    double t1, t2, t3, tall;
+    int fields;
+
+    fields = fscanf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n",
+           &l0x, &l0y, &l0w, &l1x, &l1y, &l1w, &vl, &t1, &t2, &t3, &tall);
+    if (fields != 11) {
+        if (fields != EOF) {
+            int c;
+
+            /* drop what is left of the malformed line */
+            do {
+                c = fgetc(lf->f);
+            } while (c != '\n' && c != EOF);
+        }
+        return false;
+    }
+
+    if ( l0x == sd[0].x && l0y == sd[0].y && l0w == sd[0].bwidth
+       &&  l1x == sd[1].x && l1y == sd[1].y && l1w == sd[1].bwidth
+       && vl == vecLen) {
+        lf->t1 = t1;
+        lf->t2 = t2;
+        lf->t3 = t3;
+        lf->tall = tall;
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Look up the record for the subproblem dimensions sd and vector length
+ * vecLen in the log file.
+ *
+ * The record at the current file position is tried first; when it does
+ * not match, the file is scanned from the beginning. lf->readElem tells
+ * whether a record was found. Returns the third time of the record, or 0
+ * when there is none.
+ */
+double
+readLogFile(LOG_FILE* lf, SubproblemDim* sd, unsigned  int vecLen)
+{
+    lf->t3 = 0;
+
+    lf->readElem = readElemLogFile(lf, sd, vecLen);
+    if (lf->readElem) {
+        return lf->t3;
+    }
+
+    rewind(lf->f);
+    for (;;) {
+        if (lf->readElem || feof(lf->f)) {
+            break;
+        }
+        lf->readElem = readElemLogFile(lf, sd, vecLen);
+    }
+    return lf->t3;
+}
+
+/*
+ * Append a record for the subproblem dimensions sd and vector length
+ * vecLen with the times time[0..2] and timeAll to the log file, unless
+ * the last lookup already found a record (lf->readElem).
+ *
+ * Returns lf->t3 in both cases.
+ */
+double
+saveLogFile(LOG_FILE* lf,
+        SubproblemDim* sd,
+        unsigned int vecLen,
+        double* time,
+        double timeAll)
+{
+    if (lf->readElem) {
+        return lf->t3;
+    }
+
+    fprintf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n",
+            (unsigned int)sd[0].x,
+            (unsigned int)sd[0].y,
+            (unsigned int)sd[0].bwidth,
+            (unsigned int)sd[1].x,
+            (unsigned int)sd[1].y,
+            (unsigned int)sd[1].bwidth,
+            vecLen,
+            time[0], time[1], time[2], timeAll);
+
+    return lf->t3;
+}
+
+/*
+ * Scan the whole log file and fold every well-formed record into lf:
+ * t1, t2 and t3 become the minimum of their current value and the values
+ * in the file, tall becomes the sum of the overall times in the file.
+ *
+ * lf->t1, lf->t2 and lf->t3 are not reset here, so their values on entry
+ * take part in the minimum. Malformed lines are skipped.
+ */
+void
+getBestVariant(LOG_FILE* lf)
+{
+    rewind(lf->f);
+    lf->tall = 0;
+    for (;;) {
+        unsigned int l0x, l0y, l0w, l1x, l1y, l1w, vl;
+        double t1, t2, t3, tall;
+        int fields;
+
+        fields = fscanf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n",
+               &l0x, &l0y, &l0w, &l1x, &l1y, &l1w, &vl, &t1, &t2, &t3, &tall);
+        if (fields == EOF) {
+            break;
+        }
+        if (fields != 11) {
+            int c;
+
+            /* drop what is left of the malformed line and go on */
+            do {
+                c = fgetc(lf->f);
+            } while (c != '\n' && c != EOF);
+            continue;
+        }
+
+        lf->t1 = fmin(t1, lf->t1);
+        lf->t2 = fmin(t2, lf->t2);
+        lf->t3 = fmin(t3, lf->t3);
+        lf->tall += tall;
+    }
+}
+
+#endif
+
+/*
+ * Search the best parameter set of one pattern for the flags and data type
+ * described by bExtra.
+ *
+ * Every candidate produced by nextSubdim() is turned into a GParam, its
+ * kernels are generated once and run for each of the bExtra->numParam
+ * problem sizes. When a candidate beats the time stored in
+ * bExtra->param[nDim], that entry is updated and bestParam[nDim] is made
+ * to reference the candidate; the reference to the previous best one is
+ * dropped at the end of the step.
+ *
+ * When TEST_LOG is defined, measured times are additionally cached in and
+ * read back from a per-configuration log file and a summary is printed.
+ *
+ * NOTE(review): bExtra is dereferenced unconditionally (for example in the
+ * loop conditions), so the "if (bExtra)" tests below are always true here;
+ * parCur would be used uninitialized if one of them could fail.
+ */
+static void
+findBestParams(
+    MemoryPattern *pattern,
+    unsigned int func,
+    unsigned int patt,
+    bool isEnvPattSelected,
+    BlasExtraInfo* bExtra,
+    GParam*     bestParam[DIMARRAYCOUNT])
+{
+    unsigned int nDim;
+    SolutionStep step;
+    MatrixInfo mi [DIMARRAYCOUNT];
+    //cl_kernel kernel_old[MAX_CLBLAS_KERNELS_PER_STEP];
+    double time[DIMARRAYCOUNT];
+    CLBLASKernExtra extra;
+    SubDimInfo sdi;
+    void*  imgA  = NULL;
+    cl_mem clImgA = NULL;
+    void*  imgB = NULL;
+    cl_mem clImgB = NULL;
+    int curStep;
+    unsigned int dimension;
+
+#ifdef TEST_LOG
+    LOG_FILE lf;
+    double all_time = 0;
+    double step_time;
+    char str[1000];
+#endif
+
+    memset(time, 0, sizeof(time));
+    initCLBLASExtra(&extra, bExtra);
+
+#ifdef TEST_LOG
+    sprintf(str, "test_%d_%d_%d_%d.log",func, patt, extra.dtype, extra.flags);
+    openLogFile(&lf,str);
+#endif
+
+    // create images
+    if (patternUseImages(pattern)) {
+        cl_int status;
+        // Init Image
+        status = createSCImage(&imgA, &clImgA);
+        checkErrorFunc("createSCImage", status);
+        status = createSCImage(&imgB, &clImgB);
+        checkErrorFunc("createSCImage", status);
+    }
+
+    initSubDimInfo(&sdi, pattern, &genInfo.deviceInfos, func, patt,
+                    extra.dtype, extra.flags);
+
+    initMatrixInfo(mi,  extra.dtype, &genInfo.deviceInfos, bExtra);
+    resetSubdim(&sdi);
+
+    curStep = 0;
+    /* the time of the last problem size is passed on to nextSubdim() */
+    while (nextSubdim(&sdi, genInfo.aMaxparam, time[bExtra->numParam - 1]))
+    {
+        GParam* parCur;
+        GParam* lastbest[DIMARRAYCOUNT];
+        bool isKernelValid;
+        if (bExtra) {
+            parCur   = createParCur(&sdi);
+        }
+
+        globalDim++;
+        curStep++;
+        logParamName(parCur, curStep, sdi.varCount);
+
+#ifdef TEST_LOG
+        step_time = getCurrentTime();
+        time[DIMARRAY_BIG] = readLogFile(&lf, sdi.sdim, sdi.vecLen);
+        if (!lf.readElem) {
+#endif
+
+#ifdef TEST_LOG
+        }
+        else {
+            time[DIMARRAY_SMALL] = lf.t1;
+            time[DIMARRAY_MIDDLE] = lf.t2;
+            time[DIMARRAY_BIG] = lf.t3;
+            step_time = lf.tall;
+        }
+#endif
+
+        /* no previous best parameter set has been replaced yet */
+        for (nDim = 0; nDim < bExtra->numParam; nDim++) {
+             if (bExtra){
+                lastbest[nDim] = NULL;
+             }
+        }
+
+        /* the kernels are generated at most once per candidate */
+        isKernelValid = 0;
+        for (nDim = 0; nDim < bExtra->numParam; nDim++) {
+
+            BlasParamInfo* bParam;
+            // can current combination of flags be handled by selected pattern
+            bool isProbSupported = false;
+
+            dimension = getDimension(nDim, extra.dtype,
+                                     &genInfo.deviceInfos,
+                                     bExtra->parent->parent->funcNo);
+
+            // setup kernel arguments
+            if (patternUseImages(pattern)) {
+                // Init Image
+                mi[nDim].imgA = imgA;
+                mi[nDim].clImgA =clImgA;
+                mi[nDim].imgB = imgB;
+                mi[nDim].clImgB =clImgB;
+            }
+
+            // Incorrect subdimension for a given size of the matrix
+            if ( dimension < sdi.sdim[0].x ||
+                 dimension % sdi.sdim[0].x != 0 ||
+                 dimension < sdi.sdim[0].y ||
+                 dimension % sdi.sdim[0].y != 0 ||
+                 dimension < sdi.sdim[0].bwidth ||
+                 dimension % sdi.sdim[0].bwidth != 0
+                 ) {
+
+                releaseMemObjOne(mi + nDim);
+                if (genInfo.aExtendedOutput) {
+                    fprintf(logStream, "        ");
+                }
+                // write dummy data
+                time[nDim] = -1;
+                continue;
+            }
+
+            /* only these step fields and step.args are set before use */
+            step.extraFlags = extra.flags;
+            step.funcID = func;
+            initCLBlasKArgDim( &step.args, mi + nDim, extra.flags );
+
+            // assuming that all
+            // "old-fashioned" patterns, providing no performance estimation
+            // function can handle any set of arguments/flags
+            if ( NULL == pattern->sops->getPatternPerf ||
+                pattern->sops->getPatternPerf( step.extraFlags,
+                (void*)&step.args ) >= 0 ) {
+
+                isProbSupported = true;
+            }
+            else {
+                isProbSupported = false;
+            }
+
+            // if current flags and dimensions are not optimal for current
+            // pattern - skip building and running kernel.
+            // But if the pattern is selected by environment
+            // and can handle current problem - tune it anyway.
+            if ( (patt != selectPattern( &step, 0 ) &&
+                 (!isEnvPattSelected || !isProbSupported)) ) {
+
+                releaseMemObjOne(mi + nDim);
+
+                // write dummy data
+                time[nDim] = -1;
+                bestParam[nDim] = NULL;
+                continue;
+            }
+
+            if ( 0 == isKernelValid ) {
+                isKernelValid = genAllKernel(
+                    &step.args,
+                    extra,
+                    parCur,
+                    pattern,
+                    func,
+                    patt);
+
+                logKernalGen();
+            }
+
+            /* generation failed: give up on this candidate */
+            if ( 0 == isKernelValid ) {
+
+                releaseMemObjOne(mi + nDim);
+                logError();
+                break;
+            }
+            bParam = &(bExtra->param[nDim]);
+
+#ifdef TEST_LOG
+            if (!lf.readElem) {
+#endif
+                time[nDim] = runAllKernel(pattern, &step.args, parCur,
+                                          func, bParam->time);
+                releaseMemObjOne(mi + nDim);
+#ifdef TEST_LOG
+            }
+#endif
+            logTime(time[nDim]);
+            /* a new best time: record it and take a reference to parCur */
+            if (bParam->time > time[nDim]) {
+                if (bExtra) {
+                    BlasParamInfo* bParamNT = &(bExtra->param[nDim]);
+                    setParam(bParamNT, time[nDim], parCur);
+                    lastbest[nDim] = bestParam[nDim];
+                    bestParam[nDim] = parCur;
+                    parCur->count++;
+                }
+            }
+        }
+        /* drop the references to the parameter sets that were replaced */
+        for (nDim = 0; nDim < bExtra->numParam; nDim++) {
+            if (bExtra) {
+                delGParam(lastbest[nDim]);
+                lastbest[nDim] = NULL;
+            }
+        }
+
+#ifdef TEST_LOG
+        step_time = ((double)(getCurrentTime()) - step_time)/1000000;
+        saveLogFile(&lf, sdi.sdim, sdi.vecLen, time, step_time);
+        if (lf.readElem) {
+            step_time = lf.tall;
+        }
+        logTime(step_time);
+        all_time += step_time;
+#endif
+        logEndString();
+        releaseMemObjAll(mi, bExtra);
+        /* drop the reference taken by createParCur() */
+        if (bExtra) {
+            delGParam(parCur);
+        }
+    }
+
+#ifdef TEST_LOG
+    // Show log
+    resetSubdim(&sdi);
+    double t;
+    double all = 0;
+    int count = 0;
+
+    time[DIMARRAY_SMALL] = 5000.0;
+    time[DIMARRAY_MIDDLE] = 5000.0;
+    time[DIMARRAY_BIG] = 5000.0;
+
+    sdi.returnAll = true;
+    do
+    {
+        t = readLogFile(&lf, sdi.sdim, sdi.vecLen);
+        if (lf.readElem) {
+            time[DIMARRAY_SMALL] = fmin(lf.t1, time[DIMARRAY_SMALL]);
+            time[DIMARRAY_MIDDLE] = fmin(lf.t2, time[DIMARRAY_MIDDLE]);
+            time[DIMARRAY_BIG] = fmin(lf.t3, time[DIMARRAY_BIG]);
+            all+= lf.tall;
+            count++;
+        }
+        else {
+            printf ("^");
+        }
+
+    } while (nextSubdim(&sdi, genInfo.aMaxparam, t));
+
+#ifdef TEST_LOG
+    getBestVariant(&lf);
+#endif
+
+    /* the values computed by getBestVariant() are overwritten here */
+    lf.t1 = time[DIMARRAY_SMALL];
+    lf.t2 = time[DIMARRAY_MIDDLE];
+    lf.t3 = time[DIMARRAY_BIG];
+    lf.tall = all;
+
+    fprintf(logStream, "---------------------------------------------------\n");
+    fprintf(logStream, "            steps  time1  time2  time3     AllTime \n");
+    int tmin = (int)(lf.tall/1000/60);
+    int tsec = (int)(lf.tall/1000) - tmin*60;
+    fprintf(logStream, " --> Best %5d %7.2lf %7.2lf %7.2lf    %2d:%2d  \n",
+            count, lf.t1, lf.t2, lf.t3, tmin, tsec);
+    tmin = (int)(all_time/1000/60);
+    tsec = (int)(all_time/1000) - tmin*60;
+    fprintf(logStream, " --> Fast %5d %7.2lf %7.2lf %7.2lf    %2d:%2d\n",
+            curStep,
+            bExtra->param[DIMARRAY_SMALL].time,
+            bExtra->param[DIMARRAY_MIDDLE].time,
+            bExtra->param[DIMARRAY_BIG].time,
+            tmin,tsec);
+
+    globalFastTime += all_time;
+    globalTime += lf.tall;
+
+    closeLogFile(&lf);
+#endif
+
+
+
+    logEndString();
+     // Release image
+     releaseSCImage(&imgA, &clImgA);
+     releaseSCImage(&imgB, &clImgB);
+
+     destroyMatrixInfo(mi, bExtra);
+}
+
+/*
+ * Check the stored tuning data of a pattern for one data type, flag set
+ * and problem size, and log the outcome.
+ *
+ * When the granularity information is found, the kernel information is
+ * queried as well and the result is logged with logCheck(); oldt is the
+ * reference time passed on to it. Otherwise logCheckError() is called.
+ *
+ * Returns the stored time, or 0 when no granularity information was
+ * found.
+ */
+double
+checkData(
+          TargetDevice* devID,
+          const MemoryPattern * pattern,
+          DataType dtype,
+          KernelExtraFlags flags,
+          int dim,
+          double oldt)
+{
+    SubproblemDim sdim[MAX_SUBDIMS];
+    PGranularity  pgran;
+    /* defined result even when nothing is stored for this configuration */
+    double time = 0.0;
+    int i;
+    unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP];
+    size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP];
+
+    int status;
+
+    for (i = 0; i < MAX_CLBLAS_KERNELS_PER_STEP; ++i) {
+        buffer[i] = NULL;
+        sizeBuffer[i] = 0;
+    }
+
+    status = getGranularityInfo(devID, pattern->name, dtype, flags, dim, sdim,
+            &pgran, &time);
+
+    if (status == GF_SUCCESS) {
+        /* the result is judged by whether a first kernel was returned */
+        status = getKernelInfo(devID, pattern->name, dtype, flags, dim, buffer,
+                sizeBuffer);
+
+        logCheck(dim, sdim, &pgran, time, oldt, buffer[0] != NULL);
+    }
+    else {
+        logCheckError(dim);
+    }
+
+    /* release every buffer slot, however many the array has */
+    for (i = 0; i < MAX_CLBLAS_KERNELS_PER_STEP; ++i) {
+        free(buffer[i]);
+    }
+    return time;
+}
+
+/*
+ * Print, for each of the four data types, the DIMARRAYCOUNT problem sizes
+ * returned by getDimension() for the function bFunc. The table goes to
+ * standard output and is printed only when extended output is enabled.
+ */
+void
+logDimension(BlasFunctionInfo* bFunc)
+{
+    static const struct {
+        const char *label;
+        DataType type;
+    } rows[] = {
+        { "FLOAT           ", TYPE_FLOAT },
+        { "DOUBLE          ", TYPE_DOUBLE },
+        { "COMPLEX FLOAT   ", TYPE_COMPLEX_FLOAT },
+        { "COMPLEX DOUBLE  ", TYPE_COMPLEX_DOUBLE }
+    };
+    int func = bFunc->funcNo;
+    size_t row;
+    int i;
+
+    if (!genInfo.aExtendedOutput) {
+        return;
+    }
+
+    for (row = 0; row < sizeof(rows) / sizeof(rows[0]); row++) {
+        printf("%s", rows[row].label);
+        for (i = 0; i < DIMARRAYCOUNT; ++i) {
+            printf(" %6u",
+                   getDimension(i, rows[row].type, &genInfo.deviceInfos, func));
+        }
+        printf("\n");
+    }
+}
+
+/*
+ * Store in genInfo.last and genInfo.next the percentages that 'index' and
+ * 'index + 1' represent out of 'indexCount'.
+ */
+void
+calcExtraCount(int index, int indexCount)
+{
+    const double total = indexCount;
+    const double current = (double)index;
+    const double following = (double)(index + 1);
+
+    genInfo.last = (current / total) * 100;
+    genInfo.next = (following / total) * 100;
+}
+
+/* Return 1 when every bit of 'flag' is set in info->flags, 0 otherwise. */
+int
+isFlag(BlasExtraInfo* info, KernelExtraFlags flag)
+{
+    if ((info->flags & flag) != flag) {
+        return 0;
+    }
+    return 1;
+}
+
+/* Return 1 when no bit of 'flag' is set in info->flags, 0 otherwise. */
+int
+isNoFlag(BlasExtraInfo* info, KernelExtraFlags flag)
+{
+    if ((info->flags & flag) != 0) {
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Decide whether the flag set / data type described by 'info' must be left
+ * out of tuning for function 'func' and pattern 'patt'.
+ *
+ * Returns true (skip) when any of the following holds:
+ *  - skipSlowPatt is set and selectPattern() does not pick 'patt' for any of
+ *    the DIMARRAYCOUNT problem sizes returned by getDimension();
+ *  - the function is SYMV and the type is complex (float or double);
+ *  - on Windows, the type is complex double and the function is TRSM, SYRK
+ *    or SYR2K (workaround: tuning these was reported to cause a BSoD);
+ *    NOTE(review): the previous comment named TRMM while the code tests
+ *    CLBLAS_TRSM - confirm which one is meant;
+ *  - info->isUseForTunning is not set.
+ *
+ * When skipSlowPatt is false the pattern-selection test is not performed.
+ */
+bool
+skipFlags(BlasExtraInfo* info,
+    int patt,
+    int func,
+    DeviceInfo* pDI,
+    bool skipSlowPatt)
+{
+    SolutionStep step;
+    bool isSelected = false;
+    bool skip;
+    int dimIdx;
+
+    memset(&step, 0, sizeof(step));
+    step.funcID = func;
+    step.patternID = patt;
+    step.extraFlags = info->flags;
+
+    if (skipSlowPatt) {
+        /* Probe every tuning size with a square M = N = K problem. */
+        for (dimIdx = 0; dimIdx < DIMARRAYCOUNT; dimIdx++) {
+            step.args.M = getDimension(dimIdx, info->dtype, pDI, func);
+            step.args.N = getDimension(dimIdx, info->dtype, pDI, func);
+            step.args.K = getDimension(dimIdx, info->dtype, pDI, func);
+
+            if (selectPattern(&step, 0) == (unsigned int)patt) {
+                isSelected = true;
+            }
+        }
+
+        if (!isSelected) {
+            return true;
+        }
+    }
+
+    skip = (func == CLBLAS_SYMV) &&
+           (info->dtype == TYPE_COMPLEX_FLOAT ||
+            info->dtype == TYPE_COMPLEX_DOUBLE);
+
+#if defined(_WIN32)
+    if (info->dtype == TYPE_COMPLEX_DOUBLE &&
+        (func == CLBLAS_TRSM || func == CLBLAS_SYRK ||
+         func == CLBLAS_SYR2K)) {
+        skip = true;
+    }
+#endif
+    if (!info->isUseForTunning) {
+        skip = true;
+    }
+    return skip;
+}
+
+/*
+ * Tell whether the selection stored in genInfo excludes this combination
+ * of function, pattern, data type and flags.
+ *
+ * Returns false only when the function and data type are enabled in
+ * genInfo.aFunc / genInfo.aDType and both genInfo.aPattern and
+ * genInfo.aFlag are either -1 or equal to the given pattern / flags.
+ */
+bool
+isFilter(BlasExtraInfo* info, int patt, int func)
+{
+    const int typeIdx = (int)info->dtype;
+    const int flagBits = (int)info->flags;
+
+    if (!genInfo.aFunc[func] || !genInfo.aDType[typeIdx]) {
+        return true;
+    }
+    if (genInfo.aPattern != -1 && genInfo.aPattern != patt) {
+        return true;
+    }
+    if (genInfo.aFlag != -1 && genInfo.aFlag != flagBits) {
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Add 1e50 to the stored time of every parameter entry of 'bExtra'.
+ * NOTE(review): presumably this marks the entries as "not measured yet" so
+ * that any real timing compares as faster - confirm, and confirm that
+ * accumulating (+=) rather than assigning is intended.
+ *
+ * Does nothing when bExtra is NULL.
+ */
+void
+initParamsTime(BlasExtraInfo* bExtra)
+{
+    unsigned int nDim;
+
+    /* Test the pointer before the loop condition dereferences it. */
+    if (bExtra == NULL) {
+        return;
+    }
+
+    for (nDim = 0; nDim < bExtra->numParam; nDim++) {
+        bExtra->param[nDim].time += 1e50;
+    }
+}
+
+/*
+ * Store the parameter entries of 'bExtra' for which a best parameter was
+ * found.
+ *
+ * For every slot whose bestParam entry is non-NULL, bExtra->param[slot] is
+ * passed to saveBestParam() together with genInfo.targetDevice. bestParam
+ * is only used to tell which slots to store.
+ *
+ * Does nothing when bExtra is NULL.
+ */
+void
+saveBestParams(
+    BlasExtraInfo* bExtra,
+    GParam*  bestParam[DIMARRAYCOUNT])
+{
+    unsigned int nDim;
+
+    /* Test the pointer before the loop condition dereferences it. */
+    if (bExtra == NULL) {
+        return;
+    }
+
+    for (nDim = 0; nDim < bExtra->numParam; nDim++) {
+        if (bestParam[nDim] != NULL) {
+            saveBestParam(&genInfo.targetDevice, &bExtra->param[nDim]);
+        }
+    }
+}
+
+/*
+ * Call delGParam() on the first bExtra->numParam entries of bestParam.
+ * Entries are passed on as they are, NULL ones included, and the array
+ * itself is left unchanged.
+ *
+ * Does nothing when bExtra is NULL.
+ */
+void
+deleteGParams (BlasExtraInfo* bExtra, GParam*  bestParam[DIMARRAYCOUNT])
+{
+    unsigned int nDim;
+
+    /* Test the pointer before the loop condition dereferences it. */
+    if (bExtra == NULL) {
+        return;
+    }
+
+    for (nDim = 0; nDim < bExtra->numParam; nDim++) {
+        delGParam(bestParam[nDim]);
+    }
+}
+
+/*
+ * Run checkData() for every parameter slot of 'bExtra' and terminate each
+ * log line with logEndString().
+ *
+ * The problem size of a slot comes from getDimension(); the slot with index
+ * BANK_ALIGNED_CASE_RECORD_IDX is checked with dimension 0 instead.
+ *
+ * Does nothing when bExtra is NULL.
+ */
+void
+checkDatas(BlasExtraInfo* bExtra, const MemoryPattern* pattern)
+{
+    unsigned int nDim;
+    int func;
+
+    /* Test the pointer before anything dereferences it. */
+    if (bExtra == NULL) {
+        return;
+    }
+    func = bExtra->parent->parent->funcNo;
+
+    for (nDim = 0; nDim < bExtra->numParam; nDim++) {
+        unsigned int dimension = getDimension(nDim, bExtra->dtype,
+                                              &genInfo.deviceInfos, func);
+
+        if (nDim == BANK_ALIGNED_CASE_RECORD_IDX) {
+            dimension = 0;
+        }
+        /* No earlier time is available here, hence 0 as reference. */
+        (void)checkData(&genInfo.targetDevice, pattern, bExtra->dtype,
+                        bExtra->flags, dimension, 0.0);
+        logEndString();
+    }
+}
+
+/*
+ * Reuse the parameters tuned for 'bExtra' for sibling flag sets of the same
+ * pattern.
+ *
+ * A sibling (another entry of bExtra->parent->extra) qualifies when it has
+ * the same data type, its flags differ from bExtra->flags, and its flags
+ * masked with the function's maskForTuningsKernel equal bExtra->flags.
+ *
+ * For each qualifying sibling the non-NULL entries of bestParam are copied
+ * with createParCur2(); if genInfo.aIsKernel is set and a copy has no
+ * kernel, one is generated with genAllKernel(). The copies are stored with
+ * saveBestParams() and finally released with deleteGParams().
+ */
+void
+generateKernelForOthersFlag( BlasExtraInfo* bExtra,
+    GParam*  bestParam[DIMARRAYCOUNT],
+    MemoryPattern* pattern)
+{
+    BlasPatternInfo*  pattInfo = bExtra->parent;
+    BlasFunctionInfo*  funcInfo = pattInfo->parent;
+    GParam*  copies[DIMARRAYCOUNT];
+    CLBLASKernExtra extra;
+    CLBlasKargs args;
+    unsigned int extraIdx;
+
+    memset( copies, 0, sizeof(copies) );
+
+    for (extraIdx = 0; extraIdx < pattInfo->numExtra; ++extraIdx) {
+        BlasExtraInfo* other = &(pattInfo->extra[extraIdx]);
+        unsigned int masked = other->flags & funcInfo->maskForTuningsKernel;
+        unsigned int slot;
+
+        if (bExtra->dtype != other->dtype) {
+            continue;
+        }
+        if (masked != bExtra->flags || other->flags == bExtra->flags) {
+            continue;
+        }
+
+        /* First pass: copy every tuned entry for the sibling. */
+        for (slot = 0; slot < bExtra->numParam; slot++) {
+            if (bestParam[slot] != NULL) {
+                copies[slot] = createParCur2(bestParam[slot]);
+            }
+        }
+
+        /* Second pass: share copies, generate kernels, store results. */
+        for (slot = 0; slot < bExtra->numParam; slot++) {
+            unsigned int earlier;
+
+            if (bestParam[slot] == NULL) {
+                continue;
+            }
+
+            /*
+             * Slots that point at the same source object are made to point
+             * at the same copy, whose count is incremented per extra user.
+             * NOTE(review): the copy created for this slot in the first
+             * pass is overwritten here without being released - confirm
+             * whether that leaks.
+             */
+            for (earlier = 0; earlier < slot; ++earlier) {
+                if (bestParam[slot] == bestParam[earlier]) {
+                    copies[slot] = copies[earlier];
+                    copies[slot]->count++;
+                }
+            }
+
+            if (genInfo.aIsKernel && copies[slot]->kernel == NULL) {
+                unsigned int func = funcInfo->funcNo;
+                unsigned int patt = pattInfo->pattNo;
+
+                /*
+                 * NOTE(review): 'extra' is built from bExtra, not from the
+                 * sibling, and 'args' is handed over without being
+                 * initialized here - confirm both are intended.
+                 */
+                initCLBLASExtra(&extra, bExtra);
+                genAllKernel(&args, extra, copies[slot],
+                             pattern, func, patt);
+                logKernalGen();
+            }
+            /* Stored after every processed slot, not once per sibling. */
+            saveBestParams(other, copies);
+        }
+        deleteGParams(other, copies);
+    }
+}
+
+/*
+ * Return the address of entry 'pid' in the pattInfo array of function 'fid',
+ * as reported by getBlasFunctionInfo() for genInfo.targetDevice.
+ * 'pid' is not range-checked.
+ */
+BlasPatternInfo*
+getPattern(BlasFunctionID fid, int pid)
+{
+    return getBlasFunctionInfo(&genInfo.targetDevice, fid)->pattInfo + pid;
+}
+
+void
+configurePattern(void)
+{
+    // Hook for handler-specific pattern initialization; currently a no-op.
+    // Template: getPattern(CLBLAS_XXXX, 0)->isPGValid = ;
+    // Template: getPattern(CLBLAS_XXXX, 0)->initSubdim = ;
+}
+
+/*
+ * Tell whether the parameters of 'bExtra' have to be tuned again.
+ *
+ * Returns true when genInfo.aCommand is not C_DEFAULT or when at least one
+ * parameter entry has sstatus SS_NOLOAD. As a side effect a line of five
+ * asterisks is printed for every entry whose offset is 0.
+ */
+bool
+isRebuild(BlasExtraInfo* bExtra)
+{
+    bool rebuild = (genInfo.aCommand != C_DEFAULT);
+    unsigned int idx;
+
+    for (idx = 0; idx < bExtra->numParam; ++idx) {
+        const BlasParamInfo* entry = &bExtra->param[idx];
+
+        if (entry->sstatus == SS_NOLOAD) {
+            rebuild = true;
+        }
+        if (entry->offset == 0) {
+            printf("*****\n");
+        }
+    }
+    return rebuild;
+}
+
+
+/*
+ * Read the pattern number forced for a function through its environment
+ * variable (funcInfo->envImplementation).
+ *
+ * Returns true and stores the number in *pattId only when the variable is
+ * set to a complete decimal number smaller than nrPatterns. In every other
+ * case (no variable name, variable unset or empty, trailing characters,
+ * value out of range) it returns false and leaves *pattId untouched.
+ */
+static bool
+getEnvPattern(
+    const BlasFunctionInfo* funcInfo,
+    unsigned int nrPatterns,
+    unsigned int* pattId)
+{
+    const char* envImpl;
+    char* pRest = NULL;
+    unsigned long value;
+
+    if (funcInfo->envImplementation == NULL) {
+        return false;
+    }
+    envImpl = getenv(funcInfo->envImplementation);
+    if (envImpl == NULL || envImpl[0] == '\0') {
+        return false;
+    }
+
+    value = strtoul(envImpl, &pRest, 10);
+    if (*pRest != '\0') {
+        /* wrong value of env. variable AMD_CLBLAS_X_IMPLEMENTATION */
+        return false;
+    }
+    if (value >= nrPatterns) {
+        /* would index past the pattern tables of this function */
+        return false;
+    }
+
+    *pattId = (unsigned int)value;
+    return true;
+}
+
+/*
+ * Tune every selected BLAS function on every device.
+ *
+ * For each device: initialize it, write the storage cache and configure the
+ * patterns. For each function: walk all its patterns, or only the one
+ * forced through the environment. For each flag set of a pattern that is
+ * neither skipped (skipFlags) nor filtered out (isFilter): search and store
+ * the best parameters when isRebuild() asks for it, then log the stored
+ * data with checkDatas(). destroyGenInfo() is called at the end.
+ */
+void
+createFile(void)
+{
+    unsigned int funcId;
+    unsigned int dev;
+
+    initOpenCl();
+    // For each devices
+    for (dev = 0; dev < genInfo.numDevices; dev++) {
+        initDevice(dev);
+        writeStorageCache(&genInfo.targetDevice);
+        getContext();
+        configurePattern();
+
+        // for each function
+        for (funcId = 0; funcId < BLAS_FUNCTIONS_NUMBER; funcId++) {
+            BlasFunctionInfo *funcInfo = getBlasFunctionInfo(
+                &genInfo.targetDevice,
+                funcId );
+            const unsigned int nrPatterns = clblasSolvers[funcId].nrPatterns;
+            // start from pattern 0 unless the environment selects one;
+            // both values are recomputed for every function so that a
+            // selection made for one function never carries over to the next
+            unsigned int pattId = 0;
+            const bool isEnvPattSelected =
+                getEnvPattern(funcInfo, nrPatterns, &pattId);
+
+            do
+            {
+                unsigned int nExtra;
+                unsigned int nTuneExtra = 0;
+                BlasPatternInfo* bPatt = &(funcInfo->pattInfo[pattId]);
+                MemoryPattern* pattern = &(funcInfo->pattern[pattId]);
+
+                //if select a new trsm memory pattern (#3), then skip it
+                if ( funcId == CLBLAS_TRSM && pattId == 3) {
+                    pattId++;
+                    continue;
+                }
+
+                for (nExtra = 0; nExtra < bPatt->numExtra; ++nExtra) {
+                    BlasExtraInfo* bExtra = &(bPatt->extra[nExtra]);
+
+                    genInfo.last = 0;
+
+                    // a pattern forced by the environment is tuned even if
+                    // it is considered slow
+                    if ( skipFlags(bExtra,
+                            pattId,
+                            funcId,
+                            &genInfo.deviceInfos,
+                            !isEnvPattSelected ) ) {
+                        continue;
+                    }
+                    if (isFilter(bExtra, pattId, funcId)) {
+                        continue;
+                    }
+
+                    logPattern( funcInfo->name );
+
+                    calcExtraCount(nTuneExtra, bPatt->numTuneExtra);
+                    nTuneExtra++;
+
+                    logDimension(funcInfo);
+                    logExtra(bExtra);
+
+                    if (isRebuild(bExtra)) {
+                        GParam* bestParam[DIMARRAYCOUNT];
+
+                        memset(bestParam, 0, sizeof(bestParam));
+
+                        initParamsTime(bExtra);
+
+                        findBestParams( pattern,
+                            funcId,
+                            pattId,
+                            isEnvPattSelected,
+                            bExtra,
+                            bestParam);
+
+                        saveBestParams(bExtra, bestParam);
+
+                        generateKernelForOthersFlag( bExtra,
+                            bestParam,
+                            pattern);
+
+                        deleteGParams(bExtra, bestParam);
+                    }
+                    checkDatas(bExtra, pattern);
+                } /* extra */
+
+                pattId++;
+            /* patt */
+            }while( !isEnvPattSelected && pattId < nrPatterns );
+
+        } /* func */
+    } /* dev */
+    destroyGenInfo();
+}
+
+/*
+ * Parse the command line into genInfo.
+ *
+ * Every argument is compared with the option table below; the index of the
+ * matching entry selects the action. Unknown arguments are reported on
+ * stdout and otherwise ignored. "--help" prints the usage text and exits.
+ *
+ * genInfo.aAll starts as true and is cleared by "--fast". If no function
+ * option was given, all functions are enabled; likewise for data types.
+ *
+ * With _EXTENDED_TUNE_ARG defined, "--e" and "--max" consume the following
+ * argument as an integer value; if it is missing, a message is printed
+ * and the option is ignored.
+ */
+void
+parseArg(int argc, char*  argv[])
+{
+
+    static const char* const help =
+                        "clblasTune - automatically tune the clblas "
+                        "library for specific hardware.\n"
+                        "\n"
+                        "clblas function related parameters:\n"
+                        "   --gemm\n"
+                        "       Tune kernels for the GEMM function family.\n"
+                        "   --trmm\n"
+                        "       Tune kernels for the TRMM function family.\n"
+                        "   --trsm\n"
+                        "       Tune kernels for the TRSM function family.\n"
+                        "   --gemv\n"
+                        "       Tune kernels for the GEMV function family.\n"
+                        "   --symv\n"
+                        "       Tune kernels for the SYMV function family.\n"
+                        "   --syrk\n"
+                        "       Tune kernels for the SYRK function family.\n"
+                        "   --syr2k\n"
+                        "       Tune kernels for the SYR2K function family.\n"
+                        "\n"
+                        "   You can specify the parameters of "
+                        "several alternatives simultaneously.\n"
+                        "\n"
+                        "   If any of these parameters is not specified the "
+                        "tool tries kernels for all the functions.\n"
+                        "\n"
+                        " Used data types:\n"
+                        "   --float\n"
+                        "       Single precision version of functions.\n"
+                        "   --double\n"
+                        "       Double precision version of functions.\n"
+                        "   --complex\n"
+                        "       Single complex precision version of functions.\n"
+                        "   --double-complex\n"
+                        "       Double complex precision version of functions.\n"
+                        "\n"
+                        "   You can specify the parameters of "
+                        "several alternatives simultaneously.\n"
+                        "\n"
+                        "   If any of these parameters is not specified the "
+                        "tool tries kernels for all the data types.\n"
+                        "\n"
+                        "Management:\n"
+                        "   --fast\n"
+                        "       Using this option allows you to accelerate "
+                        "tuning in up to 2-3 times. Achieving an optimal result "
+                        "is not guaranteed.\n"
+                        "   --rebuild\n"
+                        "       Re-tuning the fastest OpenCL kernels. Can be "
+                        "used after the driver update.\n"
+                        "   --store-kernels\n"
+                        "       Store found best kernels into a database file\n"
+                        "       WARNING! The file can be very large.\n"
+                        "\n";
+
+    /* The position of an entry is the case label used in the switch. */
+    static const char* const args[] = {
+                            "--gemm",               // 0
+                            "--trmm",               // 1
+                            "--trsm",               // 2
+                            "--buffers",            // 3
+                            "--images",             // 4
+                            "--float",              // 5
+                            "--double",             // 6
+                            "--complex",            // 7
+                            "--double-complex",     // 8
+                            "--store-kernels",      // 9
+                            "--rebuild",            // 10
+#if defined(_EXTENDED_TUNE_ARG)
+                            "--e",                  // 11
+                            "--max",                // 12
+                            "--extended-output",    // 13
+#else
+                            /* placeholders keep the following indices stable */
+                            "",                     // 11
+                            "",                     // 12
+                            "",                     // 13
+#endif
+                            "--gemv",               // 14
+                            "--symv",               // 15
+                            "--syrk",               // 16
+                            "--syr2k",              // 17
+                            "--fast",               // 18
+                            "--caches",             // 19
+                            "--help"                // 20
+                            };
+    int i;
+    unsigned int j;
+    bool isSetFunction = false;
+    bool isSetType = false;
+
+    genInfo.aAll = true;
+
+    for (i = 1; i < argc; ++i) {
+        char * arg = argv[i];
+        bool isUnknown = true;
+
+        for (j = 0; j < sizeof(args)/sizeof(args[0]); ++ j){
+            if (strcmp(arg, args[j]) != 0){
+                continue;
+            }
+            switch (j){
+                case 0 :
+                    genInfo.aFunc[CLBLAS_GEMM] = true;
+                    isSetFunction = true;
+                    break;
+                case 1 :
+                    genInfo.aFunc[CLBLAS_TRMM] = true;
+                    isSetFunction = true;
+                    break;
+                case 2 :
+                    genInfo.aFunc[CLBLAS_TRSM] = true;
+                    isSetFunction = true;
+                    break;
+                case 3 : genInfo.aPattern = 0;                  break;
+                case 4 : genInfo.aPattern = 1;                  break;
+                case 5 :
+                    genInfo.aDType[TYPE_FLOAT] = true;
+                    isSetType = true;
+                    break;
+                case 6 :
+                    genInfo.aDType[TYPE_DOUBLE] = true;
+                    isSetType = true;
+                    break;
+                case 7 :
+                    genInfo.aDType[TYPE_COMPLEX_FLOAT] = true;
+                    isSetType = true;
+                    break;
+                case 8 :
+                    genInfo.aDType[TYPE_COMPLEX_DOUBLE] = true;
+                    isSetType = true;
+                    break;
+                case 9 : genInfo.aIsKernel = true;              break;
+                case 10: genInfo.aCommand = C_REBUILD;          break;
+#if defined(_EXTENDED_TUNE_ARG)
+                case 11:
+                    /* the value is the next argument, if there is one */
+                    if (i + 1 < argc) {
+                        genInfo.aFlag = atoi(argv[++i]);
+                    }
+                    else {
+                        fprintf(stdout, "Missing value for argument %s\n", arg);
+                    }
+                    break;
+                case 12:
+                    if (i + 1 < argc) {
+                        genInfo.aMaxparam = atoi(argv[++i]);
+                    }
+                    else {
+                        fprintf(stdout, "Missing value for argument %s\n", arg);
+                    }
+                    break;
+                case 13:
+                    genInfo.aExtendedOutput = true;
+                    break;
+#endif
+                case 14:
+                    genInfo.aFunc[CLBLAS_GEMV] = true;
+                    isSetFunction = true;
+                    break;
+                case 15:
+                    genInfo.aFunc[CLBLAS_SYMV] = true;
+                    isSetFunction = true;
+                    break;
+                case 16:
+                    genInfo.aFunc[CLBLAS_SYRK] = true;
+                    isSetFunction = true;
+                    break;
+                case 17:
+                    genInfo.aFunc[CLBLAS_SYR2K] = true;
+                    isSetFunction = true;
+                    break;
+                case 18: genInfo.aAll  = false;                break;
+                case 19: genInfo.aPattern = 2;                 break;
+                case 20:
+                    printf ("%s", help);
+                    exit(0);
+                    break;
+                default:
+                    /* placeholder entry: accepted, no action */
+                    break;
+            }
+            isUnknown = false;
+            /* the option is handled; 'i' may already point at its value */
+            break;
+        }
+        if (isUnknown) {
+            fprintf(stdout, "Unknown argument %s\n", arg);
+        }
+    }
+    if (!isSetFunction) {
+        for (i=0; i < BLAS_FUNCTIONS_NUMBER; ++i) {
+            genInfo.aFunc[i] = 1;
+        }
+    }
+    if (!isSetType) {
+        for (i=0; i < TYPE_NUMBER; ++i) {
+            genInfo.aDType[i] = 1;
+        }
+    }
+}
+
+/*
+ * Entry point of the tuning tool: parse the command line, make sure the
+ * storage path environment variable is defined, initialize clblas and run
+ * the tuning pass (createFile). With TEST_LOG defined the accumulated
+ * global times are printed at the end as hours:minutes:seconds.
+ *
+ * Exits with EXIT_COD_NO_ENVIRONMENT_VARIABLE when the storage path is not
+ * defined; returns 0 otherwise.
+ */
+int
+main(int argc, char*  argv[])
+{
+    FILE_PATH = getenv(ENV_FILE_PATH);
+
+    initGeneratorInfoRec();
+    parseArg(argc, argv);
+
+    /*
+     * Checked after parseArg() so that --help works without the variable,
+     * and before clblasSetup() so the library is not set up only to exit.
+     */
+    if (!FILE_PATH){
+        printf("The environment variable 'AMD_CLBLAS_STORAGE_PATH' is not defined\n");
+        exit(EXIT_COD_NO_ENVIRONMENT_VARIABLE);
+    }
+
+    clblasSetup();
+
+    logStream = stdout;
+    createFile();
+
+#ifdef TEST_LOG
+    {
+        /* times are divided by 1000 to get seconds */
+        int h = (int)(globalTime/1000/60/60);
+        int m = (int)(globalTime/1000/60) - h*60;
+        int c = (int)(globalTime/1000) - m*60 - h*60*60;
+        fprintf(logStream, " --> All  time : %2d:%2d:%2d  \n",h, m,c);
+
+        h = (int)(globalFastTime/1000/60/60);
+        m = (int)(globalFastTime/1000/60) - h*60;
+        c = (int)(globalFastTime/1000) - m*60 - h*60*60;
+        fprintf(logStream, " --> Fast time : %2d:%2d:%2d  \n",h, m,c);
+    }
+#endif
+    return 0;
+}
+
+/*
+ * Return the name of an OpenCL device as a newly allocated string.
+ *
+ * The name length is queried first, then the name itself; both results are
+ * reported through checkErrorFunc() and the last one is left in *status.
+ *
+ * Returns NULL when a query fails, when the reported length is 0, or when
+ * the buffer cannot be allocated (*status is then CL_OUT_OF_HOST_MEMORY).
+ * The caller owns the returned buffer and releases it with free().
+ */
+char*
+getDeviceName(cl_device_id devID, int * status)
+{
+    char* devName;
+    size_t size = 0;
+
+    *status = clGetDeviceInfo(devID, CL_DEVICE_NAME, 0, NULL, &size);
+    checkErrorFunc("clGetDeviceInfo", *status);
+    if (*status != CL_SUCCESS || size == 0) {
+        return NULL;
+    }
+
+    devName = malloc(size);
+    if (devName == NULL) {
+        *status = CL_OUT_OF_HOST_MEMORY;
+        return NULL;
+    }
+
+    *status = clGetDeviceInfo(devID, CL_DEVICE_NAME, size, devName, NULL);
+    checkErrorFunc("clGetDeviceInfo", *status);
+    if (*status != CL_SUCCESS) {
+        /* do not hand out a buffer whose content was never filled in */
+        free(devName);
+        return NULL;
+    }
+    return devName;
+}
+
diff --git a/src/library/tools/tune/tune.h b/src/library/tools/tune/tune.h
new file mode 100644
index 0000000..d172c25
--- /dev/null
+++ b/src/library/tools/tune/tune.h
@@ -0,0 +1,43 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TOOLS_H__
+#define TOOLS_H__
+
+#include <clblas-internal.h>
+#include <cltypes.h>
+#include <kern_cache.h>
+#include <granulation.h>
+#include <kernel_extra.h>
+
+#include <blas_mempat.h>
+
+#include "storage_data.h"
+
+extern const char *FileID;
+extern const char *FileExt;
+extern const char *ENV_FILE_PATH;
+
+struct SubDimInfo;
+
+
+void     initMask(unsigned int* mask);
+char*    getDevName(TargetDevice* devId);
+void     initCLDeviceInfoRec(TargetDevice* devID, DeviceInfo *devInfo);
+
+#endif /* TOOLS_H__ */
+
diff --git a/src/samples/CMakeLists.pack b/src/samples/CMakeLists.pack
new file mode 100644
index 0000000..dbf8e7b
--- /dev/null
+++ b/src/samples/CMakeLists.pack
@@ -0,0 +1,261 @@
+#############################################################################
+## Copyright (C) 2010,2011 Advanced Micro Devices, Inc. All Rights Reserved.
+#############################################################################
+# Stand-alone build of the clblas samples against an installed package:
+# headers are taken from ../include and the library from ../lib32 or ../lib64.
+cmake_minimum_required(VERSION 2.6)
+project(clblas.samples)
+
+# Configure
+# AMDAPPSDKROOT names a directory, so it is cached as PATH.
+set(AMDAPPSDKROOT $ENV{AMDAPPSDKROOT}
+    CACHE PATH "ATI Stream SDK root path")
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Debug CACHE STRING
+      "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel."
+      FORCE)
+endif()
+
+# Keep an explicit 32/64 request; otherwise follow the pointer size.
+if(TARGET_PLATFORM EQUAL 32 OR TARGET_PLATFORM EQUAL 64)
+    set(TARGET_PLATFORM ${TARGET_PLATFORM} CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+else()
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+        set(TARGET_PLATFORM "64" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+    else()
+        set(TARGET_PLATFORM "32" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+    endif()
+endif()
+message(STATUS "Target platform: ${TARGET_PLATFORM}-bit")
+
+# _arch is the SDK library sub-directory, _clblas_libdir the clblas one.
+# _arch is a plain variable: INTERNAL is only a keyword together with CACHE,
+# without it the word would be appended to the value as a second list item.
+if(TARGET_PLATFORM EQUAL 32)
+    set(_arch "x86")
+    set(_clblas_libdir ${PROJECT_SOURCE_DIR}/../lib32)
+    set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS FALSE)
+else()
+    set(_arch "x86_64")
+    set(_clblas_libdir ${PROJECT_SOURCE_DIR}/../lib64)
+    set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS TRUE)
+endif()
+
+# Find OpenCL SDK
+find_path(OPENCL_INCLUDE_DIRS CL/cl.h OpenCL/cl.h
+    HINTS
+        $ENV{AMDAPPSDKROOT}/include
+        ${AMDAPPSDKROOT}/include
+)
+find_library(OPENCL_LIBRARIES OpenCL
+    HINTS
+        $ENV{AMDAPPSDKROOT}/lib/${_arch}
+        ${AMDAPPSDKROOT}/lib/${_arch}
+)
+if(OPENCL_INCLUDE_DIRS AND OPENCL_LIBRARIES)
+    message(STATUS "Found OpenCL: ${OPENCL_LIBRARIES}")
+else()
+    message(FATAL_ERROR "Cannot find OpenCL SDK")
+endif()
+mark_as_advanced(OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES)
+
+# Turn on maximum compiler verbosity
+if(CMAKE_COMPILER_IS_GNUCXX)
+    add_definitions(-pedantic -Wall -Wextra
+        -D_POSIX_C_SOURCE=199309L -D_XOPEN_SOURCE=500
+    )
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 -Wstrict-prototypes" CACHE STRING
+        "Default CFLAGS" FORCE)
+    # Don't use -rpath.
+    set(CMAKE_SKIP_RPATH ON CACHE BOOL "Skip RPATH" FORCE)
+
+    set(CMAKE_C_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_CXX_FLAGS}")
+
+    if(TARGET_PLATFORM EQUAL 32)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin")
+    endif()
+endif()
+if (WIN32)
+    add_definitions(-D_CRT_SECURE_NO_WARNINGS)
+endif()
+
+# One executable is built per source file; the target takes the file's name
+# without extension.
+set(SAMPLE_SOURCES
+    example_sgemv.c
+    example_ssymv.c
+    example_sgemm.c
+    example_strmm.c
+    example_strsm.c
+    example_ssyrk.c
+    example_ssyr2k.c
+    example_strmv.c
+    example_strsv.c
+    example_sger.c
+    example_ssyr.c
+    example_ssyr2.c
+    example_ssymm.c
+    example_cher.c
+    example_chemm.cpp
+    example_cherk.cpp
+    example_stpmv.c
+    example_chpmv.c
+    example_stpsv.c
+    example_sspmv.c
+    example_sspr.c
+    example_chpr.c
+    example_sspr2.c
+    example_zhpr2.c
+    example_sgbmv.c
+    example_stbmv.c
+    example_ssbmv.c
+    example_chbmv.c
+    example_stbsv.c
+    example_cher2k.c
+    example_sswap.c
+    example_sscal.c
+    example_csscal.c
+    example_scopy.c
+    example_saxpy.c
+    example_sdot.c
+    example_srotg.c
+    example_srotmg.c
+    example_srot.c
+    example_srotm.c
+    example_isamax.c
+    example_snrm2.c
+    example_sasum.c
+)
+
+include_directories( ${OPENCL_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/../include )
+
+# Library search path depending on target platform. The SDK part uses
+# AMDAPPSDKROOT, the only SDK location variable this file defines
+# (ATI_STREAM_SDK_ROOT, used here before, is never set in this file).
+# On Windows the clblas import library sits in an "import" sub-directory.
+if( WIN32 )
+    link_directories( ${AMDAPPSDKROOT}/lib/${_arch}/ ${_clblas_libdir}/import )
+elseif( UNIX )
+    link_directories( ${AMDAPPSDKROOT}/lib/${_arch}/ ${_clblas_libdir} )
+endif()
+
+foreach(_sample_src ${SAMPLE_SOURCES})
+    get_filename_component(_sample_name ${_sample_src} NAME_WE)
+    add_executable(${_sample_name} ${_sample_src})
+    target_link_libraries(${_sample_name} ${OPENCL_LIBRARIES} clblas)
+endforeach()
diff --git a/src/samples/CMakeLists.txt b/src/samples/CMakeLists.txt
new file mode 100644
index 0000000..ea9e2b5
--- /dev/null
+++ b/src/samples/CMakeLists.txt
@@ -0,0 +1,357 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+set(SGEMV_SAMPLE_SRC example_sgemv.c)
+set(SSYMV_SAMPLE_SRC example_ssymv.c)
+set(SGEMM_SAMPLE_SRC example_sgemm.c)
+set(STRMM_SAMPLE_SRC example_strmm.c)
+set(STRSM_SAMPLE_SRC example_strsm.c)
+set(SSYRK_SAMPLE_SRC example_ssyrk.c)
+set(SSYR2K_SAMPLE_SRC example_ssyr2k.c)
+
+set(STRMV_SAMPLE_SRC example_strmv.c) # Addition: for STRMV
+set(DTRMV_SAMPLE_SRC example_dtrmv.c) # Addition: for DTRMV
+set(STRSV_SAMPLE_SRC example_strsv.c) # Addition: for STRSV
+set(SGER_SAMPLE_SRC example_sger.c)   # Addition: for SGER
+set(SSYR_SAMPLE_SRC example_ssyr.c)   # Addition: for SSYR
+set(SSYR2_SAMPLE_SRC example_ssyr2.c)   # Addition: for SSYR2
+set(CHER_SAMPLE_SRC example_cher.c)
+set(ZHEMV_SAMPLE_SRC example_zhemv.cpp)
+set(ZHER2_SAMPLE_SRC example_zher2.c)
+set(CHERK_SAMPLE_SRC example_cherk.cpp)
+set(SSYMM_SAMPLE_SRC example_ssymm.c)
+set(CHEMM_SAMPLE_SRC example_chemm.cpp)
+
+set(STPMV_SAMPLE_SRC example_stpmv.c)
+set(CHPMV_SAMPLE_SRC example_chpmv.c)
+set(STPSV_SAMPLE_SRC example_stpsv.c)
+set(SSPMV_SAMPLE_SRC example_sspmv.c)
+set(SSPR_SAMPLE_SRC example_sspr.c)
+set(CHPR_SAMPLE_SRC example_chpr.c)
+set(SSPR2_SAMPLE_SRC example_sspr2.c)
+set(ZHPR2_SAMPLE_SRC example_zhpr2.c)
+
+set(SGBMV_SAMPLE_SRC example_sgbmv.c)
+set(STBMV_SAMPLE_SRC example_stbmv.c)
+set(SSBMV_SAMPLE_SRC example_ssbmv.c)
+set(CHBMV_SAMPLE_SRC example_chbmv.c)
+set(STBSV_SAMPLE_SRC example_stbsv.c)
+
+set(CHER2K_SAMPLE_SRC example_cher2k.c)
+
+set(SSWAP_SAMPLE_SRC example_sswap.c)
+set(SSCAL_SAMPLE_SRC example_sscal.c)
+set(CSSCAL_SAMPLE_SRC example_csscal.c)
+set(SCOPY_SAMPLE_SRC example_scopy.c)
+set(SAXPY_SAMPLE_SRC example_saxpy.c)
+set(SDOT_SAMPLE_SRC example_sdot.c)
+
+set(SROTG_SAMPLE_SRC example_srotg.c)
+set(SROTMG_SAMPLE_SRC example_srotmg.c)
+set(SROT_SAMPLE_SRC example_srot.c)
+set(SROTM_SAMPLE_SRC example_srotm.c)
+set(iSAMAX_SAMPLE_SRC example_isamax.c)
+set(SNRM2_SAMPLE_SRC example_snrm2.c)
+set(SASUM_SAMPLE_SRC example_sasum.c)
+
+set(VERSION_SAMPLE_SRC clBlasVersion.c)
+
+include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR})
+
+add_executable(example_sgemv ${SGEMV_SAMPLE_SRC})
+target_link_libraries(example_sgemv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sgemv PROPERTY FOLDER "Samples")
+
+add_executable(example_ssymv ${SSYMV_SAMPLE_SRC})
+target_link_libraries(example_ssymv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_ssymv PROPERTY FOLDER "Samples")
+
+add_executable(example_sgemm ${SGEMM_SAMPLE_SRC})
+target_link_libraries(example_sgemm ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sgemm PROPERTY FOLDER "Samples")
+
+add_executable(example_strmm ${STRMM_SAMPLE_SRC})
+target_link_libraries(example_strmm ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_strmm PROPERTY FOLDER "Samples")
+
+add_executable(example_strsm ${STRSM_SAMPLE_SRC})
+target_link_libraries(example_strsm ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_strsm PROPERTY FOLDER "Samples")
+
+add_executable(example_ssyrk ${SSYRK_SAMPLE_SRC})
+target_link_libraries(example_ssyrk ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_ssyrk PROPERTY FOLDER "Samples")
+
+add_executable(example_ssyr2k ${SSYR2K_SAMPLE_SRC})
+target_link_libraries(example_ssyr2k ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_ssyr2k PROPERTY FOLDER "Samples")
+
+add_executable(version ${VERSION_SAMPLE_SRC})
+target_link_libraries(version ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET version PROPERTY FOLDER "Samples")
+
+# Addition - for samples
+add_executable(example_strmv ${STRMV_SAMPLE_SRC})
+target_link_libraries(example_strmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_strmv PROPERTY FOLDER "Samples")
+
+add_executable(example_dtrmv ${DTRMV_SAMPLE_SRC})
+target_link_libraries(example_dtrmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_dtrmv PROPERTY FOLDER "Samples")
+
+add_executable(example_strsv ${STRSV_SAMPLE_SRC})
+target_link_libraries(example_strsv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_strsv PROPERTY FOLDER "Samples")
+
+add_executable(example_sger ${SGER_SAMPLE_SRC})
+target_link_libraries(example_sger ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sger PROPERTY FOLDER "Samples")
+
+add_executable(example_cher ${CHER_SAMPLE_SRC})
+target_link_libraries(example_cher ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_cher PROPERTY FOLDER "Samples")
+
+add_executable(example_ssyr ${SSYR_SAMPLE_SRC})
+target_link_libraries(example_ssyr ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_ssyr PROPERTY FOLDER "Samples")
+
+add_executable(example_ssyr2 ${SSYR2_SAMPLE_SRC})
+target_link_libraries(example_ssyr2 ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_ssyr2 PROPERTY FOLDER "Samples")
+
+add_executable(example_zhemv ${ZHEMV_SAMPLE_SRC})
+target_link_libraries(example_zhemv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_zhemv PROPERTY FOLDER "Samples")
+
+add_executable(example_zher2 ${ZHER2_SAMPLE_SRC})
+target_link_libraries(example_zher2 ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_zher2 PROPERTY FOLDER "Samples")
+
+add_executable(example_cherk ${CHERK_SAMPLE_SRC})
+target_link_libraries(example_cherk ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_cherk PROPERTY FOLDER "Samples")
+
+add_executable(example_ssymm ${SSYMM_SAMPLE_SRC})
+target_link_libraries(example_ssymm ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_ssymm PROPERTY FOLDER "Samples")
+
+add_executable(example_chemm ${CHEMM_SAMPLE_SRC})
+target_link_libraries(example_chemm ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_chemm PROPERTY FOLDER "Samples")
+
+add_executable(example_stpmv ${STPMV_SAMPLE_SRC})
+target_link_libraries(example_stpmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_stpmv PROPERTY FOLDER "Samples")
+
+add_executable(example_chpmv ${CHPMV_SAMPLE_SRC})
+target_link_libraries(example_chpmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_chpmv PROPERTY FOLDER "Samples")
+
+add_executable(example_stpsv ${STPSV_SAMPLE_SRC})
+target_link_libraries(example_stpsv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_stpsv PROPERTY FOLDER "Samples")
+
+add_executable(example_sspmv ${SSPMV_SAMPLE_SRC})
+target_link_libraries(example_sspmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sspmv PROPERTY FOLDER "Samples")
+
+add_executable(example_sspr ${SSPR_SAMPLE_SRC})
+target_link_libraries(example_sspr ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sspr PROPERTY FOLDER "Samples")
+
+add_executable(example_chpr ${CHPR_SAMPLE_SRC})
+target_link_libraries(example_chpr ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_chpr PROPERTY FOLDER "Samples")
+
+add_executable(example_sspr2 ${SSPR2_SAMPLE_SRC})
+target_link_libraries(example_sspr2 ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sspr2 PROPERTY FOLDER "Samples")
+
+add_executable(example_zhpr2 ${ZHPR2_SAMPLE_SRC})
+target_link_libraries(example_zhpr2 ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_zhpr2 PROPERTY FOLDER "Samples")
+
+add_executable(example_sgbmv ${SGBMV_SAMPLE_SRC})
+target_link_libraries(example_sgbmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sgbmv PROPERTY FOLDER "Samples")
+
+add_executable(example_stbmv ${STBMV_SAMPLE_SRC})
+target_link_libraries(example_stbmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_stbmv PROPERTY FOLDER "Samples")
+
+add_executable(example_ssbmv ${SSBMV_SAMPLE_SRC})
+target_link_libraries(example_ssbmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_ssbmv PROPERTY FOLDER "Samples")
+
+add_executable(example_chbmv ${CHBMV_SAMPLE_SRC})
+target_link_libraries(example_chbmv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_chbmv PROPERTY FOLDER "Samples")
+
+add_executable(example_stbsv ${STBSV_SAMPLE_SRC})
+target_link_libraries(example_stbsv ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_stbsv PROPERTY FOLDER "Samples")
+
+add_executable(example_cher2k ${CHER2K_SAMPLE_SRC})
+target_link_libraries(example_cher2k ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_cher2k PROPERTY FOLDER "Samples")
+
+add_executable(example_sswap ${SSWAP_SAMPLE_SRC})
+target_link_libraries(example_sswap ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sswap PROPERTY FOLDER "Samples")
+
+add_executable(example_sscal ${SSCAL_SAMPLE_SRC})
+target_link_libraries(example_sscal ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sscal PROPERTY FOLDER "Samples")
+
+add_executable(example_csscal ${CSSCAL_SAMPLE_SRC})
+target_link_libraries(example_csscal ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_csscal PROPERTY FOLDER "Samples")
+
+add_executable(example_scopy ${SCOPY_SAMPLE_SRC})
+target_link_libraries(example_scopy ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_scopy PROPERTY FOLDER "Samples")
+
+add_executable(example_saxpy ${SAXPY_SAMPLE_SRC})
+target_link_libraries(example_saxpy ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_saxpy PROPERTY FOLDER "Samples")
+
+add_executable(example_sdot ${SDOT_SAMPLE_SRC})
+target_link_libraries(example_sdot ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sdot PROPERTY FOLDER "Samples")
+
+add_executable(example_srotg ${SROTG_SAMPLE_SRC})
+target_link_libraries(example_srotg ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_srotg PROPERTY FOLDER "Samples")
+
+add_executable(example_srotmg ${SROTMG_SAMPLE_SRC})
+target_link_libraries(example_srotmg ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_srotmg PROPERTY FOLDER "Samples")
+
+add_executable(example_srot ${SROT_SAMPLE_SRC})
+target_link_libraries(example_srot ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_srot PROPERTY FOLDER "Samples")
+
+add_executable(example_srotm ${SROTM_SAMPLE_SRC})
+target_link_libraries(example_srotm ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_srotm PROPERTY FOLDER "Samples")
+
+add_executable(example_isamax ${iSAMAX_SAMPLE_SRC})
+target_link_libraries(example_isamax ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_isamax PROPERTY FOLDER "Samples")
+
+add_executable(example_snrm2 ${SNRM2_SAMPLE_SRC})
+target_link_libraries(example_snrm2 ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_snrm2 PROPERTY FOLDER "Samples")
+
+add_executable(example_sasum ${SASUM_SAMPLE_SRC})
+target_link_libraries(example_sasum ${OPENCL_LIBRARIES} clBLAS)
+set_property( TARGET example_sasum PROPERTY FOLDER "Samples")
+
+if( TARGET_PLATFORM EQUAL 64 )
+	# CPack configuration; include the executable into the package
+	install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk
+             example_ssyr2k example_strmm example_strsm 
+		     example_strmv example_strsv example_sger example_cher example_ssyr 
+		     example_ssyr2 example_cherk example_ssymm example_chemm
+		     example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr
+		     example_sspr2 example_zhpr2 
+		     example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv
+		     example_cher2k
+		     example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot
+		     example_srotg example_srotmg example_srot example_srotm
+		     example_snrm2 example_sasum example_isamax
+
+			 version
+			 RUNTIME DESTINATION bin64
+			 LIBRARY DESTINATION lib64
+			 ARCHIVE DESTINATION lib64/import
+			)
+else()
+	# CPack configuration; include the executable into the package
+	install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk
+             example_ssyr2k example_strmm example_strsm 
+		     example_strmv example_strsv example_sger example_cher example_ssyr 
+		     example_ssyr2 example_cherk example_ssymm example_chemm
+		     example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr
+		     example_sspr2 example_zhpr2 
+		     example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv
+		     example_cher2k
+		     example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot
+		     example_srotg example_srotmg example_srot example_srotm
+		     example_snrm2 example_sasum example_isamax
+
+			 version
+			 RUNTIME DESTINATION bin32
+			 LIBRARY DESTINATION lib32
+			 ARCHIVE DESTINATION lib32/import
+			)
+endif()
+
+configure_file( "${PROJECT_SOURCE_DIR}/samples/CMakeLists.pack"
+		"${PROJECT_BINARY_DIR}/samples/CMakeLists.txt" COPYONLY )
+		
+install(FILES
+            example_sgemv.c
+            example_ssymv.c 
+            example_sgemm.c
+            example_strmm.c
+            example_strsm.c
+            example_ssyrk.c
+            example_ssyr2k.c
+			example_strmv.c
+			example_strsv.c
+			example_sger.c 
+			example_ssyr.c 
+			example_ssyr2.c
+			example_ssymm.c
+			example_cher.c 
+            example_chemm.cpp
+            example_cherk.cpp
+            example_ssymm.c
+            example_chemm.cpp
+            example_stpmv.c
+            example_chpmv.c
+            example_stpsv.c
+    	    example_sspmv.c
+    	    example_sspr.c
+    	    example_chpr.c
+    	    example_sspr2.c
+    	    example_zhpr2.c
+    	    example_sgbmv.c
+    	    example_stbmv.c
+    	    example_ssbmv.c
+    	    example_chbmv.c
+    	    example_stbsv.c
+    	    example_cher2k.c
+    	    example_sswap.c
+            example_sscal.c
+            example_scopy.c
+            example_csscal.c
+            example_saxpy.c
+            example_sdot.c
+            example_srotg.c
+            example_srotmg.c
+            example_srot.c
+            example_srotm.c
+            example_isamax.c
+            example_snrm2.c
+            example_sasum.c
+
+            clBlasVersion.c
+            ${PROJECT_BINARY_DIR}/samples/CMakeLists.txt
+
+        DESTINATION
+		    samples )
diff --git a/src/samples/clBlasVersion.c b/src/samples/clBlasVersion.c
new file mode 100644
index 0000000..6242827
--- /dev/null
+++ b/src/samples/clBlasVersion.c
@@ -0,0 +1,41 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+
+int
+main(void)
+{
+    cl_uint major, minor, patch;
+    clblasStatus err;
+    /* Query the library version triple; any non-success status ends the program with exit code 1. */
+    err = clblasGetVersion(&major, &minor, &patch);
+    if (err != clblasSuccess) {
+        printf("clblasGetVersion() failed with %d\n", err);
+        return 1;
+    }
+    printf("clblas version %u.%u.%u\n", major, minor, patch);   /* cl_uint is unsigned: %u, not %d */
+    return 0;
+}
diff --git a/src/samples/example_chbmv.c b/src/samples/example_chbmv.c
new file mode 100644
index 0000000..0b2aec3
--- /dev/null
+++ b/src/samples/example_chbmv.c
@@ -0,0 +1,171 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasUplo uplo = clblasUpper;
+static const size_t N = 5;
+static const size_t K = 2;
+
+static const cl_float2 alpha = {{10,10}};
+static const cl_float2 A[] = {
+	{{ 4.0,  4.0}}, {{ 7.0,  7.0}}, {{11.0, 11.0}},
+    {{ 5.0,  5.0}}, {{ 8.0,  8.0}}, {{12.0, 12.0}},
+    {{ 6.0,  6.0}}, {{ 9.0,  9.0}}, {{13.0, 13.0}},
+    {{10.0, 10.0}}, {{14.0, 14.0}}, {{00.0, 00.0}},
+	{{15.0, 15.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}
+};
+static const size_t lda = 3;    // lda = K + 1
+
+static const cl_float2 X[] = {
+	{{1.0, 0.0}},
+	{{2.0, 0.0}},
+	{{3.0, 0.0}},
+	{{4.0, 0.0}},
+	{{5.0, 0.0}}
+};
+static const int incx = 1;
+
+static const cl_float2 beta = {{20.0, 20.0}};
+static cl_float2 Y[] = {
+	{{1.0, 0.0}},
+    {{2.0, 0.0}},
+    {{3.0, 0.0}},
+    {{4.0, 0.0}},
+    {{5.0, 0.0}}
+};
+static const int incy = 1;
+
+/* Print the N entries of Y (stepping by incy), one "(re, im)" pair per line. */
+static void
+printResult(void)
+{
+    size_t idx = 0;
+    printf("Result:\n");
+    while (idx < N) {
+        printf("(%9.2f, %-9.2f)\n", CREAL(Y[idx * incy]), CIMAG(Y[idx * incy]));
+        idx++;
+    }
+}
+
+/* Run clblasChbmv on the first GPU device of the first platform and print the resulting Y. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float2),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float2),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+                                N * lda * sizeof(cl_float2), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                                N * sizeof(cl_float2), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+                                N * sizeof(cl_float2), Y, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasChbmv(order, uplo, N, K, alpha, bufA, 0 /*offA */, lda,
+                      bufX, 0 /*offx*/, incx, beta, bufY, 0 /*offy*/, incy, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasChbmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float2),
+                                    Y, 0, NULL, NULL);
+        /* At this point you will get the result of CHBMV placed in Y array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_chemm.cpp b/src/samples/example_chemm.cpp
new file mode 100644
index 0000000..c222ed4
--- /dev/null
+++ b/src/samples/example_chemm.cpp
@@ -0,0 +1,178 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+#define M  4
+#define N  3
+
+static const cl_float2 alpha = {{10, 10}};
+
+static const clblasSide side = clblasLeft;
+static const clblasUplo uplo = clblasLower;
+static const cl_float2 A[M*M] = {
+    {{11, 12}}, {{-1, -1}}, {{-1, -1}}, {{-1, -1}},
+    {{21, 22}}, {{22, 23}}, {{-1, -1}}, {{-1, -1}},
+    {{31, 32}}, {{32, 33}}, {{33, 34}}, {{-1, -1}},
+    {{41, 61}}, {{42, 62}}, {{43, 73}}, {{44, 23}}
+};
+static const size_t lda = M;
+
+static const cl_float2 B[M*N] = {
+    {{11, -21}},  {{-12, 23}}, {{13, 33}},
+    {{21, 12}},   {{22, -10}}, {{23, 5}},
+    {{31, 1}},    {{-32, 65}}, {{33, -1}},
+    {{1, 41}},    {{-33, 42}}, {{12, 43}},
+};
+static const size_t ldb = N;
+
+static const cl_float2 beta = {{20, 20}};
+
+static cl_float2 C[M*N] = {
+    {{11, 11}},  {{-12, 12}}, {{13, 33}},
+    {{21, -32}}, {{22,  -1}}, {{23, 0}},
+    {{31, 13}},  {{-32, 78}}, {{33, 45}},
+    {{41, 14}},  {{0,   42}}, {{43, -1}},
+};
+static const size_t ldc = N;
+
+/* Dump C as rows of ldc "<re, im>" pairs; the number of rows is derived from sizeof(C). */
+static void
+printResult(void)
+{
+    const size_t total = ((sizeof(C) / sizeof(cl_float2)) / ldc) * ldc;
+    size_t k;
+
+    printf("Result:\n");
+    for (k = 0; k < total; k++) {
+        printf("<%9.2f, %-9.2f> ", CREAL(C[k]), CIMAG(C[k]));
+        if ((k + 1) % ldc == 0) {
+            printf("\n");
+        }
+    }
+}
+
+/* Run clblasChemm on the first GPU device of the first platform and print the resulting C. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufB, bufC;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A),
+                          NULL, &err);
+    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * N * sizeof(*B),
+                          NULL, &err);
+    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        M * M * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
+        M * N * sizeof(*B), B, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
+        M * N * sizeof(*C), C, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasChemm(order, side, uplo, M, N, alpha, bufA,
+                         0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue,
+                         0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasChemm() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C),
+                                  C, 0, NULL, NULL);
+        /* At this point you will get the result of CHEMM placed in C array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufC);
+    clReleaseMemObject(bufB);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_cher.c b/src/samples/example_cher.c
new file mode 100644
index 0000000..0c55fed
--- /dev/null
+++ b/src/samples/example_cher.c
@@ -0,0 +1,159 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+static const size_t N = 5;
+
+static cl_float alpha = 10;
+
+static const clblasUplo uplo = clblasUpper;
+static cl_float2 A[] = {
+    {{11.0f, 00.0f}}, {{12.0f, 02.0f}}, {{13.0f, 05.0f}}, {{14.0f, 12.0f}}, {{15.0f, 06.0f}},
+    {{00.0f, 00.0f}}, {{22.0f, 00.0f}}, {{23.0f, 25.0f}}, {{24.0f, 23.0f}}, {{25.0f, 61.0f}},
+    {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{33.0f, 00.0f}}, {{34.0f, 23.0f}}, {{35.0f, 21.0f}},
+    {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{44.0f, 00.0f}}, {{45.0f, 23.0f}},
+	{{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{55.0f, 00.0f}}
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_float2 X[] = {
+	{{11.0f, 23.0f}},
+	{{21.0f, 65.0f}},
+	{{31.0f, 20.0f}},
+	{{41.0f, 02.0f}},
+	{{51.0f, 10.0f}}
+};
+static const int incx = 1;
+
+/* Print the N x N matrix held in A row by row, one "(re, im)" pair per element. */
+static void
+printResult(void)
+{
+    size_t i, j;
+    printf("\nResult:\n");
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++)   /* consecutive rows are lda elements apart, not N */
+            printf("(%9.2f, %-9.2f)\t", CREAL( A[ i*lda + j ] ), CIMAG( A[ i*lda + j ] ));
+        printf("\n");
+    }
+}
+
+/* Run clblasCher on the first GPU device of the first platform and print the updated A. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_float2),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * lda * sizeof(cl_float2), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        N * sizeof(cl_float2), X, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasCher(order, uplo, N, alpha, bufX, 0 /*offx */, incx,
+										bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event);
+
+   	if (err != CL_SUCCESS) {
+        printf("clblasCher() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_float2)),
+                                  A, 0, NULL, NULL);
+        /* At this point you will get the result of CHER placed in A array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_cher2k.c b/src/samples/example_cher2k.c
new file mode 100644
index 0000000..20ca780
--- /dev/null
+++ b/src/samples/example_cher2k.c
@@ -0,0 +1,185 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasColumnMajor;
+static const clblasUplo uplo = clblasLower;
+static const clblasTranspose transA = clblasNoTrans;
+
+static const size_t N = 5;
+static const size_t K = 4;
+
+static const cl_float2 alpha = {{10, 1}};
+static const cl_float2 A[] = {
+
+    {{11, 0}}, {{12, 0}}, {{13, 0}}, {{14, 0}},
+    {{21, 0}}, {{22, 0}}, {{23, 0}}, {{24, 0}},
+    {{31, 0}}, {{32, 0}}, {{33, 0}}, {{34, 0}},
+    {{41, 0}}, {{42, 0}}, {{43, 0}}, {{44, 0}},
+    {{51, 0}}, {{52, 0}}, {{53, 0}}, {{54, 0}}
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_float2 B[] = {
+
+    {{1, 0}}, {{2, 0}}, {{3, 0}}, {{4, 0}},
+    {{2, 0}}, {{2, 0}}, {{3, 0}}, {{4, 0}},
+    {{3, 0}}, {{2, 0}}, {{3, 0}}, {{3, 0}},
+    {{4, 0}}, {{4, 0}}, {{4, 0}}, {{4, 0}},
+    {{5, 0}}, {{5, 0}}, {{5, 0}}, {{5, 0}}
+};
+static const size_t ldb = 5;    /* i.e. ldb = N */
+
+static const cl_float beta = 1;
+static cl_float2 C[] = {
+    {{11, 1}}, {{12, 0}}, {{13, 0}}, {{14, 0}}, {{15, 0}},
+    {{ 0, 0}}, {{22, 2}}, {{23, 0}}, {{24, 0}}, {{25, 0}},
+    {{ 0, 0}}, {{ 0, 0}}, {{33, 4}}, {{34, 0}}, {{35, 0}},
+    {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{44, 5}}, {{45, 0}},
+    {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{55, 6}}
+};
+static const size_t ldc = 5;    /* i.e. ldc = N */
+
+/* Dump the N x N matrix C, one row per output line (element (row, col) is C[row + col * ldc]). */
+static void printResult(void)
+{
+    size_t row, col;
+
+    printf("Result:\n");
+    for (row = 0; row < N; ++row) {
+        for (col = 0; col < N; ++col) {
+            const cl_float2 elem = C[row + col * ldc];
+            printf("(%9.2f, %-9.2f) ", CREAL(elem), CIMAG(elem));
+        }
+        printf("\n");
+    }
+}
+
+/* Run clblasCher2k on the predefined A, B and C matrices and print the updated C. Returns 0 on success, 1 on failure. */
+int main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufC, bufB;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A),
+                          NULL, &err);
+    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*B),
+                          NULL, &err);
+    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C),
+                          NULL, &err);
+
+    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
+        printf("Failed to create buffer\n");
+        ret = 1;
+        goto cleanup;    /* release whatever was created instead of leaking it */
+    }
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * K * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
+        N * K * sizeof(*B), B, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
+        N * N * sizeof(*C), C, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasCher2k(order, uplo, transA, N, K, alpha, bufA, 0, lda, bufB, 0, ldb,
+                       beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasCher2k() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C),
+                                  C, 0, NULL, NULL);
+
+        /* At this point you will get the result of CHER2K placed in C array. */
+        printResult();
+    }
+cleanup:
+    /* Release OpenCL memory objects (a buffer that failed to be created is NULL and is skipped). */
+    if (bufC != NULL) { clReleaseMemObject(bufC); }
+    if (bufB != NULL) { clReleaseMemObject(bufB); }
+    if (bufA != NULL) { clReleaseMemObject(bufA); }
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_cherk.cpp b/src/samples/example_cherk.cpp
new file mode 100644
index 0000000..771a443
--- /dev/null
+++ b/src/samples/example_cherk.cpp
@@ -0,0 +1,184 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasColumnMajor;
+static const clblasUplo uplo = clblasLower;
+static const clblasTranspose transA = clblasNoTrans;
+
+static const size_t N = 5;
+static const size_t K = 4;
+
+static const cl_float alpha = 10;
+
+static const cl_float2 A[] = {
+
+
+    {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}},
+    {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}},
+    {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}},
+    {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}},
+    {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}
+
+    /*
+    {{11, 0}}, {{12, 0}}, {{13, 0}}, {{14, 0}},
+    {{21, 0}}, {{22, 0}}, {{23, 0}}, {{24, 0}},
+    {{31, 0}}, {{32, 0}}, {{33, 0}}, {{34, 0}},
+    {{41, 0}}, {{42, 0}}, {{43, 0}}, {{44, 0}},
+    {{51, 0}}, {{52, 0}}, {{53, 0}}, {{54, 0}}
+    */
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_float beta = 1;
+
+static cl_float2 C[] = {
+    {{11, 1}}, {{12, 0}}, {{13, 0}}, {{14, 0}}, {{15, 0}},
+    {{ 0, 0}}, {{22, 2}}, {{23, 0}}, {{24, 0}}, {{25, 0}},
+    {{ 0, 0}}, {{ 0, 0}}, {{33, 4}}, {{34, 0}}, {{35, 0}},
+    {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{44, 5}}, {{45, 0}},
+    {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{55, 6}}
+};
+static const size_t ldc = 5;    /* i.e. ldc = N */
+
+/* Print C as an N x N grid of (real, imag) pairs; entry (r, c) lives at C[r + c * ldc]. */
+static void printResult(void)
+{
+    size_t r, c;
+
+    printf("Result:\n");
+    for (r = 0; r < N; ++r) {
+        for (c = 0; c < N; ++c) {
+            const size_t idx = r + c * ldc;
+            printf("(%9.2f, %-9.2f) ", CREAL(C[idx]), CIMAG(C[idx]));
+        }
+        printf("\n");
+    }
+}
+
+/* Run clblasCherk on the predefined A and C matrices and print the updated C. Returns 0 on success, 1 on failure. */
+int main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufC;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A),
+                          NULL, &err);
+    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C),
+                          NULL, &err);
+
+    if ((bufA == NULL) || (bufC == NULL)) {
+        printf("Failed to create buffer\n");
+        ret = 1;
+        goto cleanup;    /* release whatever was created instead of leaking it */
+    }
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * K * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
+        N * N * sizeof(*C), C, 0, NULL, NULL);
+
+    /* Call clblas function. Completion is reported through `event`,
+     * which is waited on below before the result is read back.
+     */
+    err = clblasCherk(order, uplo, transA, N, K,
+                      alpha, bufA, 0, lda, beta, bufC, 0, ldc,
+                      1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasCherk() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C),
+                                  C, 0, NULL, NULL);
+
+        /* At this point you will get the result of CHERK placed in C array. */
+        printResult();
+    }
+cleanup:
+    /* Release OpenCL memory objects (a buffer that failed to be created is NULL and is skipped). */
+    if (bufC != NULL) { clReleaseMemObject(bufC); }
+    if (bufA != NULL) { clReleaseMemObject(bufA); }
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_chpmv.c b/src/samples/example_chpmv.c
new file mode 100644
index 0000000..763b7de
--- /dev/null
+++ b/src/samples/example_chpmv.c
@@ -0,0 +1,169 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasUplo uplo = clblasUpper;
+static const size_t N = 5;
+
+static const cl_float2 A[] = {
+	{{ 1.0, 00.0}}, {{ 2.0, 02.0}}, {{ 4.0,  4.0}}, {{ 7.0,  7.0}}, {{11.0, 11.0}},
+                    {{ 3.0, 03.0}}, {{ 5.0,  5.0}}, {{ 8.0,  8.0}}, {{12.0, 12.0}},
+                                    {{ 6.0,  6.0}}, {{ 9.0,  9.0}}, {{13.0, 13.0}},
+                                                    {{10.0, 10.0}}, {{14.0, 14.0}},
+	                                                                {{15.0, 15.0}}
+};
+
+static const cl_float2 alpha = {{10,10}};
+static const cl_float2 X[] = {
+	{{1.0, 0.0}},
+	{{2.0, 0.0}},
+	{{3.0, 0.0}},
+	{{4.0, 0.0}},
+	{{5.0, 0.0}}
+};
+static const int incx = 1;
+
+static const cl_float2 beta = {{2.0, 2.0}};
+static cl_float2 Y[] = {
+	{{1.0, 0.0}},
+    {{2.0, 0.0}},
+    {{3.0, 0.0}},
+    {{4.0, 0.0}},
+    {{5.0, 0.0}}
+};
+static const int incy = 1;
+
+/* Print N entries of vector Y (taken with stride incy) as (real, imag) pairs. */
+static void printResult(void)
+{
+    size_t k;
+    printf("Result:\n\n");
+    for (k = 0; k < N; ++k) {
+        const cl_float2 y = Y[k * incy];
+        printf("(%9.2lf, %-9.2lf)\n", CREAL(y), CIMAG(y));
+    }
+}
+
+int
+main(void)    /* Runs clblasChpmv on the predefined A, X and Y and prints Y; returns 0 on success, 1 on failure. */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufAP, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0, numElementsAP;
+
+    /* Setup OpenCL environment: first platform, first GPU device on it. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+    numElementsAP = (N * (N+1)) / 2;	/* a packed N x N triangle stores N*(N+1)/2 elements */
+
+    /* Prepare OpenCL memory objects and place the packed matrix and the vectors inside them. */
+    bufAP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float2),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float2),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
+                                numElementsAP * sizeof(cl_float2), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                                N * sizeof(cl_float2), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+                                N * sizeof(cl_float2), Y, 0, NULL, NULL);
+
+    /* Call clblas function; NOTE(review): the err values of the buffer setup above are overwritten unchecked. */
+    err = clblasChpmv(order, uplo, N, alpha, bufAP, 0 /*offA */,
+                                    bufX, 0 /*offx*/, incx, beta, bufY, 0 /*offy*/, incy,
+                                    1, &queue, 0, NULL, &event);
+
+   	if (err != CL_SUCCESS) {
+        printf("clblasChpmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory (blocking read into Y). */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float2),
+                                        Y, 0, NULL, NULL);
+        /* At this point you will get the result of CHPMV placed in Y array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufAP);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_chpr.c b/src/samples/example_chpr.c
new file mode 100644
index 0000000..5feac90
--- /dev/null
+++ b/src/samples/example_chpr.c
@@ -0,0 +1,166 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const size_t N = 5;
+static cl_float alpha = 10;
+static const clblasUplo uplo = clblasUpper;
+
+static cl_float2 AP[] = {
+    {{11.0f, 00.0f}}, {{12.0f, 02.0f}}, {{13.0f, 05.0f}}, {{14.0f, 12.0f}}, {{15.0f, 06.0f}},
+                      {{22.0f, 00.0f}}, {{23.0f, 25.0f}}, {{24.0f, 23.0f}}, {{25.0f, 61.0f}},
+                                        {{33.0f, 00.0f}}, {{34.0f, 23.0f}}, {{35.0f, 21.0f}},
+                                                          {{44.0f, 00.0f}}, {{45.0f, 23.0f}},
+	                                                                        {{55.0f, 00.0f}}
+};
+
+static const cl_float2 X[] = {
+	{{11.0f, 23.0f}},
+	{{21.0f, 65.0f}},
+	{{31.0f, 20.0f}},
+	{{41.0f, 02.0f}},
+	{{51.0f, 10.0f}}
+};
+static const int incx = 1;
+
+/* Print packed AP as an N x N grid; positions outside the `uplo` triangle are padded with tabs. */
+static void printResult(void)
+{
+    size_t row, col;
+    size_t next = 0;    /* index of the next element to print from AP */
+
+    printf("\nResult:\n");
+    for (row = 0; row < N; ++row) {
+        for (col = 0; col < N; ++col) {
+            const int hidden = ((uplo == clblasUpper) && (row > col)) ||
+                               ((uplo == clblasLower) && (col > row));
+            if (hidden) {
+                printf("\t\t\t");
+            } else {
+                printf("(%9.2lf, %-9.2lf)\t", CREAL(AP[next]), CIMAG(AP[next]));
+                next++;
+            }
+        }
+        printf("\n");
+    }
+}
+
+int
+main(void)    /* Runs clblasChpr on the predefined AP and X and prints AP; returns 0 on success, 1 on failure. */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufAP, bufX;
+    cl_event event = NULL;
+    int ret = 0, numElementsAP;
+
+    /* Setup OpenCL environment: first platform, first GPU device on it. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+    numElementsAP = (N * (N+1)) / 2;	/* a packed N x N triangle stores N*(N+1)/2 elements */
+
+    /* Prepare OpenCL memory objects and place the packed matrix and the vector inside them. */
+    bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numElementsAP * sizeof(cl_float2),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
+                           numElementsAP * sizeof(cl_float2), AP, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                            N * sizeof(cl_float2), X, 0, NULL, NULL);
+
+
+    err = clblasChpr(order, uplo, N, alpha, bufX, 0 /*offx */, incx,
+										bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event);
+
+   	if (err != CL_SUCCESS) {
+        printf("clblasChpr() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory (blocking read into AP). */
+        err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float2)),
+                                  AP, 0, NULL, NULL);
+        /* At this point you will get the result of CHPR placed in AP array. */
+        printResult();
+    }
+
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufAP);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_csscal.c b/src/samples/example_csscal.c
new file mode 100644
index 0000000..98dfcee
--- /dev/null
+++ b/src/samples/example_csscal.c
@@ -0,0 +1,141 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined vector and its characteristics for
+ * simplicity.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+static const size_t N = 5;
+static const cl_float alpha = 10;
+static cl_float2 X[] = {
+    {{1.0, 0.0}},
+    {{2.0, 0.0}},
+    {{3.0, 0.0}},
+    {{4.0, 0.0}},
+    {{5.0, 0.0}}
+};
+static const int incx = 1;
+
+
+/* Print the first N elements of X as (real, imag) pairs, one per line. */
+static void printResult(void)
+{
+    size_t k = 0;
+    printf("\nResult:\n");
+    while (k < N) {
+        printf("(%f, %-f) \n", CREAL(X[k]), CIMAG(X[k]));
+        k++;
+    }
+}
+
+/* Run clblasCsscal (X := alpha * X) on the predefined vector X and print it. Returns 0 on success, 1 on failure. */
+int main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX;
+    cl_event event = NULL;
+    int ret = 0;
+    int lenX = 1 + (N-1)*(incx < 0 ? -incx : incx);    /* 1 + (N-1)*|incx|; inline so no undeclared abs() */
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place vectors inside them. */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, ( lenX * sizeof(cl_float2)),
+                          NULL, &err);
+
+    /* Elements are cl_float2: upload the same byte count the buffer was created with. */
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                    (lenX * sizeof(cl_float2)), X, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasCsscal( N, alpha, bufX, 0, incx, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasCsscal() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX * sizeof(cl_float2)),
+                                  X, 0, NULL, NULL);
+        /* At this point you will get the result of CSSCAL placed in vector X. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_dtrmv.c b/src/samples/example_dtrmv.c
new file mode 100644
index 0000000..9051f2d
--- /dev/null
+++ b/src/samples/example_dtrmv.c
@@ -0,0 +1,174 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasColumnMajor;
+
+static const size_t N = 5;
+
+static const cl_double alpha = 10;    /* not passed to the clblasDtrmv call in main() */
+
+static const clblasUplo uplo = clblasUpper;
+static const cl_double A[] = {
+    11, 12, 13, 14, 15,
+     0, 22, 23, 24, 25,
+     0,  0, 33, 34, 35,
+     0,  0,  0, 44, 45,
+     0,  0,  0,  0, 55
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_double X[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incx = 1;
+
+static const cl_double beta = 20;    /* not passed to the clblasDtrmv call in main() */
+
+static cl_double Y[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incy = 1;
+
+/* Print every incy-th element of Y, converted to int, one value per line. */
+static void printResult(void)
+{
+    const size_t count = (sizeof(Y) / sizeof(cl_double)) / incy;
+    size_t k;
+
+    printf("Result:\n");
+
+    for (k = 0; k < count; ++k) {
+        printf("%d\n", (int)Y[k * incy]);
+    }
+}
+
+/* Run clblasDtrmv on the predefined A and X, read the updated x back into Y and print it. Returns 0 on success, 1 on failure. */
+int main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects. bufX must be read-write: TRMV overwrites x in place. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*X),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * lda * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        N * sizeof(*X), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+        N * sizeof(*Y), Y, 0, NULL, NULL);
+
+    /* Call clblas function; bufY is passed in the scratch-buffer position of the API. */
+    err = clblasDtrmv(order, uplo, clblasTrans, clblasUnit, N, bufA, 0 /* offA */, lda,
+                      bufX, 0 /* offX */, incx,
+                      bufY /* scratch */, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasDtrmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Fetch results of calculations from GPU memory: the updated x lives in bufX. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*Y),
+                                  Y, 0, NULL, NULL);
+        /* At this point you will get the result of DTRMV placed in Y array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_isamax.c b/src/samples/example_isamax.c
new file mode 100644
index 0000000..8813361
--- /dev/null
+++ b/src/samples/example_isamax.c
@@ -0,0 +1,135 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined vector and its characteristics for
+ * simplicity. main() uploads only lenX elements of X to the device.
+ */
+static const size_t N = 7;          /* element count passed to clblasiSamax */
+static cl_float X[] = {
+    1,
+    2,
+    -11,
+    17,
+    5,
+    6,
+    800,
+    10      /* NOTE(review): 8th value, but N is 7, so main() never uploads it */
+};
+static const int incx = 1;          /* stride passed for X */
+static cl_uint indexMax;            /* host copy of the result read back from iMax */
+
+int
+main(void)    /* isamax sample: returns 0 on success, 1 if OpenCL/clblas setup or clblasiSamax fails */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, scratchBuf, iMax;   /* device buffers: input vector, scratch space, result index */
+    cl_event event = NULL;
+    int ret = 0;
+	int lenX = 1 + (N-1)*abs(incx);   /* NOTE(review): abs() is declared in <stdlib.h>, not included directly here - confirm */
+    int lenScratchBuf = N;
+
+    /* Set up the OpenCL environment: one platform, one GPU device, a context and a queue. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Initialize clblas; on failure release the queue and context before returning. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+    /* Create the device buffers and upload X. NOTE(review): err from these calls is overwritten without being checked. */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
+
+    // Scratch buffer: the stated minimum is (N/64) elements; 2 * lenScratchBuf floats are allocated here for simplicity
+    scratchBuf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenScratchBuf*sizeof(cl_float) * 2), NULL, &err);
+
+    // One cl_uint that receives the index of the max absolute value in X
+    iMax = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(cl_uint), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)) , X, 0, NULL, NULL);
+
+    /* Enqueue clblasiSamax on the single queue; 'event' is handed in to track completion. */
+    err = clblasiSamax( N, iMax, 0, bufX, 0, incx, scratchBuf,
+                                    1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasiSamax() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Blocking read (CL_TRUE) of the result index from iMax into indexMax. */
+        err = clEnqueueReadBuffer(queue, iMax, CL_TRUE, 0, sizeof(cl_uint),
+                                    &indexMax, 0, NULL, NULL);
+        printf("Result amax: %d\n", indexMax);   /* NOTE(review): indexMax is cl_uint; %u may be the matching specifier */
+    }
+
+    /* Release OpenCL memory objects. NOTE(review): 'event' is never released in this function. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(scratchBuf);
+    clReleaseMemObject(iMax);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sasum.c b/src/samples/example_sasum.c
new file mode 100644
index 0000000..df46ad4
--- /dev/null
+++ b/src/samples/example_sasum.c
@@ -0,0 +1,131 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined vector and its characteristics for
+ * simplicity.
+ */
+static const size_t N = 7;          /* element count passed to clblasSasum */
+static cl_float X[] = {
+    1,
+    2,
+    -11,
+    17,
+    5,
+    6,
+    81,
+};
+static const int incx = 1;          /* stride passed for X */
+static cl_float asum;               /* host copy of the result read back from bufAsum */
+
+int
+main(void)    /* sasum sample: returns 0 on success, 1 if OpenCL/clblas setup or clblasSasum fails */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, bufAsum, scratchBuff;   /* device buffers: input vector, one-float result, scratch space */
+    cl_event event = NULL;
+    int ret = 0;
+	int lenX = 1 + (N-1)*abs(incx);   /* NOTE(review): abs() is declared in <stdlib.h>, not included directly here - confirm */
+
+    /* Set up the OpenCL environment: one platform, one CPU device, a context and a queue. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Initialize clblas; on failure release the queue and context before returning. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Create the device buffers and upload X. NOTE(review): err from these calls is overwritten without being checked. */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
+    bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err);
+    // Scratch buffer: N floats (the stated minimum is N elements)
+    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
+
+    /* Enqueue clblasSasum on the single queue; 'event' is handed in to track completion. */
+    err = clblasSasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff,
+                                    1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSasum() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Blocking read (CL_TRUE) of the single-float result from bufAsum into asum. */
+        err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_float),
+                                    &asum, 0, NULL, NULL);
+        printf("Result : %f\n", asum);
+    }
+
+    /* Release OpenCL memory objects. NOTE(review): 'event' is never released in this function. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufAsum);
+    clReleaseMemObject(scratchBuff);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_saxpy.c b/src/samples/example_saxpy.c
new file mode 100644
index 0000000..ebf6b4f
--- /dev/null
+++ b/src/samples/example_saxpy.c
@@ -0,0 +1,155 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined vectors and their characteristics for
+ * simplicity.
+ */
+static const size_t N = 7;          /* element count passed to clblasSaxpy */
+static const cl_float alpha = 10;   /* scalar passed to clblasSaxpy */
+static cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51,
+    61,
+    71,
+};
+static const int incx = 1;          /* stride passed for X */
+
+static cl_float Y[] = {
+    15,
+    11,
+    1,
+    2,
+    1,
+    8,
+    1,
+};
+static const int incy = 1;          /* stride passed for Y */
+
+
+static void
+printResult(void)    /* Print a "Result:" heading followed by the first N elements of Y. */
+{
+    size_t i;
+    printf("\nResult:\n");
+
+    printf("Y\n");
+    for (i = 0; i < N; i++) {
+            printf("\t%f\n", Y[i]);   /* one tab-indented value per line */
+    }
+}
+
+int
+main(void)    /* saxpy sample: returns 0 on success, 1 if OpenCL/clblas setup or clblasSaxpy fails */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, bufY;   /* device copies of X (read-only) and Y (read-write) */
+    cl_event event = NULL;
+    int ret = 0;
+	int lenX = 1 + (N-1)*abs(incx);
+	int lenY = 1 + (N-1)*abs(incy);
+
+    /* Set up the OpenCL environment: one platform, one GPU device, a context and a queue. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Initialize clblas; on failure release the queue and context before returning. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Create the device buffers and upload X and Y. NOTE(review): err from these calls is overwritten without being checked. */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
+
+    /* Enqueue clblasSaxpy on the single queue; 'event' is handed in to track completion. */
+    err = clblasSaxpy( N, alpha, bufX, 0, incx, bufY, 0, incy, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSaxpy() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Blocking read (CL_TRUE) of bufY back into the host array Y. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
+                                    Y, 0, NULL, NULL);
+
+        /* At this point you will get the result of SAXPY placed in vector Y. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. NOTE(review): 'event' is never released in this function. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_scopy.c b/src/samples/example_scopy.c
new file mode 100644
index 0000000..8cc4b14
--- /dev/null
+++ b/src/samples/example_scopy.c
@@ -0,0 +1,161 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined vectors and their characteristics for
+ * simplicity.
+ */
+static const size_t N = 7;          /* element count passed to clblasScopy */
+static cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51,
+    61,
+    71,
+};
+static const int incx = 1;          /* stride passed for X */
+
+static cl_float Y[] = {
+    0,
+    2,
+    0,
+    0,
+    0,
+    5,
+    0,
+};
+static const int incy = 1;          /* stride passed for Y */
+
+
+static void
+printResult(void)    /* Print a "Result:" heading, then the first N elements of X and of Y. */
+{
+    size_t i;
+    printf("\nResult:\n");
+
+    printf(" X\n");
+    for (i = 0; i < N; i++) {
+			printf("\t%f\n", X[i]);
+    }
+
+    printf("Y\n");
+    for (i = 0; i < N; i++) {
+            printf("\t%f\n", Y[i]);
+    }
+}
+
+int
+main(void)    /* scopy sample: returns 0 on success, 1 if OpenCL/clblas setup or clblasScopy fails */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, bufY;   /* device copies of X (read-only) and Y (read-write) */
+    cl_event event = NULL;
+    int ret = 0;
+	int lenX = 1 + (N-1)*abs(incx);
+	int lenY = 1 + (N-1)*abs(incy);
+
+    /* Set up the OpenCL environment: one platform, one GPU device, a context and a queue. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Initialize clblas; on failure release the queue and context before returning. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Create the device buffers and upload X and Y. NOTE(review): err from these calls is overwritten without being checked. */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
+
+    /* Enqueue clblasScopy on the single queue; 'event' is handed in to track completion. */
+    err = clblasScopy( N, bufX, 0, incx, bufY, 0, incy, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasScopy() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Blocking reads (CL_TRUE) of both device buffers back into X and Y. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
+                                    X, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
+                                    Y, 0, NULL, NULL);
+
+        /* At this point you will get the result of SCOPY placed in vector Y. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. NOTE(review): 'event' is never released in this function. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sdot.c b/src/samples/example_sdot.c
new file mode 100644
index 0000000..7e699a1
--- /dev/null
+++ b/src/samples/example_sdot.c
@@ -0,0 +1,147 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined vectors and their characteristics for
+ * simplicity.
+ */
+static const size_t N = 7;          /* element count passed to clblasSdot */
+static cl_float X[] = {
+    1,
+    2,
+    -11,
+    17,
+    5,
+    6,
+    81,
+};
+static const int incx = 1;          /* stride passed for X */
+
+static cl_float Y[] = {
+    1,
+    5,
+    6,
+    4,
+    9,
+    10,
+    4,
+};
+static const int incy = 1;          /* stride passed for Y */
+static cl_float dotProduct;         /* host copy of the result read back from bufDotP */
+
+int
+main(void)    /* sdot sample: returns 0 on success, 1 if OpenCL/clblas setup or clblasSdot fails */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, bufY, bufDotP, scratchBuff;   /* device buffers: both inputs, one-float result, scratch space */
+    cl_event event = NULL;
+    int ret = 0;
+	int lenX = 1 + (N-1)*abs(incx);
+	int lenY = 1 + (N-1)*abs(incy);
+
+    /* Set up the OpenCL environment: one platform, one GPU device, a context and a queue. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Initialize clblas; on failure release the queue and context before returning. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Create the device buffers and upload X and Y. NOTE(review): err from these calls is overwritten without being checked. */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenY*sizeof(cl_float)), NULL, &err);
+    // One cl_float that receives the dot product
+    bufDotP = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err);
+    // Scratch buffer: N floats (the stated minimum is N elements)
+    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
+
+    /* Enqueue clblasSdot on the single queue; 'event' is handed in to track completion. */
+    err = clblasSdot( N, bufDotP, 0, bufX, 0, incx, bufY, 0, incy, scratchBuff,
+                                    1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSdot() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Blocking read (CL_TRUE) of the single-float result from bufDotP into dotProduct. */
+        err = clEnqueueReadBuffer(queue, bufDotP, CL_TRUE, 0, sizeof(cl_float),
+                                    &dotProduct, 0, NULL, NULL);
+        printf("Result dot product: %f\n", dotProduct);
+    }
+
+    /* Release OpenCL memory objects. NOTE(review): 'event' is never released in this function. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufDotP);
+    clReleaseMemObject(scratchBuff);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sgbmv.c b/src/samples/example_sgbmv.c
new file mode 100644
index 0000000..a977e24
--- /dev/null
+++ b/src/samples/example_sgbmv.c
@@ -0,0 +1,175 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined band matrix, vectors and their
+ * characteristics for simplicity.
+ */
+
+static const clblasOrder order = clblasRowMajor;
+static const clblasTranspose trans = clblasNoTrans;
+
+static const size_t M = 5;      /* passed as M to clblasSgbmv; also the length of Y used by main() */
+static const size_t N = 5;      /* passed as N to clblasSgbmv; also the length of X used by main() */
+static const size_t KL = 1;     /* passed as KL to clblasSgbmv */
+static const size_t KU = 2;     /* passed as KU to clblasSgbmv */
+static const size_t lda = 4;    // lda = KL + KU + 1
+
+static const cl_float A[] = {   /* M rows of lda values each, as uploaded by main() */
+    00, 12, 13, 14,
+    21, 22, 23, 24,
+    31, 32, 33, 34,
+    41, 42, 43, 00,
+    51, 62, 00, 00
+};
+
+static const cl_float alpha = 10;
+static const cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51,
+};
+static const int incx = 1;      /* stride passed for X */
+
+static const cl_float beta = 20;
+static cl_float Y[] = {
+    11,
+    21,
+    31,
+    41,
+    51,
+};
+static const int incy = 1;      /* stride passed for Y */
+
+
+static void
+printResult(const char* str)    /* Print the heading 'str' followed by M elements of Y taken with stride incy. */
+{
+    size_t i;
+
+    printf("%s:\n", str);
+
+    for (i = 0; i < M; i++) {
+        printf("%f\n", Y[ i * incy ]);   /* one value per line */
+    }
+}
+
+int
+main(void)    /* sgbmv sample: returns 0 on success, 1 if OpenCL/clblas setup or clblasSgbmv fails */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;   /* device copies of A and X (read-only) and Y (read-write) */
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Set up the OpenCL environment: one platform, one GPU device, a context and a queue. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Initialize clblas; on failure release the queue and context before returning. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Create the device buffers and upload A, X and Y. NOTE(review): err from these calls is overwritten without being checked. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * lda * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * sizeof(cl_float),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+                                M * lda * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                                N * sizeof(cl_float), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+                                M * sizeof(cl_float), Y, 0, NULL, NULL);
+
+    /* Enqueue clblasSgbmv on the single queue; 'event' is handed in to track completion. */
+    err = clblasSgbmv(order, trans, M, N, KL, KU, alpha, bufA, 0, lda, bufX, 0, incx,
+                            beta, bufY, 0, incy, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSgbmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Blocking read (CL_TRUE) of bufY back into the host array Y. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, M * sizeof(cl_float),
+                                      Y, 0, NULL, NULL);
+        /* At this point you will get the result of SGBMV placed in Y array. */
+        printResult("clblasSgbmv result");
+    }
+
+    /* Release OpenCL memory objects. NOTE(review): 'event' is never released in this function. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sgemm.c b/src/samples/example_sgemm.c
new file mode 100644
index 0000000..2aaf16a
--- /dev/null
+++ b/src/samples/example_sgemm.c
@@ -0,0 +1,192 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity.
+ */
+
+#define M  4    /* rows in the A and C initializers below */
+#define N  3    /* columns in the B and C initializers below */
+#define K  5    /* columns of A and rows of B in the initializers below */
+
+static const clblasOrder order = clblasRowMajor;
+
+static const cl_float alpha = 10;
+
+static const clblasTranspose transA = clblasNoTrans;
+static const cl_float A[M*K] = {
+    11, 12, 13, 14, 15,
+    21, 22, 23, 24, 25,
+    31, 32, 33, 34, 35,
+    41, 42, 43, 44, 45,
+};
+static const size_t lda = K;        /* i.e. lda = K */
+
+static const clblasTranspose transB = clblasNoTrans;
+static const cl_float B[K*N] = {
+    11, 12, 13,
+    21, 22, 23,
+    31, 32, 33,
+    41, 42, 43,
+    51, 52, 53,
+};
+static const size_t ldb = N;        /* i.e. ldb = N */
+
+static const cl_float beta = 20;
+
+static cl_float C[M*N] = {
+    11, 12, 13,
+    21, 22, 23,
+    31, 32, 33,
+    41, 42, 43,
+};
+static const size_t ldc = N;        /* i.e. ldc = N */
+
+static cl_float result[M*N];        /* host array that main() reads bufC back into */
+
+static const size_t off  = 1;       /* subtracted from M, N and K in the clblasSgemm call */
+static const size_t offA = K + 1;   /* K + off */
+static const size_t offB = N + 1;   /* N + off */
+static const size_t offC = N + 1;   /* N + off */
+
+static void
+printResult(const char* str)    /* Print the heading 'str', then 'result' as rows of ldc values truncated to int. */
+{
+    size_t i, j, nrows;
+
+    printf("%s:\n", str);
+
+    nrows = (sizeof(result) / sizeof(cl_float)) / ldc;   /* total element count divided by row length */
+    for (i = 0; i < nrows; i++) {
+        for (j = 0; j < ldc; j++) {
+            printf("%d ", (int)result[i * ldc + j]);
+        }
+        printf("\n");
+    }
+}
+
+int
+main(void)    /* sgemm sample: returns 0 on success, 1 if OpenCL/clblas setup or clblasSgemm fails */
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufB, bufC;   /* device copies of A and B (read-only) and C (read-write) */
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Set up the OpenCL environment: one platform, one GPU device, a context and a queue. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Initialize clblas; on failure release the queue and context before returning. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Create the device buffers and upload A, B and C. NOTE(review): err from these calls is overwritten without being checked. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
+                          NULL, &err);
+    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
+                          NULL, &err);
+    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        M * K * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
+        K * N * sizeof(*B), B, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
+        M * N * sizeof(*C), C, 0, NULL, NULL);
+
+    /* Call clblasSgemm with buffer offsets. Perform gemm for the lower right sub-matrices */
+    err = clblasSgemm(order, transA, transB, M - off, N - off, K - off,
+                         alpha, bufA, offA, lda,
+                         bufB, offB, ldb, beta,
+                         bufC, offC, ldc,
+                         1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSgemmEx() failed with %d\n", err);   /* NOTE(review): message names clblasSgemmEx, but the call above is clblasSgemm */
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+
+        /* Blocking read (CL_TRUE) of the whole M x N buffer bufC into 'result'. */
+        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
+                                  M * N * sizeof(*result),
+                                  result, 0, NULL, NULL);
+
+        /* At this point you will get the result of SGEMM placed in 'result' array. */
+        puts("");
+        printResult("clblasSgemmEx result");   /* NOTE(review): label says clblasSgemmEx; the function called is clblasSgemm */
+    }
+
+    /* Release OpenCL memory objects. NOTE(review): 'event' is never released in this function. */
+    clReleaseMemObject(bufC);
+    clReleaseMemObject(bufB);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sgemv.c b/src/samples/example_sgemv.c
new file mode 100644
index 0000000..a76fb93
--- /dev/null
+++ b/src/samples/example_sgemv.c
@@ -0,0 +1,181 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* For simplicity, this example uses a predefined matrix and predefined
+ * vectors together with their characteristics.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+static const size_t M = 4;
+static const size_t N = 5;
+
+static const cl_float alpha = 10;
+
+static const clblasTranspose transA = clblasNoTrans;
+static const cl_float A[] = {
+    11, 12, 13, 14, 15,
+    21, 22, 23, 24, 25,
+    31, 32, 33, 34, 35,
+    41, 42, 43, 44, 45
+};
+static const size_t lda = 5;        /* i.e. lda = N */
+
+static const cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51,
+};
+static const int incx = 1;
+
+static const cl_float beta = 20;
+
+static cl_float Y[] = {
+    11,
+    21,
+    31,
+    41,
+};
+static const int incy = 1;
+
+static cl_float result[4];          /* M */
+
+static const size_t off  = 1;       /* rows/columns skipped at the top-left of A */
+static const size_t offA = 5 + 1;   /* lda + off: element A[1][1] in row-major order */
+static const size_t offX = 1;       /* off */
+static const size_t offY = 1;       /* off */
+
+/* Print the caption 'str', then every incy-th entry of 'result' truncated to int. */
+static void
+printResult(const char* str)
+{
+    const size_t count = (sizeof(result) / sizeof(result[0])) / incy;
+    size_t idx = 0;
+    printf("%s:\n", str);
+    while (idx < count) {
+        printf("%d\n", (int)result[idx * incy]);
+        idx++;
+    }
+}
+
+/* SGEMV sample driver. Returns 1 if OpenCL/clBLAS setup or the clblasSgemv call fails, 0 otherwise. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and fill them (errors from these calls are not checked). */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * N * sizeof(*A),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * sizeof(*Y),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        M * N * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        N * sizeof(*X), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+        M * sizeof(*Y), Y, 0, NULL, NULL);
+
+    /* Run SGEMV on the sub-problem that skips the first row and column of A (see the off* constants). */
+    err = clblasSgemv(order, transA, M - off, N - off, alpha,
+                           bufA, offA, lda, bufX, offX, incx, beta,
+                           bufY, offY, incy, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSgemv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for the calculation to finish, then drop our event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch the whole Y buffer from GPU memory into the 'result' array. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, M * sizeof(*result),
+                                  result, 0, NULL, NULL);
+        puts("");
+        printResult("clblasSgemv result");
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sger.c b/src/samples/example_sger.c
new file mode 100644
index 0000000..84ca3ba
--- /dev/null
+++ b/src/samples/example_sger.c
@@ -0,0 +1,174 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* For simplicity, this example uses a predefined matrix and predefined
+ * vectors together with their characteristics.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+static const size_t M = 5;
+static const size_t N = 5;
+static const cl_float alpha = 10;
+
+static cl_float A[] = {
+    11, 12, 13, 14, 15,
+    21, 22, 23, 24, 25,
+    31, 32, 33, 34, 35,
+    41, 42, 43, 44, 45,
+	81, 22, 33, 14, 75
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_float X[] = {   /* 6 values listed; main() uploads only the first 1 + (M-1)*|incx| = 5 */
+    11,
+    21,
+    31,
+    41,
+    51,
+	91,
+};
+static const int incx = 1;      /* stride between consecutive elements of X */
+
+static const cl_float Y[] = {   /* 6 values listed; main() uploads only the first 1 + (N-1)*|incy| = 5 */
+    45,
+    23,
+    39,
+    45,
+    50,
+	10,
+};
+static const int incy = 1;      /* stride between consecutive elements of Y */
+
+
+/* Print the host copy of A as M rows of N tab-prefixed values,
+ * walking the array linearly and ending a row after every N-th value. */
+static void
+printResult(void)
+{
+    size_t pos;
+    printf("\nResult:\n");
+    for (pos = 0; pos < M * N; pos++) {
+        printf("\t%f", A[pos]);
+        if ((pos + 1) % N == 0) { printf("\n"); }
+    }
+}
+
+/* SGER sample driver. Returns 1 if OpenCL/clBLAS setup or the clblasSger call fails, 0 otherwise. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and fill them (errors from these calls are not checked). */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * lda * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        M * lda * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+        ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float), Y, 0, NULL, NULL);
+
+    /* Call clblas function on the full M x N matrix (all buffer offsets are 0). */
+    err = clblasSger(order, M, N, alpha, bufX, 0, incx, bufY, 0, incy,
+                     bufA, 0, lda, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSger() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for the calculation to finish, then drop our event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Read the updated matrix back from GPU memory into the host array A. */
+        err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (M * lda * sizeof(cl_float)),
+                                  A, 0, NULL, NULL);
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_snrm2.c b/src/samples/example_snrm2.c
new file mode 100644
index 0000000..4bd59bb
--- /dev/null
+++ b/src/samples/example_snrm2.c
@@ -0,0 +1,132 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* For simplicity, this example uses a predefined input vector and its
+ * characteristics.
+ */
+static const size_t N = 7;
+static cl_float X[] = {
+    1,
+    2,
+    -11,
+    17,
+    5,
+    6,
+    81,
+};
+static const int incx = 1;
+static cl_float NRM2;
+
+/* SNRM2 sample driver. Returns 1 if OpenCL/clBLAS setup or the clblasSnrm2 call fails, 0 otherwise. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, bufNRM2, scratchBuff;
+    cl_event event = NULL;
+    int ret = 0;
+    int lenX = 1 + (N-1)*abs(incx);   /* number of floats spanned by the strided vector X */
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place the vector inside bufX (errors are not checked). */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
+    /* bufNRM2 holds the single result value; scratchBuff is temporary workspace sized for 2*N floats. */
+    bufNRM2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err);
+    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (2*N*sizeof(cl_float)), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSnrm2(N, bufNRM2, 0, bufX, 0, incx, scratchBuff,
+                                    1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSnrm2() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for the calculation to finish, then drop our event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch the computed norm from GPU memory into NRM2. */
+        err = clEnqueueReadBuffer(queue, bufNRM2, CL_TRUE, 0, sizeof(cl_float),
+                                    &NRM2, 0, NULL, NULL);
+        printf("Result Euclidean Norm: %f\n", NRM2);
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufNRM2);
+    clReleaseMemObject(scratchBuff);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_srot.c b/src/samples/example_srot.c
new file mode 100644
index 0000000..937dce0
--- /dev/null
+++ b/src/samples/example_srot.c
@@ -0,0 +1,165 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* For simplicity, this example uses predefined vectors and rotation
+ * coefficients.
+ */
+static const size_t N = 5;
+static cl_float X[] = {
+    1,
+    2,
+    3,
+    4,
+	5,
+};
+static const int incx = 1;
+
+static cl_float Y[] = {
+    6,
+    7,
+    8,
+    9,
+	9,
+};
+static const int incy = 1;
+
+static const cl_float C = 2.0;
+static const cl_float S = 3.0;
+
+/* Print the host copies of X and then Y, one element per line. */
+static void
+printResult(void)
+{
+    const cl_float *p;
+    printf("\nResult:\n");
+    printf("X\n");
+    for (p = X; p != X + N; ++p) {
+        printf("\t%f\n", *p);
+    }
+    printf("Y\n");
+    for (p = Y; p != Y + N; ++p) {
+        printf("\t%f\n", *p);
+    }
+}
+
+/* SROT sample driver: uploads X and Y, applies clblasSrot with the
+ * coefficients C and S, and prints both vectors before and after the call.
+ * Returns 1 if OpenCL/clBLAS setup or the clblasSrot call fails, 0 otherwise. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+    int lenX = 1 + (N-1)*abs(incx);   /* number of floats spanned by the strided vector X */
+    int lenY = 1 + (N-1)*abs(incy);   /* number of floats spanned by the strided vector Y */
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place the vectors inside them (errors are not checked). */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
+
+    /* Show the input vectors before the rotation is applied. */
+    printResult();
+
+    /* Call clblas function. */
+    err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSrot() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for the calculation to finish, then drop our event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
+                                    Y, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
+                                    X, 0, NULL, NULL);
+
+        /* Both vectors were read back, so X and Y now hold the SROT results. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_srotg.c b/src/samples/example_srotg.c
new file mode 100644
index 0000000..555b132
--- /dev/null
+++ b/src/samples/example_srotg.c
@@ -0,0 +1,137 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* For simplicity, this example uses predefined scalar inputs rather than
+ * vectors or matrices.
+ */
+static cl_float SA = 11;
+static cl_float SB = 21;
+static cl_float C = 0.2;
+static cl_float S = 0.5;
+
+static void
+printResult(void)
+{   /* Print a "Result:" header followed by the host copies of SA, SB, C and S on one line. */
+    printf("\nResult:\n");
+    printf("SA: %f\tSB: %f\t C: %f\tS: %f\n", SA, SB, C, S);
+}
+
+/* SROTG sample driver. Returns 1 if OpenCL/clBLAS setup or the clblasSrotg call fails, 0 otherwise. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufSA, bufSB, bufC, bufS;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare one single-float OpenCL buffer per scalar and upload it (errors are not checked). */
+    bufSA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err);
+    bufSB = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err);
+    bufC  = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err);
+    bufS  = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufSA, CL_TRUE, 0, sizeof(cl_float), &SA, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufSB, CL_TRUE, 0, sizeof(cl_float), &SB, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, sizeof(cl_float), &C, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufS, CL_TRUE, 0, sizeof(cl_float), &S, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSrotg(bufSA, 0, bufSB, 0, bufC, 0, bufS, 0, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSrotg() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for the calculation to finish, then drop our event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufSA, CL_TRUE, 0, sizeof(cl_float), &SA, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufSB, CL_TRUE, 0, sizeof(cl_float), &SB, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(cl_float), &C, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufS, CL_TRUE, 0, sizeof(cl_float), &S, 0, NULL, NULL);
+
+        /* All four scalars were read back: SA, SB, C and S now hold the SROTG results. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufSA);
+    clReleaseMemObject(bufSB);
+    clReleaseMemObject(bufC);
+    clReleaseMemObject(bufS);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_srotm.c b/src/samples/example_srotm.c
new file mode 100644
index 0000000..8117247
--- /dev/null
+++ b/src/samples/example_srotm.c
@@ -0,0 +1,171 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* For simplicity, this example uses predefined vectors and a predefined
+ * rotation parameter array.
+ */
+static const size_t N = 7;
+static cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51,
+    61,
+    71,
+};
+static const int incx = 1;
+
+static cl_float Y[] = {
+    15,
+    11,
+    1,
+    2,
+    1,
+    8,
+    1,
+};
+static const int incy = 1;
+
+static cl_float SPARAM[] = {
+    -1,
+    10,
+    12,
+    20,
+    2
+};
+
+/* Print the host copies of X and then Y, one element per line. */
+static void
+printResult(void)
+{
+    const cl_float *p;
+    printf("\nResult:\n");
+    printf("X\n");
+    for (p = X; p != X + N; ++p) {
+        printf("\t%f\n", *p);
+    }
+    printf("Y\n");
+    for (p = Y; p != Y + N; ++p) {
+        printf("\t%f\n", *p);
+    }
+}
+
+/* SROTM sample driver. Returns 1 if OpenCL/clBLAS setup or the clblasSrotm call fails, 0 otherwise. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, bufY, bufParam;
+    cl_event event = NULL;
+    int ret = 0;
+    int lenX = 1 + (N-1)*abs(incx);   /* number of floats spanned by the strided vector X */
+    int lenY = 1 + (N-1)*abs(incy);   /* number of floats spanned by the strided vector Y */
+    int lenParam = 5;                 /* number of entries in SPARAM */
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and upload the vectors and parameters (errors are not checked). */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);
+    bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSrotm(N, bufX, 0, incx, bufY, 0, incy, bufParam, 0, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSrotm() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for the calculation to finish, then drop our event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Read both vectors back from GPU memory; X and Y then hold the SROTM results. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
+                                    Y, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
+                                    X, 0, NULL, NULL);
+
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufParam);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_srotmg.c b/src/samples/example_srotmg.c
new file mode 100644
index 0000000..fbf099e
--- /dev/null
+++ b/src/samples/example_srotmg.c
@@ -0,0 +1,151 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined scalars and one parameter array instead
+ * of matrices, to keep it simple.
+ */
+static cl_float SD1 = 10;
+static cl_float SD2 = 21;
+static cl_float SX1 = 1;
+static cl_float SY1 = -1;
+static cl_float SPARAM[] = {
+    -1,
+    10,
+    12,
+    20,
+    2
+};
+
+static void
+printResult(void)
+{
+    /* Report the host copies of the four scalars and the five SPARAM entries. */
+    printf("\nResult:\n" "SD1: %f,\tSD2: %f,\t SX1: %f,\tSY1: %f\nSPARAM: %f %f %f %f %f\n",
+           SD1, SD2, SX1, SY1, SPARAM[0], SPARAM[1], SPARAM[2], SPARAM[3], SPARAM[4]);
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufD1, bufD2, bufX1, bufY1, bufParam;
+    cl_event event = NULL;
+    int ret = 0;
+    int lenParam = 5;
+    /* Returns 1 if OpenCL/clblas setup or the clblasSrotmg() call fails, else 0.
+     * Setup OpenCL environment first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and copy the host scalars and SPARAM into them. */
+    bufD1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err);
+    bufD2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err);
+    bufX1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err);
+    bufY1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err);
+    bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err);
+    /* NOTE: status codes from the creations above and the blocking writes below are overwritten, not checked. */
+    err = clEnqueueWriteBuffer(queue, bufD1, CL_TRUE, 0, sizeof(cl_float), &SD1, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufD2, CL_TRUE, 0, sizeof(cl_float), &SD2, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX1, CL_TRUE, 0, sizeof(cl_float), &SX1, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY1, CL_TRUE, 0, sizeof(cl_float), &SY1, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL);
+
+
+    /* Call clblas function. */
+    err = clblasSrotmg(bufD1, 0, bufD2, 0, bufX1, 0, bufY1, 0, bufParam, 0, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSrotmg() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);   /* drop the reference returned by the clblas call */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufD1, CL_TRUE, 0, sizeof(cl_float), &SD1, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufD2, CL_TRUE, 0, sizeof(cl_float), &SD2, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufX1, CL_TRUE, 0, sizeof(cl_float), &SX1, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufY1, CL_TRUE, 0, sizeof(cl_float), &SY1, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufParam, CL_TRUE, 0,
+                        (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL);
+
+        /* At this point SD1, SD2, SX1, SY1 and SPARAM hold the values read back after SROTMG. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufD1);
+    clReleaseMemObject(bufD2);
+    clReleaseMemObject(bufX1);
+    clReleaseMemObject(bufY1);
+    clReleaseMemObject(bufParam);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_ssbmv.c b/src/samples/example_ssbmv.c
new file mode 100644
index 0000000..c679ac3
--- /dev/null
+++ b/src/samples/example_ssbmv.c
@@ -0,0 +1,171 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined matrix, predefined vectors and fixed
+ * scalars to keep it simple.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasUplo uplo = clblasUpper;
+static const size_t N = 5;
+static const size_t K = 2;
+static const cl_float alpha = 10;
+/* A is written out as N rows of lda values each. */
+static const cl_float A[] = {
+    11, 12, 13,
+    22, 23, 24,
+    33, 34, 35,
+    44, 45, 00,
+    55, 00, 00
+};
+static const size_t lda = 3;    /* lda = K + 1 */
+
+static const cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incx = 1;
+/* Y is the only mutable array: main() reads the result back into it. */
+static const cl_float beta = 20;
+static cl_float Y[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incy = 1;
+
+
+static void
+printResult(const char* str)
+{
+    /* Prints the caption, then N entries of Y taken at stride incy. */
+    size_t row = 0;
+    printf("%s:\n", str);
+    while (row < N) {
+        printf("%f\n", Y[ row * incy ]);
+        row++;
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+    /* Returns 1 if OpenCL/clblas setup or the clblasSsbmv() call fails, else 0.
+     * Setup OpenCL environment first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                          NULL, &err);
+    /* NOTE: status codes from the creations above and the blocking writes below are overwritten, not checked. */
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+                                N * lda * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                                N * sizeof(cl_float), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+                                N * sizeof(cl_float), Y, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSsbmv(order, uplo, N, K, alpha, bufA, 0, lda, bufX, 0, incx,
+                            beta, bufY, 0, incy, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSsbmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);   /* drop the reference returned by the clblas call */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0,
+                                  N * sizeof(cl_float), Y, 0, NULL, NULL);
+        /* At this point you will get the result of SSBMV placed in Y array. */
+        printResult("clblasSsbmv result");
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sscal.c b/src/samples/example_sscal.c
new file mode 100644
index 0000000..5040c38
--- /dev/null
+++ b/src/samples/example_sscal.c
@@ -0,0 +1,141 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined vector X and scalar alpha to keep it
+ * simple. X is mutable because main() reads the result back into it.
+ */
+static const clblasOrder order = clblasRowMajor;
+/* NOTE: order is not referenced anywhere else in this example. */
+static const size_t N = 5;
+static const cl_float alpha = 10;
+static cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51,
+};
+static const int incx = 1;
+
+static void
+printResult(void)
+{
+    /* One tab-indented element of X per output line, N lines in total. */
+    size_t idx;
+
+    printf("\nResult:\n");
+    for (idx = 0; idx < N; ++idx) {
+        printf("\t%f" "\n", X[idx]);
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX;
+    cl_event event = NULL;
+    int ret = 0;
+    size_t lenX = 1 + (N - 1) * (size_t)(incx < 0 ? -incx : incx);  /* |incx| inline: <stdlib.h> (abs) is not included */
+    /* Returns 1 if OpenCL/clblas setup or the clblasSscal() call fails, else 0.
+     * Setup OpenCL environment first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place vectors inside them. */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, ( lenX * sizeof(cl_float)),
+                          NULL, &err);
+    /* NOTE: status codes from the creation above and the blocking write below are overwritten, not checked. */
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        ( lenX * sizeof(cl_float)), X, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSscal( N, alpha, bufX, 0, incx, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasSscal() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);   /* drop the reference returned by the clblas call */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX * sizeof(cl_float)),
+                                  X, 0, NULL, NULL);
+        /* At this point you will get the result of SSCAL placed in X vector. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sspmv.c b/src/samples/example_sspmv.c
new file mode 100644
index 0000000..9bf003e
--- /dev/null
+++ b/src/samples/example_sspmv.c
@@ -0,0 +1,170 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined packed matrix, predefined vectors and
+ * fixed scalars to keep it simple.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const size_t N = 5;
+static const cl_float alpha = 10;
+/* AP lists one triangle of an N x N matrix in packed form: N*(N+1)/2 = 15 values. */
+static const clblasUplo uplo = clblasUpper;
+static const cl_float AP[] = {
+    11, 12, 13, 14, 15,
+        22, 23, 24, 25,
+            33, 34, 35,
+                44, 45,
+                    55
+};
+static const size_t lda = 5;    /* i.e. lda = N; not referenced elsewhere in this example */
+
+static const cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incx = 1;
+/* Y is the only mutable array: main() reads the result back into it. */
+static const cl_float beta = 20;
+static cl_float Y[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incy = 1;
+
+static void
+printResult(void)
+{
+    /* Header first, then N entries of Y (stride incy) with three decimals. */
+    size_t k;
+    printf("Result:\n\n");
+    for (k = 0; k != N; ++k) {
+        printf("%.3f\n", Y[k * incy]);
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufAP, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0, numElementsAP;
+    /* Returns 1 if OpenCL/clblas setup or the clblasSspmv() call fails, else 0.
+     * Setup OpenCL environment first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    numElementsAP = (N * (N+1)) / 2;    /* number of elements in a packed N x N triangle */
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufAP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                          NULL, &err);
+    /* NOTE: status codes from the creations above and the blocking writes below are overwritten, not checked. */
+    err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
+                        numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                        N * sizeof(cl_float), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+                        N * sizeof(cl_float), Y, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSspmv(order, uplo, N, alpha, bufAP, 0, bufX, 0, incx,
+                                beta, bufY, 0, incy, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSspmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);   /* drop the reference returned by the clblas call */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float),
+                                            Y, 0, NULL, NULL);
+        /* At this point you will get the result of SSPMV placed in Y array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufAP);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sspr.c b/src/samples/example_sspr.c
new file mode 100644
index 0000000..cdba6ad
--- /dev/null
+++ b/src/samples/example_sspr.c
@@ -0,0 +1,168 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined packed matrix, a predefined vector and
+ * a fixed scalar to keep it simple.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const size_t N = 5;
+static cl_float alpha = 10.0;
+static const clblasUplo uplo = clblasUpper;
+/* Packed triangle of an N x N matrix: N*(N+1)/2 = 15 values; main() reads the result back into it. */
+static cl_float AP[] = {
+    1.0, 02.0, 03.0, 04.0, 05.0,
+         06.0, 07.0, 08.0, 09.0,
+               10.0, 11.0, 12.0,
+                     13.0, 14.0,
+                           15.0
+};
+
+static const cl_float X[] = {
+    1.0,
+    2.0,
+    3.0,
+    4.0,
+    5.0
+};
+static const int incx = 1;
+
+static void
+printResult(void)
+{
+    /* Walks an N x N grid row by row. Cells outside the triangle selected by uplo
+     * are padded with two tabs; every other cell prints the next element of AP. */
+    size_t row, col;
+    size_t next = 0;
+    printf("\nResult:\n");
+    for (row = 0; row < N; row++) {
+        for (col = 0; col < N; col++) {
+            int skipped = (uplo == clblasUpper) ? (row > col)
+                        : (uplo == clblasLower) ? (col > row) : 0;
+            if (skipped) {
+                printf("\t\t");
+            } else {
+                printf("%8.2lf\t", AP[next++]);
+            }
+        }
+        printf("\n");
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufAP, bufX;
+    cl_event event = NULL;
+    int ret = 0, numElementsAP;
+    /* Returns 1 if OpenCL/clblas setup or the clblasSspr() call fails, else 0.
+     * Setup OpenCL environment first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+
+    numElementsAP = (N * (N+1)) / 2;    /* number of elements in a packed N x N triangle */
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numElementsAP * sizeof(cl_float),
+                            NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                            NULL, &err);
+    /* NOTE: status codes from the creations above and the blocking writes below are overwritten, not checked. */
+    err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
+                            numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                            N * sizeof(cl_float), X, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSspr(order, uplo, N, alpha, bufX, 0 /*offx */, incx,
+                     bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasSspr() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);   /* drop the reference returned by the clblas call */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float)),
+                                  AP, 0, NULL, NULL);
+        /* At this point you will get the result of SSPR placed in AP array. */
+        printResult();
+    }
+
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufAP);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sspr2.c b/src/samples/example_sspr2.c
new file mode 100644
index 0000000..2427f50
--- /dev/null
+++ b/src/samples/example_sspr2.c
@@ -0,0 +1,180 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses a predefined packed matrix, two predefined vectors
+ * and a fixed scalar to keep it simple.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const size_t N = 5;
+static cl_float alpha = 10.0;
+static const clblasUplo uplo = clblasUpper;
+/* Packed triangle of an N x N matrix: N*(N+1)/2 = 15 values; main() reads the result back into it. */
+static cl_float AP[] = {
+    01.0, 02.0, 03.0, 04.0, 05.0,
+          06.0, 07.0, 08.0, 09.0,
+                10.0, 11.0, 12.0,
+                      13.0, 14.0,
+                            15.0
+};
+
+static const cl_float X[] = {
+    1.0,
+    2.0,
+    3.0,
+    4.0,
+    5.0
+};
+static const int incx = 1;
+
+static const cl_float Y[] = {
+	5.0,
+	4.0,
+	3.0,
+	2.0,
+	1.0
+};
+static const int incy = 1;
+
+static void
+printResult(void)
+{
+    /* Walks an N x N grid row by row. Cells outside the triangle selected by uplo
+     * are padded with two tabs; every other cell prints the next element of AP. */
+    size_t row, col;
+    size_t next = 0;
+    printf("\nResult:\n");
+    for (row = 0; row < N; row++) {
+        for (col = 0; col < N; col++) {
+            int skipped = (uplo == clblasUpper) ? (row > col)
+                        : (uplo == clblasLower) ? (col > row) : 0;
+            if (skipped) {
+                printf("\t\t");
+            } else {
+                printf("%8.2lf\t", AP[next++]);
+            }
+        }
+        printf("\n");
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufAP, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0, numElementsAP;
+    /* Returns 1 if OpenCL/clblas setup or the clblasSspr2() call fails, else 0.
+     * Setup OpenCL environment first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    numElementsAP = (N * (N+1)) / 2;    /* number of elements in a packed N x N triangle */
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_float)),
+                            NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                            NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                            NULL, &err);
+    /* NOTE: status codes from the creations above and the blocking writes below are overwritten, not checked. */
+    err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
+                numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                N * sizeof(cl_float), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+                N * sizeof(cl_float), Y, 0, NULL, NULL);
+    /* Call clblas function. */
+    err = clblasSspr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy,
+                      bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasSspr2() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);   /* drop the reference returned by the clblas call */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float)),
+                                  AP, 0, NULL, NULL);
+        /* At this point you will get the result of SSPR2 placed in AP array. */
+        printResult();
+    }
+
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufAP);
+    clReleaseMemObject(bufY);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_sswap.c b/src/samples/example_sswap.c
new file mode 100644
index 0000000..d3c44c8
--- /dev/null
+++ b/src/samples/example_sswap.c
@@ -0,0 +1,162 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined vectors and their characteristics for
+ * simplicity.
+ */
+static const size_t N = 7;
+
+static cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51,
+    61,
+    71,
+};
+static const int incx = 1;
+
+static cl_float Y[] = {
+    45,
+    23,
+    39,
+    45,
+    50,
+    55,
+    65,
+};
+static const int incy = 1;
+
+
+/* Dump both host vectors to stdout: a "Result:" banner, then X and Y, one element per line. */
+static void
+printResult(void)
+{
+    const cl_float *px = X;
+    const cl_float *py = Y;
+    printf("\nResult:\n");
+    printf(" X\n");
+    while (px != X + N) {
+        printf("\t%f\n", *px++);
+    }
+    printf("Y\n");
+    while (py != Y + N) {
+        printf("\t%f\n", *py++);
+    }
+}
+
+/* Run the SSWAP sample end to end; returns 0 on success, 1 when setup or the clblas call fails. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+    size_t lenX = 1 + (N - 1) * (size_t)(incx < 0 ? -incx : incx);  /* 1 + (N-1)*|incx|, no abs() needed */
+    size_t lenY = 1 + (N - 1) * (size_t)(incy < 0 ? -incy : incy);  /* 1 + (N-1)*|incy|, no abs() needed */
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place vectors inside them. */
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSswap( N, bufX, 0, incx, bufY, 0, incy, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSswap() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then release the event (nothing else releases it). */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
+                                    X, 0, NULL, NULL);
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
+                                    Y, 0, NULL, NULL);
+        /* At this point the result of SSWAP is in the host vectors X and Y. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_ssymm.c b/src/samples/example_ssymm.c
new file mode 100644
index 0000000..d97d057
--- /dev/null
+++ b/src/samples/example_ssymm.c
@@ -0,0 +1,178 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+#define M  4
+#define N  3
+
+static const cl_float alpha = 10;
+
+static const clblasSide side = clblasLeft;
+static const clblasUplo uplo = clblasLower;
+static const cl_float A[M*M] = {
+    11, -1, -1, -1,
+    21, 22, -1, -1,
+    31, 32, 33, -1,
+    41, 42, 43, 44,
+};
+static const size_t lda = M;
+
+static const cl_float B[M*N] = {
+    11, 12, 13,
+    21, 22, 23,
+    31, 32, 33,
+    41, 42, 43,
+};
+static const size_t ldb = N;
+
+static const cl_float beta = 20;
+
+static cl_float C[M*N] = {
+    11, 12, 13,
+    21, 22, 23,
+    31, 32, 33,
+    41, 42, 43,
+};
+static const size_t ldc = N;
+
+/* Print C truncated to ints: ldc values per output line, each value followed by a space. */
+static void
+printResult(void)
+{
+    const size_t rowCount = (sizeof(C) / sizeof(cl_float)) / ldc;
+    size_t k;
+
+    printf("Result:\n");
+    for (k = 0; k < rowCount * ldc; k++) {
+        printf("%d ", (int)C[k]);
+        if ((k + 1) % ldc == 0) {
+            printf("\n");
+        }
+    }
+}
+
+/* Run the SSYMM sample end to end; returns 0 on success, 1 when setup or the clblas call fails. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufB, bufC;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A),
+                          NULL, &err);
+    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * N * sizeof(*B),
+                          NULL, &err);
+    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        M * M * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
+        M * N * sizeof(*B), B, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
+        M * N * sizeof(*C), C, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasSsymm(order, side, uplo, M, N, alpha, bufA,
+                         0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue,
+                         0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSsymm() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then release the event (nothing else releases it). */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C),
+                                  C, 0, NULL, NULL);
+        /* At this point you will get the result of SYMM placed in C array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufC);
+    clReleaseMemObject(bufB);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_ssymv.c b/src/samples/example_ssymv.c
new file mode 100644
index 0000000..79fdfd7
--- /dev/null
+++ b/src/samples/example_ssymv.c
@@ -0,0 +1,182 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+static const size_t N = 5;
+
+static const cl_float alpha = 10;
+
+static const clblasUplo uplo = clblasUpper;
+static const cl_float A[] = {
+    11, 12, 13, 14, 15,
+     0, 22, 23, 24, 25,
+     0,  0, 33, 34, 35,
+     0,  0,  0, 44, 45,
+     0,  0,  0,  0, 55
+};
+static const size_t lda = 5;        /* i.e. lda = N */
+
+static const cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incx = 1;
+
+static const cl_float beta = 20;
+
+static cl_float Y[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incy = 1;
+
+static cl_float result[5];          /* N */
+
+static const size_t off  = 1;
+static const size_t offa = 5 + 1;   /* N + off */
+static const size_t offx = 1;       /* off */
+static const size_t offy = 1;       /* off */
+
+/* Print the label `str` and a colon, then every incy-th entry of `result` as an int, one per line. */
+static void
+printResult(const char* str)
+{
+    const size_t count = (sizeof(result) / sizeof(cl_float)) / incy;
+    size_t pos = 0, printed = 0;
+
+    printf("%s:\n", str);
+    for (; printed < count; printed++, pos += incy) {
+        printf("%d\n", (int)result[pos]);
+    }
+}
+
+/* Run the SSYMV sample end to end; returns 0 on success, 1 when setup or the clblas call fails. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * N * sizeof(*A),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * N * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        N * sizeof(*X), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+        N * sizeof(*Y), Y, 0, NULL, NULL);
+
+    /* Call clblas function on the (N - off)-sized sub-problem addressed through offa/offx/offy. */
+    err = clblasSsymv(order, uplo, N - off, alpha, bufA, offa, lda,
+        bufX, offx, incx,  beta, bufY, offy, incy, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSsymv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then release the event (nothing else releases it). */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*result),
+                                  result, 0, NULL, NULL);
+        /* At this point you will get the result of SSYMV placed in 'result' array. */
+        puts("");
+        printResult("clblasSsymv result");
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_ssyr.c b/src/samples/example_ssyr.c
new file mode 100644
index 0000000..8a0f045
--- /dev/null
+++ b/src/samples/example_ssyr.c
@@ -0,0 +1,161 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+static const size_t N = 5;
+
+static cl_float alpha = 10.0;
+
+static const clblasUplo uplo = clblasUpper;
+static cl_float A[] = {
+    1.0, 2.0,  3.0,  4.0,  5.0,
+    0.0, 6.0,  7.0,  8.0,  9.0,
+    0.0, 0.0, 10.0, 11.0, 12.0,
+    0.0, 0.0,  0.0, 13.0, 14.0,
+    0.0, 0.0,  0.0, 00.0, 15.0
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_float X[] = {
+    1.0,
+    2.0,
+    3.0,
+    4.0,
+    5.0
+};
+static const int incx = 1;
+
+/* Print the N x N matrix held in A (row stride lda), one row per line, entries formatted as "(%.2f)". */
+static void
+printResult(void)
+{
+    size_t i, j;
+    printf("\nResult:\n");
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++)
+            printf("\t(%.2f)", A[i * lda + j]);  /* stride by lda, the leading dimension given to clblasSsyr */
+        printf("\n");
+    }
+}
+
+/* Run the SSYR sample end to end; returns 0 on success, 1 when setup or the clblas call fails. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    printResult();  /* show A before the update (printed under the same "Result:" banner) */
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * lda * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        N * sizeof(cl_float), X, 0, NULL, NULL);
+
+    err = clblasSsyr(order, uplo, N, alpha, bufX, 0 /*offx */, incx,
+                        bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasSsyr() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then release the event (nothing else releases it). */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_float)),
+                                  A, 0, NULL, NULL);
+        /* At this point you will get the result of SSYR placed in A array. */
+        printResult();
+    }
+
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_ssyr2.c b/src/samples/example_ssyr2.c
new file mode 100644
index 0000000..09f7c72
--- /dev/null
+++ b/src/samples/example_ssyr2.c
@@ -0,0 +1,175 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+
+static const size_t N = 5;
+
+static cl_float alpha = 10.0;
+
+static const clblasUplo uplo = clblasUpper;
+static cl_float A[] = {
+    1.0, 2.0,  3.0,  4.0,  5.0,
+    0.0, 6.0,  7.0,  8.0,  9.0,
+    0.0, 0.0, 10.0, 11.0, 12.0,
+    0.0, 0.0,  0.0, 13.0, 14.0,
+    0.0, 0.0,  0.0, 00.0, 15.0
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_float X[] = {
+    1.0,
+    2.0,
+    3.0,
+    4.0,
+    5.0
+};
+
+static const cl_float Y[] = {
+	5.0,
+	4.0,
+	3.0,
+	2.0,
+	1.0
+};
+
+
+static const int incx = 1, incy = 1;
+
+/* Print the N x N matrix held in A (row stride lda), one row per line, entries formatted as "(%.2f)". */
+static void
+printResult(void)
+{
+    size_t i, j;
+    printf("\nResult:\n");
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++)
+            printf("\t(%.2f)", A[i * lda + j]);  /* stride by lda, the leading dimension given to clblasSsyr2 */
+        printf("\n");
+    }
+}
+
+/* Run the SSYR2 sample end to end; returns 0 on success, 1 when setup or the clblas call fails. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    printResult();  /* show A before the update (printed under the same "Result:" banner) */
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * lda * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        N * sizeof(cl_float), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+        N * sizeof(cl_float), Y, 0, NULL, NULL);
+
+    err = clblasSsyr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy,
+                        bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasSsyr2() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then release the event (nothing else releases it). */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_float)),
+                                  A, 0, NULL, NULL);
+        /* At this point you will get the result of SSYR2 placed in A array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+    clReleaseMemObject(bufY);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_ssyr2k.c b/src/samples/example_ssyr2k.c
new file mode 100644
index 0000000..64ef255
--- /dev/null
+++ b/src/samples/example_ssyr2k.c
@@ -0,0 +1,193 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasUplo uplo = clblasUpper;
+static const clblasTranspose transAB = clblasNoTrans;
+
+static const size_t N = 5;
+static const size_t K = 4;
+
+static const cl_float alpha = 10;
+
+static const cl_float A[] = {
+    11, 12, 13, 14,
+    21, 22, 23, 24,
+    31, 32, 33, 34,
+    41, 42, 43, 44,
+    51, 52, 53, 54
+};
+static const size_t lda = 4;        /* i.e. lda = K */
+
+static cl_float B[] = {
+    11, 12, 13, 14,
+    21, 22, 23, 24,
+    31, 32, 33, 34,
+    41, 42, 43, 44,
+    51, 52, 53, 54
+};
+static const size_t ldb = 4;        /* i.e. ldb = K */
+
+static const cl_float beta = 20;
+
+static cl_float C[] = {
+    11, 12, 13, 14, 15,
+    12, 22, 23, 24, 25,
+    13, 23, 33, 34, 35,
+    14, 24, 34, 44, 45,
+    15, 25, 35, 45, 55
+};
+static const size_t ldc = 5;        /* i.e. ldc = N */
+
+static cl_float result[5*5];        /* ldc * N */
+
+static const size_t off  = 1;       /* leading rows/columns left out of the sub-matrices */
+static const size_t offA = 4 + 1;   /* K + off */
+static const size_t offB = 4 + 1;   /* K + off */
+static const size_t offC = 5 + 1;   /* N + off */
+
+/* Print `str` and a colon, then `result` as ints in rows of ldc values, each value followed by a space. */
+static void
+printResult(const char* str)
+{
+    const size_t rows = (sizeof(result) / sizeof(cl_float)) / ldc;
+    size_t r, c;
+    printf("%s:\n", str);
+    for (r = 0; r < rows; r++) {
+        const cl_float *line = result + r * ldc;
+        for (c = 0; c < ldc; c++) {
+            printf("%d ", (int)line[c]);
+        }
+        printf("\n");
+    }
+}
+
+/* Run the SSYR2K sample end to end; returns 0 on success, 1 when setup or the clblas call fails. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufB, bufC;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A),
+                          NULL, &err);
+    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*B),
+                          NULL, &err);
+    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * K * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
+        N * K * sizeof(*B), B, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
+        N * N * sizeof(*C), C, 0, NULL, NULL);
+
+    /* Call clblas function. Perform SYR2K for the lower right sub-matrices. */
+    err = clblasSsyr2k(order, uplo, transAB, N - off, K - off,
+                         alpha, bufA, offA, lda, bufB, offB, ldb,
+                         beta, bufC, offC, ldc, 1, &queue,
+                         0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasSsyr2k() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then release the event (nothing else releases it). */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
+                                  N * N * sizeof(*result),
+                                  result, 0, NULL, NULL);
+
+        /* At this point you will get the result of SSYR2K placed in 'result' array. */
+        puts("");
+        printResult("clblasSsyr2k result");
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufC);
+    clReleaseMemObject(bufB);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_ssyrk.c b/src/samples/example_ssyrk.c
new file mode 100644
index 0000000..3de1eb8
--- /dev/null
+++ b/src/samples/example_ssyrk.c
@@ -0,0 +1,175 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasUplo uplo = clblasUpper;
+static const clblasTranspose transA = clblasNoTrans;
+
+static const size_t N = 5;          /* rows of A; rows and columns of C */
+static const size_t K = 4;          /* columns of A */
+
+static const cl_float alpha = 10;   /* passed as alpha to clblasSsyrk */
+
+static const cl_float A[] = {
+    11, 12, 13, 14,
+    21, 22, 23, 24,
+    31, 32, 33, 34,
+    41, 42, 43, 44,
+    51, 52, 53, 54
+};
+static const size_t lda = 4;        /* row stride of A in elements, i.e. lda = K */
+
+static const cl_float beta = 20;    /* passed as beta to clblasSsyrk */
+
+static cl_float C[] = {
+    11, 12, 13, 14, 15,
+    12, 22, 23, 24, 25,
+    13, 23, 33, 34, 35,
+    14, 24, 34, 44, 45,
+    15, 25, 35, 45, 55
+};
+static const size_t ldc = 5;        /* row stride of C in elements, i.e. ldc = N */
+
+static cl_float result[5*5];        /* host copy of C read back from the device: ldc * N values */
+
+static const size_t off  = 1;       /* first row and column of the sub-matrices passed to clblasSsyrk */
+static const size_t offA = 4 + 1;   /* lda * off + off, i.e. K + off: element (off, off) of A */
+static const size_t offC = 5 + 1;   /* ldc * off + off, i.e. N + off: element (off, off) of C */
+
+static void
+printResult(const char* str)
+{
+    /* Heading first, then 'result' as rows of ldc values truncated to int. */
+    const size_t rowCount = (sizeof(result) / sizeof(cl_float)) / ldc;
+    size_t row, col;
+    printf("%s:\n", str);
+    for (row = 0; row < rowCount; row++) {
+        const cl_float *line = &result[row * ldc];
+        for (col = 0; col < ldc; col++) {
+            printf("%d ", (int)line[col]);
+        }
+        printf("\n");
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufC;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Entry point: returns 0 once SSYRK was issued successfully, 1 if setup or the call failed. Set up OpenCL first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A),
+                          NULL, &err);
+    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * K * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
+        N * N * sizeof(*C), C, 0, NULL, NULL);
+
+    /* Perform SYRK on the lower right sub-matrices that start at offA / offC. */
+    err = clblasSsyrk(order, uplo, transA, N - off, K - off,
+                         alpha, bufA, offA, lda, beta, bufC, offC, ldc,
+                         1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasSsyrk() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);  /* drop our reference; the event is not used again */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
+                                  N * N * sizeof(*result),
+                                  result, 0, NULL, NULL);
+
+        /* At this point you will get the result of SSYRK placed in 'result' array. */
+        puts("");
+        printResult("clblasSsyrk result");
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufC);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_stbmv.c b/src/samples/example_stbmv.c
new file mode 100644
index 0000000..217df15
--- /dev/null
+++ b/src/samples/example_stbmv.c
@@ -0,0 +1,157 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasUplo uplo = clblasUpper;
+static const size_t N = 5;      /* rows of A and entries of X */
+static const size_t K = 2;      /* band argument of clblasStbmv; every row of A holds K + 1 values */
+
+static const cl_float A[] = {
+    11, 12, 13,
+    22, 23, 24,
+    33, 34, 35,
+    44, 45, 00,
+    55, 00, 00
+};
+static const size_t lda = 3;    /* row stride of A in elements: lda = K + 1 */
+
+static cl_float X[] = {         /* input vector; overwritten with the values read back from the device */
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incx = 1;      /* stride between consecutive entries of X */
+
+static void
+printResult(void)
+{
+    size_t idx = 0;     /* logical element number; X is stepped by incx */
+    printf("Result:\n\n");
+    while (idx < N) {
+        printf("%.3f\n", X[idx * incx]);
+        idx++;
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, scratchBuff;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Entry point: returns 0 once STBMV was issued successfully, 1 if setup or the call failed. Set up OpenCL first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects. X is both input and result and the scratch buffer is never filled by the host, so both are read-write. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                          NULL, &err);
+    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                                 NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+                                N * lda * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                                N * sizeof(cl_float), X, 0, NULL, NULL);
+
+    err = clblasStbmv(order, uplo, clblasNoTrans, clblasNonUnit, N, K,
+                      bufA, 0 /* offA */, lda, bufX, 0 /* offX */, incx,
+                      scratchBuff, 1, &queue, 0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasStbmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);  /* drop our reference; the event is not used again */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X),
+                                  X, 0, NULL, NULL);
+        /* At this point you will get the result of STBMV placed in X array. */
+        printResult();
+    }
+
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(scratchBuff);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_stbsv.c b/src/samples/example_stbsv.c
new file mode 100644
index 0000000..6e059c7
--- /dev/null
+++ b/src/samples/example_stbsv.c
@@ -0,0 +1,158 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasTranspose trans = clblasTrans;
+static const clblasUplo uplo = clblasLower;
+static const clblasDiag diag = clblasNonUnit;
+static const size_t N = 5;      /* rows of A and entries of X */
+static const size_t K = 2;      /* band argument of clblasStbsv; every row of A holds K + 1 values */
+
+static const cl_float A[] = {
+    11, 12, 13,
+    22, 23, 24,
+    33, 34, 35,
+    44, 45, 00,
+    55, 00, 00
+};
+static const size_t lda = 3;    /* row stride of A in elements: lda = K + 1 */
+
+static cl_float X[] = {         /* input vector; overwritten with the values read back from the device */
+    11,
+	21,
+	31,
+	41,
+	51
+};
+static const int incx = 1;      /* stride between consecutive entries of X */
+
+static void
+printResult(void)
+{
+    size_t idx = 0;     /* logical element number; X is stepped by incx */
+    printf("Result:\n\n");
+    while (idx < N) {
+         printf("%f \n", X[idx * incx]);
+         idx++;
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Entry point: returns 0 once STBSV was issued successfully, 1 if setup or the call failed. Set up OpenCL first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+                                N * lda * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                                N * sizeof(cl_float), X, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasStbsv(order, uplo, trans, diag, N, K,
+                         bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasStbsv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);  /* drop our reference; the event is not used again */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float),
+                                  X, 0, NULL, NULL);
+
+        /* At this point you will get the result of STBSV placed in X array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_stpmv.c b/src/samples/example_stpmv.c
new file mode 100644
index 0000000..7350962
--- /dev/null
+++ b/src/samples/example_stpmv.c
@@ -0,0 +1,158 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasUplo uplo = clblasUpper;
+static const size_t N = 5;      /* entries of X; AP holds N * (N + 1) / 2 values */
+
+static const cl_float AP[] = {  /* upper triangle listed row by row, zeros below the diagonal left out */
+    11, 12, 13, 14, 15,
+        22, 23, 24, 25,
+            33, 34, 35,
+                44, 45,
+                    55
+};
+
+static cl_float X[] = {         /* input vector; overwritten with the values read back from the device */
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incx = 1;      /* stride between consecutive entries of X */
+
+static void
+printResult(void)
+{
+    size_t idx = 0;     /* logical element number; X is stepped by incx */
+    printf("Result:\n\n");
+    while (idx < N) {
+        printf("%.3f\n", X[idx * incx]);
+        idx++;
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufAP, bufX, scratchBuff;
+    cl_event event = NULL;
+    int ret = 0, numElementsAP;
+
+    /* Entry point: returns 0 once STPMV was issued successfully, 1 if setup or the call failed. Set up OpenCL first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    numElementsAP = (N * (N+1)) / 2;    /* number of elements in a packed triangular matrix */
+
+    /* Prepare OpenCL memory objects. X is both input and result and the scratch buffer is never filled by the host, so both are read-write. */
+    bufAP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float),
+                           NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                          NULL, &err);
+    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                                 NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
+                               numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                               N * sizeof(cl_float), X, 0, NULL, NULL);
+
+    err = clblasStpmv(order, uplo, clblasTrans, clblasUnit, N, bufAP, 0 /* offA */,
+                      bufX, 0 /* offX */, incx, scratchBuff,
+                      1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasStpmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);  /* drop our reference; the event is not used again */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float),
+                                  X, 0, NULL, NULL);
+        /* At this point you will get the result of STPMV placed in X array. */
+        printResult();
+    }
+
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(scratchBuff);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufAP);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_stpsv.c b/src/samples/example_stpsv.c
new file mode 100644
index 0000000..5454869
--- /dev/null
+++ b/src/samples/example_stpsv.c
@@ -0,0 +1,159 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const size_t N = 5;      /* entries of X; A holds N * (N + 1) / 2 values */
+
+static const clblasTranspose transA = clblasTrans;
+static const clblasUplo uploA = clblasUpper;
+static const clblasDiag diagA = clblasUnit;
+static const cl_float A[] = {   /* upper triangle listed row by row, zeros below the diagonal left out */
+    11, 12, 13, 14, 15,
+        22, 23, 24, 25,
+            33, 34, 35,
+                44, 45,
+                    55
+};
+static const size_t lda = 0;    /* not referenced by main(): the clblasStpsv call takes no leading dimension */
+
+static cl_float X[] = {         /* right-hand side; overwritten with the values read back from the device */
+    11.0,
+    153.0,
+    657.0,
+    1753.0,
+    3671.0
+};
+static const int incx = 1;      /* stride between consecutive entries of X */
+
+static void
+printResult(void)
+{
+    size_t idx = 0;     /* logical element number; X is stepped by incx */
+    printf("Result:\n\n");
+    while (idx < N) {
+         printf("%f \n", X[idx * incx]);
+         idx++;
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX;
+    cl_event event = NULL;
+    int ret = 0, numElementsAP;
+
+    /* Entry point: returns 0 once STPSV was issued successfully, 1 if setup or the call failed. Set up OpenCL first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    numElementsAP = (N * (N+1)) / 2;    /* number of elements in a packed triangular matrix */
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+                    numElementsAP * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                    N * sizeof(cl_float), X, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasStpsv(order, uploA, transA, diagA, N,
+                         bufA, 0, bufX, 0, incx, 1, &queue, 0,
+                         NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasStpsv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);  /* drop our reference; the event is not used again */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float),
+                                  X, 0, NULL, NULL);
+
+        /* At this point you will get the result of STPSV placed in X array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_strmm.c b/src/samples/example_strmm.c
new file mode 100644
index 0000000..d5643a5
--- /dev/null
+++ b/src/samples/example_strmm.c
@@ -0,0 +1,173 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * simplicity purpose.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasSide side = clblasLeft;
+
+static const size_t M = 4;              /* rows of B; rows and columns of A */
+static const size_t N = 5;              /* columns of B */
+
+static const cl_float alpha = 10;       /* passed as alpha to clblasStrmm */
+
+static const clblasTranspose transA = clblasNoTrans;
+static const clblasUplo uploA = clblasUpper;
+static const clblasDiag diagA = clblasNonUnit;
+static const cl_float A[] = {
+    11, 12, 13, 14,
+     0, 22, 23, 24,
+     0,  0, 33, 34,
+     0,  0,  0, 44
+};
+static const size_t lda = 4;            /* row stride of A in elements, i.e. lda = M */
+
+static cl_float B[] = {
+    11, 12, 13, 14, 15,
+    21, 22, 23, 24, 25,
+    31, 32, 33, 34, 35,
+    41, 42, 43, 44, 45
+};
+static const size_t ldb = 5;            /* row stride of B in elements, i.e. ldb = N */
+
+static cl_float result[20];             /* host copy of B read back from the device: ldb * M values */
+
+static const size_t off  = 1;           /* first row and column of the sub-matrices passed to clblasStrmm */
+static const size_t offA = 4 + 1;       /* lda * off + off: element (off, off) of A */
+static const size_t offB = 5 + 1;       /* ldb * off + off: element (off, off) of B */
+
+static void
+printResult(const char* str)
+{
+    /* Heading first, then 'result' as rows of ldb values truncated to int. */
+    const size_t rowCount = (sizeof(result) / sizeof(cl_float)) / ldb;
+    size_t row, col;
+    printf("%s:\n", str);
+    for (row = 0; row < rowCount; row++) {
+        const cl_float *line = &result[row * ldb];
+        for (col = 0; col < ldb; col++) {
+            printf("%d ", (int)line[col]);
+        }
+        printf("\n");
+    }
+}
+
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufB;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Entry point: returns 0 once STRMM was issued successfully, 1 if setup or the call failed. Set up OpenCL first. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A),
+                          NULL, &err);
+    bufB = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*B),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        M * M * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
+        M * N * sizeof(*B), B, 0, NULL, NULL);
+
+    /* Perform TRMM on the lower right sub-matrices that start at offA / offB. */
+    err = clblasStrmm(order, side, uploA, transA, diagA, M - off, N - off,
+                         alpha, bufA, offA, lda, bufB, offB, ldb, 1, &queue,
+                         0, NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasStrmm() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);  /* drop our reference; the event is not used again */
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufB, CL_TRUE, 0,
+                                  M * N * sizeof(*result),
+                                  result, 0, NULL, NULL);
+
+        /* At this point you will get the result of STRMM placed in 'result' array. */
+        puts("");
+        printResult("clblasStrmm result");
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufB);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_strmv.c b/src/samples/example_strmv.c
new file mode 100644
index 0000000..4df0923
--- /dev/null
+++ b/src/samples/example_strmv.c
@@ -0,0 +1,157 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * the sake of simplicity.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasUplo uplo = clblasUpper;
+static const size_t N = 5;
+
+static const cl_float A[] = {
+    11, 12, 13, 14, 15,
+     0, 22, 23, 24, 25,
+     0,  0, 33, 34, 35,
+     0,  0,  0, 44, 45,
+     0,  0,  0,  0, 55
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static cl_float X[] = {
+    11,
+    21,
+    31,
+    41,
+    51
+};
+static const int incx = 1;
+
+/* Print N elements of X, taking every incx-th entry, one "%.3f" value per line. */
+static void
+printResult(void)
+{
+    size_t idx = 0;
+    printf("Result:\n\n");
+    while (idx < N) {
+        printf("%.3f\n", X[idx++ * incx]);
+    }
+}
+
+/* Run clblasStrmv on the first available GPU and print X; returns 0 on success, 1 on failure. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, scratchBuff;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*X),
+                          NULL, &err);      /* X is both input and output */
+    scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*X),
+                          NULL, &err);      /* work space for the library */
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * lda * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        N * sizeof(*X), X, 0, NULL, NULL);
+
+    err = clblasStrmv(order, uplo, clblasTrans, clblasUnit, N,
+                      bufA, 0 /* offA */, lda, bufX, 0 /* offX */, incx,
+                      scratchBuff, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasStrmv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then drop the event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X),
+                                  X, 0, NULL, NULL);
+        /* At this point you will get the result of STRMV placed in X array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(scratchBuff);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_strsm.c b/src/samples/example_strsm.c
new file mode 100644
index 0000000..6d5010a
--- /dev/null
+++ b/src/samples/example_strsm.c
@@ -0,0 +1,175 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * the sake of simplicity.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const clblasSide side = clblasLeft;
+
+static const size_t M = 4;
+static const size_t N = 5;
+
+static const cl_float alpha = 10;
+
+static const clblasTranspose transA = clblasNoTrans;
+static const clblasUplo uploA = clblasUpper;
+static const clblasDiag diagA = clblasNonUnit;
+static const cl_float A[] = {
+    11, 12, 13, 14,
+     0, 22, 23, 24,
+     0,  0, 33, 34,
+     0,  0,  0, 44
+};
+static const size_t lda = 4;        /* i.e. lda = M */
+
+static cl_float B[] = {
+    11, 12, 13, 14, 15,
+    21, 22, 23, 24, 25,
+    31, 32, 33, 34, 35,
+    41, 42, 43, 44, 45
+};
+static const size_t ldb = 5;        /* i.e. ldb = N */
+
+
+static cl_float result[20];         /* ldb*M */
+
+static const size_t off  = 1;       /* leading rows and columns of A and B that are skipped */
+static const size_t offA = 4 + 1;   /* lda * off + off (= M + off here): element (off, off) of A */
+static const size_t offB = 5 + 1;   /* ldb * off + off (= N + off here): element (off, off) of B */
+
+/* Print 'str' as a heading, then 'result' as rows of ldb values in "%.5e" format. */
+static void
+printResult(const char* str)
+{
+    const size_t rowCount = (sizeof(result) / sizeof(cl_float)) / ldb;
+    size_t row, col;
+    printf("%s:\n", str);
+    for (row = 0; row < rowCount; row++) {
+        const cl_float *line = result + row * ldb;
+        for (col = 0; col < ldb; col++) {
+            printf("%.5e ", line[col]);
+        }
+        printf("\n");
+    }
+}
+
+/* Run clblasStrsm on the first available GPU and print 'result'; returns 0 on success, 1 on failure. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufB;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A),
+                          NULL, &err);
+    bufB = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*B),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        M * M * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
+        M * N * sizeof(*B), B, 0, NULL, NULL);
+
+    /* Call clblas function. Perform TRSM for the lower right sub-matrices */
+    err = clblasStrsm(order, side, uploA, transA, diagA, M - off, N - off,
+                         alpha, bufA, offA, lda, bufB, offB, ldb, 1, &queue, 0,
+                         NULL, &event);
+    if (err != CL_SUCCESS) {
+        printf("clblasStrsm() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then drop the event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufB, CL_TRUE, 0,
+                                  M * N * sizeof(*result),
+                                  result, 0, NULL, NULL);
+
+        /* At this point you will get the result of STRSM placed in 'result' array. */
+        puts("");
+        printResult("clblasStrsm result");
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufB);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_strsv.c b/src/samples/example_strsv.c
new file mode 100644
index 0000000..0b4d828
--- /dev/null
+++ b/src/samples/example_strsv.c
@@ -0,0 +1,155 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * the sake of simplicity.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const size_t N = 4;
+
+static const clblasTranspose transA = clblasTrans;
+static const clblasUplo uploA = clblasLower;
+static const clblasDiag diagA = clblasNonUnit;
+static const cl_float A[] = {
+    11,  0,  0,  0,
+    12, 22,  0,  0,
+    13, 23, 33,  0,
+    14, 24, 34, 44
+};
+static const size_t lda = 4;    /* i.e. lda = N */
+
+static cl_float X[] = {
+    11,
+	21,
+	31,
+	41
+};
+static const int incx = 1;
+
+/* Print N elements of X, taking every incx-th entry, one "%f " value per line. */
+static void
+printResult(void)
+{
+    size_t pos;
+    printf("Result:\n\n");
+    for (pos = 0; pos != N; ++pos) {
+        printf("%f \n", X[pos * incx]);
+    }
+}
+
+/* Run clblasStrsv on the first available GPU and print X; returns 0 on success, 1 on failure. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * N * sizeof(cl_float),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+                    N * N * sizeof(cl_float), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                    N * sizeof(cl_float), X, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasStrsv(order, uploA, transA, diagA, N,
+                         bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasStrsv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then drop the event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float),
+                                  X, 0, NULL, NULL);
+        /* At this point you will get the result of STRSV placed in X array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_zhemv.cpp b/src/samples/example_zhemv.cpp
new file mode 100644
index 0000000..182560c
--- /dev/null
+++ b/src/samples/example_zhemv.cpp
@@ -0,0 +1,179 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+/* This example uses predefined matrices and their characteristics for
+ * the sake of simplicity.
+ */
+static const clblasOrder order = clblasColumnMajor;
+
+static const size_t N = 5;
+
+static const cl_double2 alpha = {{10,10}};
+static const clblasUplo uplo = clblasUpper;
+static const cl_double2 A[] = {
+	{{ 1.0, 00.0}}, {{ 2.0, 02.0}}, {{ 4.0,  4.0}}, {{ 7.0,  7.0}}, {{11.0, 11.0}},
+    {{00.0, 00.0}}, {{ 3.0, 03.0}}, {{ 5.0,  5.0}}, {{ 8.0,  8.0}}, {{12.0, 12.0}},
+    {{00.0, 00.0}}, {{00.0, 00.0}}, {{ 6.0,  6.0}}, {{ 9.0,  9.0}}, {{13.0, 13.0}},
+    {{00.0, 00.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{10.0, 10.0}}, {{14.0, 14.0}},
+	{{00.0, 00.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{15.0, 15.0}}
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_double2 X[] = {
+	{{1.0, 0.0}},
+	{{2.0, 0.0}},
+	{{3.0, 0.0}},
+	{{4.0, 0.0}},
+	{{5.0, 0.0}}
+};
+static const int incx = 1;
+
+static const cl_double2 beta = {{20.0, 20.0}};
+
+static cl_double2 Y[] = {
+	{{1.0, 0.0}},
+    {{2.0, 0.0}},
+    {{3.0, 0.0}},
+    {{4.0, 0.0}},
+    {{5.0, 0.0}}
+};
+static const int incy = 1;
+
+/* Print every incy-th element of Y as "(real, imag)", one element per line. */
+static void
+printResult(void)
+{
+    const size_t count = (sizeof(Y) / sizeof(cl_double2)) / incy;
+    size_t k;
+    printf("Result:\n");
+    for (k = 0; k < count; k++) {
+        const size_t at = k * incy;
+        printf("(%9.2f, %-9.2f)\n", CREAL(Y[at]), CIMAG(Y[at]));
+    }
+}
+
+/* Run clblasZhemv on the first available GPU and print Y; returns 0 on success, 1 on failure. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+        N * lda * sizeof(*A), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+        N * sizeof(*X), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+        N * sizeof(*Y), Y, 0, NULL, NULL);
+
+    /* Call clblas function. */
+    err = clblasZhemv(order, uplo, N, alpha, bufA, 0 /* offA */, lda,
+                      bufX, 0 /* offx */, incx, beta,
+                      bufY, 0 /* offy */, incy, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasZhemv() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then fetch Y from GPU memory. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y),
+                                  Y, 0, NULL, NULL);
+        if (err != CL_SUCCESS) {
+            printf("clEnqueueReadBuffer() failed with %d\n", err);
+            ret = 1;
+        }
+        else {
+            /* At this point you will get the result of ZHEMV placed in Y array. */
+            printResult();
+        }
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufY);
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_zher2.c b/src/samples/example_zher2.c
new file mode 100644
index 0000000..258f548
--- /dev/null
+++ b/src/samples/example_zher2.c
@@ -0,0 +1,172 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * the sake of simplicity.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const size_t N = 5;
+static cl_double2 alpha = {{10.0f, 2.0f}};
+static const clblasUplo uplo = clblasUpper;
+
+static cl_double2 A[] = {
+    {{11.0f, 00.0f}}, {{12.0f, 02.0f}}, {{13.0f, 05.0f}}, {{14.0f, 12.0f}}, {{15.0f, 06.0f}},
+    {{00.0f, 00.0f}}, {{22.0f, 00.0f}}, {{23.0f, 25.0f}}, {{24.0f, 23.0f}}, {{25.0f, 61.0f}},
+    {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{33.0f, 00.0f}}, {{34.0f, 23.0f}}, {{35.0f, 21.0f}},
+    {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{44.0f, 00.0f}}, {{45.0f, 23.0f}},
+    {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{55.0f, 00.0f}}
+};
+static const size_t lda = 5;    /* i.e. lda = N */
+
+static const cl_double2 X[] = {
+    {{11.0f, 03.0f}},
+    {{01.0f, 15.0f}},
+    {{30.0f, 20.0f}},
+    {{01.0f, 02.0f}},
+    {{11.0f, 10.0f}}
+};
+static const int incx = 1;
+
+static const cl_double2 Y[] = {
+    {{11.0f, 03.0f}},
+    {{03.0f, 05.0f}},
+    {{09.0f, 00.0f}},
+    {{01.0f, 02.0f}},
+    {{11.0f, 00.0f}}
+};
+static const int incy = 1;
+
+
+
+/* Print the N x N row-major matrix A; consecutive rows are lda elements apart. */
+static void
+printResult(void)
+{
+    size_t i, j;
+    printf("\nResult:\n");
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++)
+            printf("(%9.2lf, %-9.2lf)\t", CREAL( A[ i*lda + j ] ), CIMAG( A[ i*lda + j ] ));
+        printf("\n");
+    }
+}
+
+/* Run clblasZher2 on the first available GPU and print the updated A; returns 0 on success, 1 on failure. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_double2),
+                          NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2),
+                          NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2),
+                          NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
+                    N * lda * sizeof(cl_double2), A, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                    N * sizeof(cl_double2), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+                    N * sizeof(cl_double2), Y, 0, NULL, NULL);
+
+    err = clblasZher2(order, uplo, N, alpha, bufX, 0 /* offx */, incx, bufY, 0 /* offy */, incy,
+                      bufA, 0 /* offa */, lda, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasZher2() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then drop the event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_double2)),
+                                  A, 0, NULL, NULL);
+        /* At this point you will get the result of ZHER2 placed in A array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufA);
+    clReleaseMemObject(bufY);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/samples/example_zhpr2.c b/src/samples/example_zhpr2.c
new file mode 100644
index 0000000..6224cbd
--- /dev/null
+++ b/src/samples/example_zhpr2.c
@@ -0,0 +1,179 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Include CLBLAS header. It automatically includes needed OpenCL header,
+ * so we can drop out explicit inclusion of cl.h header.
+ */
+#include <clBLAS.h>
+
+/* This example uses predefined matrices and their characteristics for
+ * the sake of simplicity.
+ */
+static const clblasOrder order = clblasRowMajor;
+static const size_t N = 5;
+static cl_double2 alpha = {{10.0f, 2.0f}};
+static const clblasUplo uplo = clblasUpper;
+
+static cl_double2 AP[] = {
+    {{11.0f, 00.0f}}, {{12.0f, 02.0f}}, {{13.0f, 05.0f}}, {{14.0f, 12.0f}}, {{15.0f, 06.0f}},
+                      {{22.0f, 00.0f}}, {{23.0f, 25.0f}}, {{24.0f, 23.0f}}, {{25.0f, 61.0f}},
+                                        {{33.0f, 00.0f}}, {{34.0f, 23.0f}}, {{35.0f, 21.0f}},
+                                                          {{44.0f, 00.0f}}, {{45.0f, 23.0f}},
+                                                                            {{55.0f, 00.0f}}
+};
+
+static const cl_double2 X[] = {
+    {{11.0f, 03.0f}},
+    {{01.0f, 15.0f}},
+    {{30.0f, 20.0f}},
+    {{01.0f, 02.0f}},
+    {{11.0f, 10.0f}}
+};
+static const int incx = 1;
+
+static const cl_double2 Y[] = {
+    {{11.0f, 03.0f}},
+    {{03.0f, 05.0f}},
+    {{09.0f, 00.0f}},
+    {{01.0f, 02.0f}},
+    {{11.0f, 00.0f}}
+};
+static const int incy = 1;
+
+/* Print packed AP as an N x N grid, padding positions outside the 'uplo' triangle with tabs. */
+static void
+printResult(void)
+{
+    size_t row, col;
+    size_t next = 0;    /* index of the next unprinted element of AP */
+    printf("\nResult:\n");
+    for (row = 0; row < N; row++) {
+        for (col = 0; col < N; col++) {
+            const int below = (row > col), above = (col > row);
+            if ((below && (uplo == clblasUpper)) || (above && (uplo == clblasLower))) {
+                printf("\t\t\t");
+            }
+            else {
+                printf("(%9.2lf, %-9.2lf)\t", CREAL( AP[ next ] ), CIMAG( AP[ next ] ));
+                next++;
+            }
+        }
+        printf("\n");
+    }
+}
+
+/* Run clblasZhpr2 on the first available GPU and print the updated AP; returns 0 on success, 1 on failure. */
+int
+main(void)
+{
+    cl_int err;
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufAP, bufX, bufY;
+    cl_event event = NULL;
+    int ret = 0;
+    size_t numElementsAP = (N * (N + 1)) / 2;   /* number of elements in a packed matrix */
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs(1, &platform, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetPlatformIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    if (err != CL_SUCCESS) {
+        printf( "clGetDeviceIDs() failed with %d\n", err );
+        return 1;
+    }
+
+    props[1] = (cl_context_properties)platform;
+    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateContext() failed with %d\n", err );
+        return 1;
+    }
+
+    queue = clCreateCommandQueue(ctx, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf( "clCreateCommandQueue() failed with %d\n", err );
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Setup clblas. */
+    err = clblasSetup();
+    if (err != CL_SUCCESS) {
+        printf("clblasSetup() failed with %d\n", err);
+        clReleaseCommandQueue(queue);
+        clReleaseContext(ctx);
+        return 1;
+    }
+
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_double2)),
+                            NULL, &err);
+    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2),
+                            NULL, &err);
+    bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2),
+                            NULL, &err);
+
+    err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
+                               numElementsAP * sizeof(cl_double2), AP, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
+                               N * sizeof(cl_double2), X, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
+                               N * sizeof(cl_double2), Y, 0, NULL, NULL);
+
+    err = clblasZhpr2(order, uplo, N, alpha, bufX, 0 /* offx */, incx, bufY, 0 /* offy */, incy,
+                      bufAP, 0 /* offa */, 1, &queue, 0, NULL, &event);
+
+    if (err != CL_SUCCESS) {
+        printf("clblasZhpr2() failed with %d\n", err);
+        ret = 1;
+    }
+    else {
+        /* Wait for calculations to be finished, then drop the event reference. */
+        err = clWaitForEvents(1, &event);
+        clReleaseEvent(event);
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_double2)),
+                                  AP, 0, NULL, NULL);
+        /* At this point you will get the result of ZHPR2 placed in AP array. */
+        printResult();
+    }
+
+    /* Release OpenCL memory objects. */
+    clReleaseMemObject(bufX);
+    clReleaseMemObject(bufAP);
+    clReleaseMemObject(bufY);
+
+    /* Finalize work with clblas. */
+    clblasTeardown();
+
+    /* Release OpenCL working objects. */
+    clReleaseCommandQueue(queue);
+    clReleaseContext(ctx);
+
+    return ret;
+}
diff --git a/src/scripts/perf/CMakeLists.txt b/src/scripts/perf/CMakeLists.txt
new file mode 100644
index 0000000..7b71a09
--- /dev/null
+++ b/src/scripts/perf/CMakeLists.txt
@@ -0,0 +1,30 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+# Scripts of the performance suite that get installed.
+set( GRAPHING_SCRIPTS
+    measurePerformance.py
+    plotPerformance.py
+    blasPerformanceTesting.py
+    errorHandler.py
+    performanceUtility.py
+)
+
+# Destination is bin64 when TARGET_PLATFORM is 64 and bin32 in every other case.
+set( BIN_DIR bin32 )
+if( TARGET_PLATFORM EQUAL 64 )
+    set( BIN_DIR bin64 )
+endif()
+
+install( FILES ${GRAPHING_SCRIPTS} DESTINATION ${BIN_DIR} )
diff --git a/src/scripts/perf/blasPerformanceTesting.py b/src/scripts/perf/blasPerformanceTesting.py
new file mode 100644
index 0000000..bf2298b
--- /dev/null
+++ b/src/scripts/perf/blasPerformanceTesting.py
@@ -0,0 +1,333 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+import itertools
+import re#gex
+import subprocess
+import os
+import sys
+from datetime import datetime
+
+# Common data and functions for the performance suite
+
+tableHeader = 'lengthx,lengthy,lengthz,batch,device,inlay,outlay,place,ldsComp,ldsFrac,cache,xfactor,label,GFLOPS'
+
+class TestCombination:
+    """Parameter set describing one test configuration.
+
+    Every constructor argument is stored as given. __str__ concatenates the
+    stored values, so they must all be strings by the time it is called.
+    """
+    def __init__(self,
+                 lengthx, lengthy, lengthz, batchsize,
+                 device, inlayout, outlayout, placeness,
+                 ldscomplex, ldsfraction, cachesize, xfactor,
+                 label):
+        self.x, self.y, self.z = lengthx, lengthy, lengthz
+        self.batchsize = batchsize
+        self.device = device
+        self.inlayout, self.outlayout = inlayout, outlayout
+        self.placeness = placeness
+        self.ldscomplex, self.ldsfraction = ldscomplex, ldsfraction
+        self.cachesize = cachesize
+        self.xfactor = xfactor
+        self.label = label
+
+    def __str__(self):
+        # one-line human readable summary of the whole configuration
+        pieces = [
+            self.x, 'x', self.y, 'x', self.z, ':', self.batchsize,
+            ', ', self.device,
+            ', ', self.inlayout, '/', self.outlayout,
+            ', ', self.placeness,
+            ', LDS comp(', self.ldscomplex,
+            '), LDS frac(', self.ldsfraction,
+            '), cachesz(', self.cachesize,
+            '), X-factor(', self.xfactor,
+            ') -- ', self.label,
+        ]
+        return ''.join(pieces)
+
+class GraphPoint:
+    """One measured data point.
+
+    All arguments are stored unchanged. problemsize is derived from them: the
+    product of the three lengths and the batch size, kept as a string.
+    """
+    def __init__(self,
+                 lengthx, lengthy, lengthz, batchsize,
+                 ldsfraction, device, label,
+                 gflops):
+        self.x, self.y, self.z = lengthx, lengthy, lengthz
+        self.batchsize = batchsize
+        self.device = device
+        self.label = label
+        self.ldsfraction = ldsfraction
+        self.gflops = gflops
+        # product of the lengths and the batch size
+        product = 1
+        for extent in (self.x, self.y, self.z, self.batchsize):
+            product = product * int(extent)
+        self.problemsize = str(product)
+
+    def __str__(self):
+        # every member except the derived problemsize appears in the summary
+        pieces = [
+            self.x, 'x', self.y, 'x', self.z, ':', self.batchsize,
+            ', ', self.device,
+            ', LDS fraction = ', self.ldsfraction,
+            ' -- ', self.label,
+            '; ', self.gflops,
+        ]
+        return ''.join(pieces)
+
+class TableRow:
+    """Pairs a parameter set with its measured GFLOPS value.
+
+    parameters -- a TestCombination instance
+    gflops     -- the measured value; __str__ requires it to be a string
+    """
+    def __init__(self, parameters, gflops):
+        self.parameters, self.gflops = parameters, gflops
+
+    def __str__(self):
+        described = self.parameters.__str__()
+        return described + '; ' + self.gflops
+
+def transformDimension(x,y,z):
+    """Return 3, 2 or 1 depending on the highest length that differs from 1.
+
+    z is checked first, then y, then x. When all three lengths equal 1 the
+    function falls off the end and returns None.
+    """
+    for rank, length in ((3, z), (2, y), (1, x)):
+        if int(length) != 1:
+            return rank
+
+def executable(library):
+    """Return the name of the benchmark client executable for a library.
+
+    library -- 'clblas' or 'acmlblas'
+
+    An error is printed and quit() is called when library is not a string,
+    the platform is neither Windows nor Linux, no client is known for the
+    library (this includes 'null'), or the client file does not exist in the
+    current directory.
+    """
+    if type(library) != str:
+        print 'ERROR: expected library name to be a string'
+        quit()
+
+    # Accept every Linux platform string ('linux2', 'linux', ...) instead of
+    # only the exact value 'linux2'.
+    isWindows = sys.platform == 'win32'
+    isLinux = sys.platform.startswith('linux')
+    if not isWindows and not isLinux:
+        print 'ERROR: unknown operating system'
+        quit()
+
+    # library name -> (Windows client, Linux client)
+    clients = {
+        'clblas':   ('client.exe', './client'),
+        'acmlblas': ('ACMLBlas_client.exe', './ACMLBlas_client'),
+    }
+    if library not in clients:
+        # 'null' used to pass validation and then crash with a NameError,
+        # because no executable name was ever assigned for it.
+        print 'ERROR: unknown library -- cannot determine executable name ' + library
+        quit()
+
+    if isWindows:
+        exe = clients[library][0]
+    else:
+        exe = clients[library][1]
+
+    if not os.path.isfile(exe):
+        error_message = 'ERROR: could not find client named ' + exe
+        print error_message
+        quit()
+
+    return exe
+
+def max_mem_available_in_bytes(exe, device):
+    """Run `exe -i device` and return the MAX_MEM_ALLOC_SIZE it reports.
+
+    The value is taken from the trailing digits of the first output line
+    that mentions MAX_MEM_ALLOC_SIZE and is returned as an int.
+    """
+    output = subprocess.check_output([exe, '-i', device], stderr=subprocess.STDOUT)
+    matching = [line for line in output.split(os.linesep)
+                if line.count('MAX_MEM_ALLOC_SIZE')]
+    trailingDigits = re.search('\d+$', matching[0])
+    return int(trailingDigits.group(0))
+
+def max_problem_size(exe, device):
+    """Return the device's maximum allocation size divided by 8 bytes per datapoint."""
+    # 2 numbers per datapoint (i.e.: real or complex?) times
+    # 4 bytes per number (i.e.: single or double precision?)
+    bytes_in_one_datapoint = 2 * 4
+    return max_mem_available_in_bytes(exe, device) / bytes_in_one_datapoint
+
+def maxBatchSize(lengthx, lengthy, lengthz, exe, device):
+    """Return, as a string, how many problems of the given size fit on the device."""
+    elements = int(lengthx) * int(lengthy) * int(lengthz)
+    batches = max_problem_size(exe, device) / elements
+    if int(lengthx) in (pow(2,16), pow(2,17)):
+        # special cases in the kernel. extra padding is added in, so the
+        # batch size is halved to accommodate it
+        batches = batches/2
+    return str(batches)
+
+def create_ini_file_if_requested(args):
+    """Write the parsed options to args.createIniFilename, then quit.
+
+    Does nothing when no ini file was requested. Options whose value is a
+    file object or None, and options whose name contains 'File', are left
+    out. Each remaining option is written as '--name value; '.
+    """
+    iniFile = args.createIniFilename
+    if not iniFile:
+        return
+
+    for name in vars(args):
+        value = getattr(args, name)
+        if type(value) == file or not (value != None) or name.count('File') != 0:
+            continue
+        iniFile.write('--' + name + ' ')
+        iniFile.write(str(value) + '; ')
+    quit()
+
+def load_ini_file_if_requested(args, parser):
+    """Return the options stored in args.useIniFilename, or args itself.
+
+    When an ini file was given, its whole content is read, every ';' is
+    removed, the rest is split on whitespace and parsed with parser. The
+    result replaces the options given on the command line.
+    """
+    if args.useIniFilename:
+        ini_text = ''.join(args.useIniFilename.readlines())
+        args = parser.parse_args(ini_text.replace(';', '').split())
+    return args
+
+
+def is_numeric_type(x):
+    """True when x is exactly an int, long or float (subclasses do not count)."""
+    return type(x) in (int, long, float)
+
+def split_up_comma_delimited_lists(args):
+    """Turn every option value into a list, in place, and return args.
+
+    None becomes [None], a number becomes a one-element list and a string is
+    split on commas. Values of any other type are left untouched.
+    """
+    for name, value in vars(args).items():
+        if value == None:
+            wrapped = [None]
+        elif is_numeric_type(value):
+            wrapped = [value]
+        elif type(value) == str:
+            wrapped = value.split(',')
+        else:
+            continue
+        setattr(args, name, wrapped)
+    return args
+
+class Range:
+    """Expand a list of range specifications into explicit values.
+
+    The result is available as the attribute `expanded`.
+
+    ranges -- anything that is not a list is stored back unchanged; a list
+              whose first element is None yields [None]. Otherwise each
+              element is converted to a string and read as one of:
+                '+N'      relative size, kept as the string itself
+                'max'     kept as the string 'max'
+                'A'       the single integer A
+                'A-B'     A..B using defaultStep
+                'A-B:S'   A..B in additive steps of S ('+S' is also accepted)
+                'A-B:xS'  A..B multiplying by S each step
+    defaultStep -- step used when a specification carries no ':' part
+
+    The expanded list is de-duplicated and sorted.
+
+    Raises ValueError when a step can never move past the end of its range
+    (for example ':0' or ':x1'); such input used to loop forever.
+    """
+    def __init__(self, ranges, defaultStep='+1'):
+        # we might be passed in a single value or a list of strings
+        # if we receive a single value, we want to feed it right back
+        if type(ranges) != list:
+            self.expanded = ranges
+        elif ranges[0] == None:
+            self.expanded = [None]
+        else:
+            self.expanded = []
+            for thisRange in ranges:
+                thisRange = str(thisRange)
+                spec = thisRange
+                if re.search('^\+\d+$', thisRange):
+                    self.expanded = self.expanded + [thisRange]
+                elif thisRange == 'max':
+                    self.expanded = self.expanded + ['max']
+                else:
+                    if thisRange.count(':'):
+                        self._stepAmount = thisRange.split(':')[1]
+                    else:
+                        self._stepAmount = defaultStep
+                    thisRange = thisRange.split(':')[0]
+
+                    if self._stepAmount.count('x'):
+                        self._stepper = '_mult'
+                    else:
+                        self._stepper = '_add'
+                    self._stepAmount = self._stepAmount.lstrip('+x')
+                    self._stepAmount = int(self._stepAmount)
+
+                    if thisRange.count('-'):
+                        self.begin = int(thisRange.split('-')[0])
+                        self.end = int(thisRange.split('-')[1])
+                    else:
+                        self.begin = int(thisRange.split('-')[0])
+                        self.end = int(thisRange.split('-')[0])
+                    self.current = self.begin
+
+                    if self.begin == 0 and self._stepper == '_mult':
+                        # multiplying 0 never advances; emit the single value
+                        self.expanded = self.expanded + [0]
+                    else:
+                        if self.current <= self.end and not self._step_makes_progress():
+                            raise ValueError('step of range "' + spec + '" never reaches the end of the range')
+                        while self.current <= self.end:
+                            self.expanded = self.expanded + [self.current]
+                            self._step()
+
+            # now we want to uniquify and sort the expanded range
+            self.expanded = list(set(self.expanded))
+            self.expanded.sort()
+
+    # True when repeated stepping is guaranteed to leave the range
+    def _step_makes_progress(self):
+        if self._stepper == '_mult':
+            return abs(self._stepAmount) > 1
+        return self._stepAmount > 0
+
+    # advance current value to next
+    def _step(self):
+        getattr(self, self._stepper)()
+
+    def _mult(self):
+        self.current = self.current * self._stepAmount
+
+    def _add(self):
+        self.current = self.current + self._stepAmount
+
+def expand_range(a_range):
+    """Shortcut for Range(a_range).expanded."""
+    expander = Range(a_range)
+    return expander.expanded
+
+def decode_parameter_problemsize(problemsize):
+    """Split each 'MxNxK:AxBxC' entry into nested lists of strings, in place.
+
+    Each entry is split on ':' and every resulting part on 'x', so
+    '2x2x2:4x6x9' becomes [['2','2','2'], ['4','6','9']]. A list containing
+    None is returned unchanged. The list passed in is modified and returned.
+    """
+    if problemsize.count(None) == 0:
+        for index, entry in enumerate(problemsize):
+            problemsize[index] = [part.split('x') for part in entry.split(':')]
+
+    return problemsize
+
+def blas_table_header():
+    """Return the comma-separated column names of the BLAS results table."""
+    header = 'm,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS'
+    return header
+
+class BlasTestCombination:
+    """All parameters of one BLAS benchmark run.
+
+    Sizes, leading dimensions, offsets, alpha and beta are converted to
+    strings on construction; the remaining arguments are stored as given and
+    must be strings for __str__ to work.
+    """
+    def __init__(self,
+                 sizem, sizen, sizek,
+                 lda, ldb, ldc,
+                 offa, offb, offc,
+                 alpha, beta, order,
+                 transa, transb,
+                 side, uplo, diag,
+                 function, precision,
+                 device, library, label):
+        self.sizem = str(sizem)
+        self.sizen = str(sizen)
+        self.sizek = str(sizek)
+        self.lda = str(lda)
+        self.ldb = str(ldb)
+        self.ldc = str(ldc)
+        self.offa = str(offa)
+        self.offb = str(offb)
+        self.offc = str(offc)
+        self.alpha = str(alpha)
+        self.beta = str(beta)
+        self.order = order
+        self.transa = transa
+        self.transb = transb
+        self.side = side
+        self.uplo = uplo
+        self.diag = diag
+        self.function = function
+        self.precision = precision
+        self.device = device
+        self.library = library
+        self.label = label
+
+    def __str__(self):
+        # Layout: MxNxK:LDAxLDBxLDC:OFFAxOFFBxOFFC, device, <precision><function>, ...
+        # The ':' in front of the offsets used to be missing, which glued ldc
+        # and offa together into one number.
+        return self.sizem + 'x' + self.sizen + 'x' + self.sizek + ':' + self.lda + 'x' + self.ldb + 'x' + self.ldc + ':' + self.offa + 'x' + self.offb + 'x' + self.offc + ', ' + self.device + ', ' + self.precision + self.function + ', ' + self.library + ', alpha(' + self.alpha + '), beta(' + self.beta + '), order(' + self.order + '), transa(' + self.transa + '), transb(' + self.transb + '), side(' + self.side  + '), uplo(' + self.uplo + '), diag(' + self.diag + ') -- ' + self.label
+
+class BlasGraphPoint:
+    """One measured BLAS data point.
+
+    All constructor arguments are stored unchanged. __str__ concatenates a
+    subset of them, so those must be strings.
+    """
+    def __init__(self,
+                 sizem, sizen, sizek,
+                 lda, ldb, ldc,
+                 offa, offb, offc,
+                 device, order, transa, transb,
+                 function, library, label,
+                 gflops):
+        self.sizem, self.sizen, self.sizek = sizem, sizen, sizek
+        self.lda, self.ldb, self.ldc = lda, ldb, ldc
+        self.offa, self.offb, self.offc = offa, offb, offc
+        self.device = device
+        self.order = order
+        self.transa, self.transb = transa, transb
+        self.function = function
+        self.library = library
+        self.label = label
+        self.gflops = gflops
+
+    def __str__(self):
+        # leading dimensions and offsets are not part of the summary
+        pieces = [
+            self.sizem, 'x', self.sizen, 'x', self.sizek, ':', self.device,
+            ', ', self.function,
+            ', ', self.library,
+            ', order(', self.order,
+            '), transa(', self.transa,
+            '), transb(', self.transb,
+            ') -- ', self.label,
+            '; ', self.gflops, ' gflops',
+        ]
+        return ''.join(pieces)
+
+def open_file( filename ):
+    """Open a results file for writing and return the file object.
+
+    filename -- a name, a list whose first element is the name, or None.
+                None selects 'results<timestamp>.txt'. When the named file
+                already exists a timestamp is appended to the name and a
+                message is printed, so the existing file is not overwritten.
+    """
+    def timestamp():
+        # ISO time with ':' replaced by '.'
+        return datetime.now().isoformat().replace(':','.')
+
+    if type(filename) == list:
+        filename = filename[0]
+
+    if filename == None:
+        filename = 'results' + timestamp() + '.txt'
+    elif os.path.isfile(filename):
+        renamed = filename + timestamp()
+        print 'A file with the name ' + filename + ' already exists. Changing filename to ' + renamed
+        filename = renamed
+
+    return open(filename, 'w')
diff --git a/src/scripts/perf/errorHandler.py b/src/scripts/perf/errorHandler.py
new file mode 100644
index 0000000..8471f9d
--- /dev/null
+++ b/src/scripts/perf/errorHandler.py
@@ -0,0 +1,68 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+#---------------------------------File Note------------------------------------
+#Date: 27 January 2012
+#This file defines all the error code and error handler mechanism
+#--------------------------------Global Variables------------------------------
+
+# Error codes
+UINS_CAT = 100
+WIN_REG_SEARCH_FAIL = 101
+UNIMPL_APP = 200
+SYS_ERR = 300
+TIME_OUT = 400
+DIM_INCO_FILE_FMT = 500 #incorrect file format for dimension
+DIM_FILE_VAL_INCO = 501 #Value coming from dimension file is incorrect
+
+#errorTable : maps every error code to its message. Add a new error code
+#             above and its message here
+errorTable = {
+    UINS_CAT:
+        'Application is not able to find the installed catalyst',
+    WIN_REG_SEARCH_FAIL:
+        'Windows Registry search for catalysts version is unsuccessful',
+    UNIMPL_APP:
+        'Unimplemented Application requirement',
+    SYS_ERR:
+        'System error occurred - Please check the source code',
+    TIME_OUT:
+        'Operation is timed out',
+    DIM_INCO_FILE_FMT:
+        'incorrect file format for dimension - Not able to find dimension',
+    DIM_FILE_VAL_INCO:
+        'Value coming from dimension file is incorrect',
+}
+
+#--------------------------------Class Definitions-----------------------------
+class TimeoutException(Exception):
+    """Exception type with no behaviour of its own beyond Exception."""
+
+class ApplicationException(Exception):
+    """Base class for all exceptions generated by the application.
+
+    fileName -- name of the file reporting the error
+    errno    -- an error code; it must be a key of errorTable
+    msg      -- optional text appended to the table message
+    """
+
+    def __init__(self, fileName, errno, msg = ""):
+        self.fileName = fileName
+        self.errno = errno
+        self.mess = errorTable[errno] + msg
+        self.message = 'Application ERROR:' + self._describe()
+
+    # quoted 'file-code-message' text shared by message and __str__
+    def _describe(self):
+        return repr(self.fileName+'-'+str(self.errno)+'-'+self.mess)
+
+    def __str__(self):
+        return self._describe()
+    
+
+#--------------------------------Global Function-------------------------------
+if __name__ == '__main__':
+    # Self-test when run as a script: raise an ApplicationException and show
+    # that it is caught.
+    #print errorTable
+    try:
+        raise ApplicationException('errorHandler', SYS_ERR)
+
+    # bare except: anything raised above ends up here
+    except:
+        print 'Generic exception'
+
diff --git a/src/scripts/perf/measurePerformance.py b/src/scripts/perf/measurePerformance.py
new file mode 100644
index 0000000..659d4ef
--- /dev/null
+++ b/src/scripts/perf/measurePerformance.py
@@ -0,0 +1,543 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+import sys
+import argparse
+import subprocess
+import itertools
+import re#gex
+import os
+from threading import Timer, Thread
+import thread, time
+from platform import system
+from datetime import datetime
+
+import errorHandler
+from blasPerformanceTesting import *
+from performanceUtility import timeout, log
+
+IAM = 'BLAS'
+# Used by checkTimeOutPut as the join timeout while waiting for the client.
+TIMOUT_VAL = 900  #In seconds
+   
+"""
+define and parse parameters
+"""
+# Accepted values per option; they are quoted in the --help texts below.
+devicevalues = ['gpu', 'cpu']
+libraryvalues = ['clblas','acmlblas']
+ordervalues = ['row','column']
+transvalues = ['none','transpose','conj']
+sidevalues = ['left','right']
+uplovalues = ['upper','lower']
+diagvalues = ['unit','nonunit']
+functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv' ]
+precisionvalues = ['s', 'd', 'c', 'z']
+roundtripvalues = ['roundtrip','noroundtrip','both']
+
+parser = argparse.ArgumentParser(description='Measure performance of the clblas library')
+parser.add_argument('--device',
+    dest='device', default='gpu',
+    help='device(s) to run on; may be a comma-delimited list. choices are ' + str(devicevalues) + '. (default gpu)')
+parser.add_argument('-m', '--sizem',
+    dest='sizem', default=None,
+    help='size(s) of m to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 1024 or 100-800:100 or 15,2048-3000')
+parser.add_argument('-n', '--sizen',
+    dest='sizen', default=None,
+    help='size(s) of n to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 1024 or 100-800:100 or 15,2048-3000')
+parser.add_argument('-k', '--sizek',
+    dest='sizek', default=None,
+    help='size(s) of k to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 1024 or 100-800:100 or 15,2048-3000')
+parser.add_argument('-s', '--square',
+    dest='square', default=None,
+    help='size(s) of m=n=k to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. this option sets lda = ldb = ldc to the values indicated with --lda for all problems set with --square. e.g., 1024 or 100-800:100 or 15,2048-3000')
+parser.add_argument('--problemsize',
+    dest='problemsize', default=None,
+    help='additional problems of a set size. may be used in addition to sizem/n/k and lda/b/c. each indicated problem size will be added to the list of problems to complete. should be entered in MxNxK:AxBxC format (where :AxBxC specifies lda/b/c. :AxBxC is optional. if included, lda/b/c are subject to the same range restrictions as indicated in the lda/b/c section of this help. if omitted, :0x0x0 is assumed). may enter multiple in a comma-delimited list. e.g., 2x2x2:4x6x9,3x3x3 or 1024x8 [...]
+parser.add_argument('--lda',
+    dest='lda', default=0,
+    help='value of lda; may include ranges and comma-delimited lists. stepping may be indicated with a colon. if transA = \'n\', lda must be >= \'m\'. otherwise, lda must be >= \'k\'. if this is violated, the problem will be skipped. if lda is 0, it will be automatically set to match either \'m\' (if transA = \'n\') or \'k\' (otherwise). may indicate relative size with +X, where X is the offset relative to M or K (depending on transA). e.g., 1024 or 100-800:100 or 15,2048-3000 or +10 (if [...]
+parser.add_argument('--ldb',
+    dest='ldb', default=0,
+    help='value of ldb; may include ranges and comma-delimited lists. stepping may be indicated with a colon. if transB = \'n\', ldb must be >= \'k\'. otherwise, ldb must be >= \'n\'. if this is violated, the problem will be skipped. if ldb is 0, it will be automatically set to match either \'k\' (if transB = \'n\') or \'n\' (otherwise). may indicate relative size with +X, where X is the offset relative to K or N (depending on transB). e.g., 1024 or 100-800:100 or 15,2048-3000 or +100 (i [...]
+parser.add_argument('--ldc',
+    dest='ldc', default=0,
+    help='value of ldc; may include ranges and comma-delimited lists. stepping may be indicated with a colon. ldc must be >= \'m\'. if this is violated, the problem will be skipped. if ldc is 0, it will be automatically set to match \'m\'. may indicate relative size with +X, where X is the offset relative to M. e.g., 1024 or 100-800:100 or 15,2048-3000 or +5 (if M = 15, ldc = 20) (default 0)')
+parser.add_argument('--offa',
+    dest='offa', default=0,
+    help='offset of the matrix A in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)')
+parser.add_argument('--offb',
+    dest='offb', default=0,
+    help='offset of the matrix B or vector X in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)')
+parser.add_argument('--offc',
+    dest='offc', default=0,
+    help='offset of the matrix C or vector Y in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)')
+parser.add_argument('-a', '--alpha',
+    dest='alpha', default=1.0, type=float,
+    help='specifies the scalar alpha')
+parser.add_argument('-b', '--beta',
+    dest='beta', default=1.0, type=float,
+    help='specifies the scalar beta')
+parser.add_argument('-f', '--function',
+    dest='function', default='gemm',
+    help='indicates the function(s) to use. may be a comma delimited list. choices are ' + str(functionvalues) + ' (default gemm)')
+parser.add_argument('-r', '--precision',
+    dest='precision', default='s',
+    help='specifies the precision for the function. may be a comma delimited list. choices are ' + str(precisionvalues) + ' (default s)')
+parser.add_argument('-o', '--order',
+    dest='order', default='row',
+    help='select row or column major. may be a comma delimited list. choices are ' + str(ordervalues) + ' (default row)')
+parser.add_argument('--transa',
+    dest='transa', default='none',
+    help='select none, transpose, or conjugate transpose for matrix A. may be a comma delimited list. choices are ' + str(transvalues) + ' (default none)')
+parser.add_argument('--transb',
+    dest='transb', default='none',
+    help='select none, transpose, or conjugate transpose for matrix B. may be a comma delimited list. choices are ' + str(transvalues) + ' (default none)')
+parser.add_argument('--side',
+    dest='side', default='left',
+    help='select side, left or right for TRMM and TRSM. may be a comma delimited list. choices are ' + str(sidevalues) + ' (default left)')
+parser.add_argument('--uplo',
+    dest='uplo', default='upper',
+    help='select uplo, upper or lower triangle. may be a comma delimited list. choices are ' + str(uplovalues) + ' (default upper)')
+parser.add_argument('--diag',
+    dest='diag', default='unit',
+    help='select diag, whether set diagonal elements to one. may be a comma delimited list. choices are ' + str(diagvalues) + ' (default unit)')
+parser.add_argument('--library',
+    dest='library', default='clblas',
+    help='indicates the library to use. choices are ' + str(libraryvalues) + ' (default clblas)')
+parser.add_argument('--label',
+    dest='label', default=None,
+    help='a label to be associated with all transforms performed in this run. if LABEL includes any spaces, it must be in \"double quotes\". note that the label is not saved to an .ini file. e.g., --label cayman may indicate that a test was performed on a cayman card or --label \"Windows 32\" may indicate that the test was performed on Windows 32')
+parser.add_argument('--tablefile',
+    dest='tableOutputFilename', default=None,
+    help='save the results to a plaintext table with the file name indicated. this can be used with clblas.plotPerformance.py to generate graphs of the data (default: table prints to screen)')
+parser.add_argument('--roundtrip',
+    dest='roundtrip', default='noroundtrip',
+    help='whether measure the roundtrips or not. choices are ' + str(roundtripvalues) + '. (default noroundtrip); should not be specified when calling ACML')
+ini_group = parser.add_mutually_exclusive_group()
+ini_group.add_argument('--createini',
+    dest='createIniFilename', default=None, type=argparse.FileType('w'),
+    help='create an .ini file with the given name that saves the other parameters given at the command line, then quit. e.g., \'clblas.measurePerformance.py -m 10 -n 100 -k 1000-1010 -f sgemm --createini my_favorite_setup.ini\' will create an .ini file that will save the configuration for an sgemm of the indicated sizes.')
+ini_group.add_argument('--ini',
+    dest='useIniFilename', default=None, type=argparse.FileType('r'),
+    help='use the parameters in the named .ini file instead of the command line parameters.')
+
+args = parser.parse_args()
+
+label = str(args.label)
+roundtrip = str(args.roundtrip)
+library = str(args.library)
+
+# Create the log directory directly instead of shelling out to `mkdir`, which
+# printed an error on every run once the directory existed.
+if not os.path.isdir('perfLog'):
+    os.makedirs('perfLog')
+# One log file per label, e.g. perfLog/<label>-blasMeasurePerfLog.txt
+logfile = os.path.join('perfLog', (label+'-'+'blasMeasurePerfLog.txt'))
+
+def printLog(txt):
+    """Print txt and also pass it to log() together with the logfile path."""
+    print txt
+    log(logfile, txt)
+printLog(roundtrip)
+printLog("=========================MEASURE PERFORMANCE START===========================")
+printLog("Process id of Measure Performance:"+str(os.getpid()))
+
+
+#This function is defunct now
+# NOTE(review): an earlier comment here claimed a 15 minute timeout
+# ("15*60 = 300 secs"), which matches neither the arithmetic nor the argument
+# 5 -- confirm the unit expected by performanceUtility.timeout.
+@timeout(5, "fileName")
+def checkTimeOutPut2(args):
+    """Run the command `args` and return what it wrote to stdout.
+
+    The running process is published in the global currCommandProcess.
+    Raises subprocess.CalledProcessError, carrying the exit code and the
+    captured stderr, when the command produced no stdout output.
+    """
+    global currCommandProcess
+    #ret = subprocess.check_output(args, stderr=subprocess.STDOUT)
+    #return ret
+    currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    printLog("Curr Command Process id = "+str(currCommandProcess.pid))
+    # communicate() returns the pair (stdout, stderr)
+    ret = currCommandProcess.communicate()    
+    if(ret[0] == None or ret[0] == ''):
+        errCode = currCommandProcess.poll()
+        raise subprocess.CalledProcessError(errCode, args, output=ret[1])
+    return ret[0]
+	
+#Starts the library command as a child process and collects its output on a
+#separate thread. The thread is given TIMOUT_VAL seconds (900 = 15 minutes);
+#if it is still alive after that, the child process is killed.
+def checkTimeOutPut(args):
+    """Run the command `args` with a timeout and return its stdout.
+
+    The process and its captured output are kept in the globals
+    currCommandProcess, stdo and stde.
+
+    Raises errorHandler.ApplicationException (TIME_OUT) when the command is
+    still running after TIMOUT_VAL seconds, and subprocess.CalledProcessError
+    (exit code plus captured stderr) when the command produced no stdout.
+    """
+    t = None
+    global currCommandProcess
+    global stde
+    global stdo
+    # reset the results of any previous call
+    stde = None
+    stdo = None
+    def executeCommand():
+        # thread body: wait for the process and store its output in the globals
+        global currCommandProcess
+        global stdo
+        global stde
+        try:
+            stdo, stde = currCommandProcess.communicate()
+            printLog('stdout:\n'+str(stdo))
+            printLog('stderr:\n'+str(stde))
+        except:
+            printLog("ERROR: UNKNOWN Exception - +checkWinTimeOutPut()::executeCommand()")
+
+    currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    thread = Thread(target=executeCommand)
+    thread.start()
+    thread.join(TIMOUT_VAL) #wait for the thread to complete, at most TIMOUT_VAL seconds
+    if thread.is_alive():
+        # join() timed out, so the command is still running
+        printLog('ERROR: Killing the process - terminating thread because it is taking too much of time to execute')
+        currCommandProcess.kill()
+        printLog('ERROR: Timed out exception')
+        raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT)
+    if stdo == "" or stdo==None:
+        # nothing on stdout is treated as a failed run
+        errCode = currCommandProcess.poll()
+        printLog('ERROR: @@@@@Raising Called processor exception')
+        raise subprocess.CalledProcessError(errCode, args, output=stde)
+    return stdo
+
+printLog('Executing measure performance for label: '+str(label))
+
+create_ini_file_if_requested(args)
+args = load_ini_file_if_requested(args, parser)
+args = split_up_comma_delimited_lists(args)
+
+
+# check parameters for sanity
+
+# m, n and k go together: either all three are given or none of them
+_mnk_given = [_sizes.count(None) == 0 for _sizes in (args.sizem, args.sizen, args.sizek)]
+if any(_mnk_given) and not all(_mnk_given):
+    printLog( 'ERROR: if any of m, n, or k are specified, all of m, n, and k must be specified')
+    quit()
+
+# at least one way of specifying a problem size is required
+if not any(_mnk_given) and args.square.count(None) and args.problemsize.count(None):
+    printLog( 'ERROR: at least one of [--square] or [--problemsize] or [-m, -n, and -k] must be specified')
+    quit()
+
+args.sizem = expand_range(args.sizem)
+args.sizen = expand_range(args.sizen)
+args.sizek = expand_range(args.sizek)
+args.square = expand_range(args.square)
+args.lda = expand_range(args.lda)
+args.ldb = expand_range(args.ldb)
+args.ldc = expand_range(args.ldc)
+args.offa = expand_range(args.offa)
+args.offb = expand_range(args.offb)
+args.offc = expand_range(args.offc)
+args.problemsize = decode_parameter_problemsize(args.problemsize)
+
+"""
+create the problem size combinations for each run of the client
+"""
+if not args.sizem.count(None):
+    # we only need to do make combinations of problem sizes if m,n,k have been specified explicitly
+    problem_size_combinations = itertools.product(args.sizem, args.sizen, args.sizek,
+                                                  args.lda, args.ldb, args.ldc)
+    problem_size_combinations = list(itertools.islice(problem_size_combinations, None))
+else:
+    problem_size_combinations = []
+
+"""
+add manually entered problem sizes to the list of problems to crank out
+"""
+manual_test_combinations = []
+
+
+if not args.problemsize.count(None):
+    for n in args.problemsize:
+        sizem = []
+        sizen = []
+        sizek = []
+        lda = []
+        ldb = []
+        ldc = []
+    
+        sizem.append(int(n[0][0]))
+        sizen.append(int(n[0][1]))
+        sizek.append(int(n[0][2]))
+        if len(n) > 1:
+            lda.append(int(n[1][0]))
+            ldb.append(int(n[1][1]))
+            ldc.append(int(n[1][2]))
+        else:
+            lda.append(0)
+            ldb.append(0)
+            ldc.append(0)
+    
+        combos = itertools.product(sizem,sizen,sizek,lda,ldb,ldc)
+        combos = list(itertools.islice(combos, None))
+        for n in combos:
+            manual_test_combinations.append(n)
+
+"""
+add square problem sizes to the list of problems to crank out
+"""
+square_test_combinations = []
+
+if not args.square.count(None):
+    for n in args.square:
+        combos = itertools.product([n],[n],[n],args.lda) # only lda is considered with --square, and lda/b/c are all set to the values specified by lda
+        combos = list(itertools.islice(combos, None))
+        for n in combos:
+            square_test_combinations.append((n[0],n[1],n[2],n[3],n[3],n[3])) # set lda/b/c = lda
+
+problem_size_combinations = problem_size_combinations + manual_test_combinations + square_test_combinations
+
+"""
+create final list of all transformations (with problem sizes and transform properties)
+"""
+test_combinations = itertools.product(problem_size_combinations, args.offa, args.offb, args.offc, args.alpha, args.beta, args.order, args.transa, args.transb, args.side, args.uplo, args.diag, args.function, args.precision, args.device, args.library)
+test_combinations = list(itertools.islice(test_combinations, None))
+
+test_combinations = [BlasTestCombination(params[0][0], params[0][1], params[0][2], params[0][3], params[0][4], params[0][5], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9], params[10], params[11], params[12], params[13], params[14], params[15], label) for params in test_combinations]
+
+
+"""
+open output file and write the header
+"""
+table = open_file(args.tableOutputFilename)
+table.write(blas_table_header() + '\n')
+table.flush()
+
+"""
+turn each test combination into a command, run the command, and then stash the gflops
+"""
+result = [] # this is where we'll store the results for the table
+
+printLog( 'Total combinations = '+str(len(test_combinations)))
+
+vi = 0
+#test_combinations = test_combinations[:5]
+for params in test_combinations:
+    vi = vi+1
+    printLog('preparing command: '+ str(vi))  
+    device = params.device
+    sizem = params.sizem
+    sizen = params.sizen
+    sizek = params.sizek
+    lda = params.lda
+    ldb = params.ldb
+    ldc = params.ldc
+    offa = params.offa
+    offb = params.offb
+    offc = params.offc
+    alpha = params.alpha
+    beta = params.beta
+    function = params.function
+    precision = params.precision
+    library = params.library
+    label = params.label
+
+    if params.order == 'row':
+        order = str(0)
+    elif params.order == 'column':
+        order = str(1)
+    else:
+        printLog( 'ERROR: unknown value for order')
+        quit()
+    
+    if params.side == 'left':
+        side = str(0)
+    elif params.side == 'right':
+        side = str(1)
+    else:
+        printLog( 'ERROR: unknown value for side')
+        quit()
+        
+    if params.uplo == 'upper':
+        uplo = str(0)
+    elif params.uplo == 'lower':
+        uplo = str(1)
+    else:
+        printLog( 'ERROR: unknown value for uplo')
+        quit()
+
+    if params.diag == 'unit':
+        diag = str(0)
+    elif params.diag == 'nonunit':
+        diag = str(1)
+    else:
+        printLog( 'ERROR: unknown value for diag')
+        quit()
+
+    if re.search('^\+\d+$', lda):
+        if params.transa == 'none':
+            lda = str(int(lda.lstrip('+')) + int(sizem))
+        else:
+            lda = str(int(lda.lstrip('+')) + int(sizek))
+
+    if re.search('^\+\d+$', ldb):
+        if params.transb == 'none':
+            ldb = str(int(ldb.lstrip('+')) + int(sizek))
+        else:
+            ldb = str(int(ldb.lstrip('+')) + int(sizen))
+
+    if re.search('^\+\d+$', ldc):
+        ldc = str(int(ldc.lstrip('+')) + int(sizem))
+
+    if params.transa == 'none':
+        transa = str(0)
+    elif params.transa == 'transpose':
+        transa = str(1)
+    elif params.transa == 'conj':
+        transa = str(2)
+    else:
+        printLog( 'ERROR: unknown value for transa')
+        
+    if params.transb == 'none':
+        transb = str(0)
+    elif params.transb == 'transpose':
+        transb = str(1)
+    elif params.transb == 'conj':
+        transb = str(2)
+    else:
+        printLog( 'ERROR: unknown value for transb')
+     
+    if library == 'acmlblas':
+        arguments = [executable(library),
+                     '--' + device,
+                     '-m', sizem,
+                     '-n', sizen,
+                     '-k', sizek,
+                     '--lda', lda,
+                     '--ldb', ldb,
+                     '--ldc', ldc,
+                     '--offA', offa,
+                     '--offBX', offb,
+                     '--offCY', offc,
+                     '--alpha', alpha,
+                     '--beta', beta,
+                     '--order', order,
+                     '--transposeA', transa,
+                     '--transposeB', transb,
+                     '--side', side,
+                     '--uplo', uplo,
+                     '--diag', diag,
+                     '--function', function,
+                     '--precision', precision,
+                     '-p', '10',
+					 '--roundtrip', roundtrip]
+    elif library == 'clblas':
+        arguments = [executable(library),
+                     '--' + device,
+                     '-m', sizem,
+                     '-n', sizen,
+                     '-k', sizek,
+                     '--lda', lda,
+                     '--ldb', ldb,
+                     '--ldc', ldc,
+                     '--offA', offa,
+                     '--offBX', offb,
+                     '--offCY', offc,
+                     '--alpha', alpha,
+                     '--beta', beta,
+                     '--order', order,
+                     '--transposeA', transa,
+                     '--transposeB', transb,
+                     '--side', side,
+                     '--uplo', uplo,
+                     '--diag', diag,
+                     '--function', function,
+                     '--precision', precision,
+                     '-p', '10',
+					 '--roundtrip', roundtrip]
+    else:
+        printLog( 'ERROR: unknown library:"' +library+ '" can\'t assemble command')
+        quit()
+
+    writeline = True
+   
+    try:
+        printLog('Executing Command: '+str(arguments))
+        output = checkTimeOutPut(arguments);
+        output = output.split(os.linesep);
+        printLog('Execution Successfull---------------\n')
+    except errorHandler.ApplicationException as ae:
+        writeline = False
+        #Killing the process
+        #if system() != 'Windows':
+        #    currCommandProcess.kill()
+        #    printLog('ERROR: Killed process')
+        printLog('ERROR: Command is taking too much of time-- '+ae.message+'\n'+'Command: \n'+str(arguments))
+    except subprocess.CalledProcessError as clientCrash:
+        if clientCrash.output.count('bad_alloc'):
+            writeline = False
+            printLog( 'Omitting line from table - problem is too large')
+        elif clientCrash.output.count('CL_INVALID_BUFFER_SIZE'):
+            writeline = False
+            printLog( 'Omitting line from table - problem is too large')
+        elif clientCrash.output.count('CL_INVALID_WORK_GROUP_SIZE'):
+            writeline = False
+            printLog( 'Omitting line from table - workgroup size is invalid')
+        elif clientCrash.output.count('lda must be set to 0 or a value >='):
+            writeline = False
+            printLog( 'Omitting line from table - lda is too small')
+        elif clientCrash.output.count('ldb must be set to 0 or a value >='):
+            writeline = False
+            printLog( 'Omitting line from table - ldb is too small')
+        elif clientCrash.output.count('ldc must be set to 0 or a value >='):
+            writeline = False
+            printLog( 'Omitting line from table - ldc is too small')
+        else:
+            writeline = False
+            printLog('ERROR: client crash.\n')
+            printLog(str(clientCrash.output))
+            printLog( str(clientCrash))
+            printLog('In original code we quit here - 1')
+            continue
+            #quit()  
+
+    if writeline:
+        gflopsoutput = itertools.ifilter( lambda x: x.count('Gflops'), output)
+        gflopsoutput = list(itertools.islice(gflopsoutput, None))
+        thisResult = re.search('\d+\.*\d*e*-*\d*$', gflopsoutput[0])
+        if thisResult != None:
+            thisResult = float(thisResult.group(0))
+            thisResult = (params.sizem,
+                          params.sizen,
+                          params.sizek,
+                          params.lda,
+                          params.ldb,
+                          params.ldc,
+                          params.offa,
+                          params.offb,
+                          params.offc,
+                          params.alpha,
+                          params.beta,
+                          params.order,
+                          params.transa,
+                          params.transb,
+                          params.side,
+                          params.uplo,
+                          params.diag,
+                          params.precision + params.function,
+                          params.device,
+                          params.library,
+                          params.label,
+                          thisResult)
+
+            outputRow = ''
+            for x in thisResult:
+                outputRow = outputRow + str(x) + ','
+            outputRow = outputRow.rstrip(',')
+            table.write(outputRow + '\n')
+            table.flush()
+        else:
+            if gflopsoutput[0].find('nan') or gflopsoutput[0].find('inf'):
+                printLog( 'WARNING: output from client was funky for this run. skipping table row')
+            else:
+                prinLog( 'ERROR: output from client makes no sense')
+                prinLog(str( gflopsoutput[0]))
+                printLog('In original code we quit here - 2')
+                continue
+                #quit()
+printLog("=========================MEASURE PERFORMANCE ENDS===========================\n")
diff --git a/src/scripts/perf/performanceUtility.py b/src/scripts/perf/performanceUtility.py
new file mode 100644
index 0000000..e3c7626
--- /dev/null
+++ b/src/scripts/perf/performanceUtility.py
@@ -0,0 +1,97 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+#This file contains a number of utilities function which could be independent of
+#any specific domain concept
+
+import signal
+from subprocess import check_output
+import errorHandler
+from datetime import datetime
+
+def currentUser():
+    try:
+        return check_output("who", shell = True).split()[0];
+    except:
+        print 'Unhandled Exception at performanceUtility::currentUser()'
+        raise
+    
+#Details: Generate sorted numbers in radices of 2,3 and 5 upto a given upper limit number
+def generate235Radices(maxSize):
+    sizeList = list()
+    i = 0
+    j = 0
+    k = 0
+    SUM = int()
+    sumj = int()
+    sumk = int()
+    sumi = 1
+    while(True):
+        sumj = 1
+        j = 0
+        while(True):
+            sumk = 1
+            k = 0
+            while(True):
+                SUM = sumi*sumj*sumk
+                if ( SUM > maxSize ): break
+                sizeList.append(SUM)
+                k += 1
+                sumk *= 2
+            if (k == 0): break
+            j += 1
+            sumj *= 3
+        if ( j == 0 and k == 0): break
+        i += 1
+        sumi *= 5
+    sizeList.sort()
+    return sizeList
+
+
+def timeout(timeout_time, default):
+    def timeout_function(f):
+        def f2(args):
+            def timeout_handler(signum, frame):
+                raise errorHandler.TimeoutException()
+ 
+            old_handler = signal.signal(signal.SIGALRM, timeout_handler) 
+            signal.alarm(timeout_time) # triger alarm in timeout_time seconds
+            retval = ""
+            try: 
+                retval = f(args)
+            except errorHandler.TimeoutException:
+                raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT)
+            except:
+                signal.alarm(0)
+                raise
+            finally:
+                #print 'executing finally'
+                signal.signal(signal.SIGALRM, old_handler) 
+            signal.alarm(0)
+            return retval
+        return f2
+    return timeout_function
+
+
+def logTxtOutput(fileName, mode, txt):
+    todayFile =  fileName+'-'+datetime.now().strftime('%Y-%b-%d')+'.txt'
+    with open(todayFile, mode) as f:
+        f.write('------\n'+txt+'\n')
+        
+def log(filename, txt):
+    with open(filename, 'a') as f:
+        f.write(datetime.now().ctime()+'# '+txt+'\n')
+        
\ No newline at end of file
diff --git a/src/scripts/perf/plotPerformance.py b/src/scripts/perf/plotPerformance.py
new file mode 100644
index 0000000..0c62fcc
--- /dev/null
+++ b/src/scripts/perf/plotPerformance.py
@@ -0,0 +1,309 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+# to use this script, you will need to download and install the 32-BIT VERSION of:
+# - Python 2.7 x86 (32-bit) - http://www.python.org/download/releases/2.7.1
+#
+# you will also need the 32-BIT VERSIONS of the following packages as not all the packages are available in 64bit at the time of this writing
+# The ActiveState python distribution is recommended for windows
+# (make sure to get the python 2.7-compatible packages):
+# - NumPy 1.5.1 (32-bit, 64-bit unofficial, supports Python 2.4 - 2.7 and 3.1 - 3.2.) - http://sourceforge.net/projects/numpy/files/NumPy/
+# - matplotlib 1.0.1 (32-bit & 64-bit, supports Python 2.4 - 2.7) - http://sourceforge.net/projects/matplotlib/files/matplotlib/
+#
+# For ActiveState Python, all that one should need to type is 'pypm install matplotlib'
+
+import datetime
+import sys
+import argparse
+import subprocess
+import itertools
+import os
+import matplotlib
+import pylab
+from matplotlib.backends.backend_pdf import PdfPages
+from blasPerformanceTesting import *
+
+def plotGraph(dataForAllPlots, title, plottype, plotkwargs, xaxislabel, yaxislabel):
+  """
+  display a pretty graph
+  """
+  colors = ['k','y','m','c','r','b','g']
+  #plottype = 'plot'
+  for thisPlot in dataForAllPlots:
+    getattr(pylab, plottype)(thisPlot.xdata, thisPlot.ydata,
+                             '{}.-'.format(colors.pop()), 
+                             label=thisPlot.label, **plotkwargs)
+  if len(dataForAllPlots) > 1:
+    pylab.legend(loc='best')
+  
+  pylab.title(title)
+  pylab.xlabel(xaxislabel)
+  pylab.ylabel(yaxislabel)
+  pylab.grid(True)
+  
+  if args.outputFilename == None:
+    # if no pdf output is requested, spit the graph to the screen . . .
+    pylab.show()
+  else:
+    pylab.savefig(args.outputFilename,dpi=(1024/8))
+    # . . . otherwise, gimme gimme pdf
+    #pdf = PdfPages(args.outputFilename)
+    #pdf.savefig()
+    #pdf.close()
+
+######## plotFromDataFile() Function to plot from data file begins ########
+def plotFromDataFile():
+  data = []
+  """
+  read in table(s) from file(s)
+  """
+  for thisFile in args.datafile:
+    if not os.path.isfile(thisFile):
+      print 'No file with the name \'{}\' exists. Please indicate another filename.'.format(thisFile)
+      quit()
+  
+    results = open(thisFile, 'r')
+    results_contents = results.read()
+    results_contents = results_contents.rstrip().split('\n')
+  
+    firstRow = results_contents.pop(0)
+    print firstRow
+    print blas_table_header()
+    print firstRow.rstrip()==blas_table_header()
+    if firstRow.rstrip() != blas_table_header():
+      print 'ERROR: input file \'{}\' does not match expected format.'.format(thisFile)
+      quit()
+  
+    for row in results_contents:
+        row = row.split(',')
+        row = TableRow(BlasTestCombination(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8],row[9],row[10],row[11],row[12],row[13],row[14], row[15], row[16], row[17][1:], row[17][0], row[18], row[19], row[20]), row[21])
+        data.append(BlasGraphPoint(row.parameters.sizem, row.parameters.sizen, row.parameters.sizek, row.parameters.lda, row.parameters.ldb, row.parameters.ldc, row.parameters.offa , row.parameters.offb , row.parameters.offc , row.parameters.device, row.parameters.order, row.parameters.transa, row.parameters.transb, row.parameters.precision + row.parameters.function, row.parameters.library, row.parameters.label, row.gflops))
+  
+  """
+  data sanity check
+  """
+  # if multiple plotvalues have > 1 value among the data rows, the user must specify which to plot
+  multiplePlotValues = []
+  for option in plotvalues:
+    values = []
+    for point in data:
+      values.append(getattr(point, option)) 
+    multiplePlotValues.append(len(set(values)) > 1)
+  if multiplePlotValues.count(True) > 1 and args.plot == None:
+    print 'ERROR: more than one parameter of {} has multiple values. Please specify which parameter to plot with --plot'.format(plotvalues)
+    quit()
+  
+  # if args.graphxaxis is not 'problemsize', the user should know that the results might be strange
+  #if args.graphxaxis != 'problemsize':
+  #  xaxisvalueSet = []
+  #  for option in xaxisvalues:
+  #    if option != 'problemsize':
+  #      values = []
+  #      for point in data:
+  #        values.append(getattr(point, option)) 
+  #      xaxisvalueSet.append(len(set(values)) > 1)
+  #  if xaxisvalueSet.count(True) > 1:
+  #    print 'WARNING: more than one parameter of {} is varied. unexpected results may occur. please double check your graphs for accuracy.'.format(xaxisvalues)
+  
+  # multiple rows should not have the same input values
+  #pointInputs = []
+  #for point in data:
+  #  pointInputs.append(point.__str__().split(';')[0])
+  #if len(set(pointInputs)) != len(data):
+  #  print 'ERROR: imported table has duplicate rows with identical input parameters'
+  #  quit()
+  
+  """
+  figure out if we have multiple plots on this graph (and what they should be)
+  """
+  if args.plot != None:
+    multiplePlots = args.plot
+  elif multiplePlotValues.count(True) == 1 and plotvalues[multiplePlotValues.index(True)] != 'sizek':
+    # we don't ever want to default to sizek, because it's probably going to vary for most plots
+    # we'll require the user to explicitly request multiple plots on sizek if necessary
+    multiplePlots = plotvalues[multiplePlotValues.index(True)]
+  else:
+    # default to device if none of the options to plot have multiple values
+    multiplePlots = 'device'
+  
+  """
+  assemble data for the graphs
+  """
+  data.sort(key=lambda row: int(getattr(row, args.graphxaxis)))
+  
+  # choose scale for x axis
+  if args.xaxisscale == None:
+    # user didn't specify. autodetect
+    if int(getattr(data[len(data)-1], args.graphxaxis)) > 2000: # big numbers on x-axis
+      args.xaxisscale = 'log2'
+    elif int(getattr(data[len(data)-1], args.graphxaxis)) > 10000: # bigger numbers on x-axis
+      args.xaxisscale = 'log10'
+    else: # small numbers on x-axis
+      args.xaxisscale = 'linear'
+  
+  if args.xaxisscale == 'linear':
+    plotkwargs = {}
+    plottype = 'plot'
+  elif args.xaxisscale == 'log2':
+    plottype = 'semilogx'
+    plotkwargs = {'basex':2}
+  elif args.xaxisscale == 'log10':
+    plottype = 'semilogx'
+    plotkwargs = {'basex':10}
+  else:
+    print 'ERROR: invalid value for x-axis scale'
+    quit()
+  
+  plots = set(getattr(row, multiplePlots) for row in data)
+  
+  class DataForOnePlot:
+    def __init__(self, inlabel, inxdata, inydata):
+      self.label = inlabel
+      self.xdata = inxdata
+      self.ydata = inydata
+  
+  dataForAllPlots = []
+  for plot in plots:
+    dataForThisPlot = itertools.ifilter( lambda x: getattr(x, multiplePlots) == plot, data)
+    dataForThisPlot = list(itertools.islice(dataForThisPlot, None))
+    #if args.graphxaxis == 'problemsize':
+    #  xdata = [int(row.x) * int(row.y) * int(row.z) * int(row.batchsize) for row in dataForThisPlot]
+    #else:
+    xdata = [getattr(row, args.graphxaxis) for row in dataForThisPlot]
+    ydata = [getattr(row, args.graphyaxis) for row in dataForThisPlot]
+    dataForAllPlots.append(DataForOnePlot(plot,xdata,ydata))
+  
+  """
+  assemble labels for the graph or use the user-specified ones
+  """
+  if args.graphtitle:
+    # use the user selection
+    title = args.graphtitle
+  else:
+    # autogen a lovely title
+    title = 'Performance vs. ' + args.graphxaxis.capitalize()
+  
+  if args.xaxislabel:
+    # use the user selection
+    xaxislabel = args.xaxislabel
+  else:
+    # autogen a lovely x-axis label
+    if args.graphxaxis == 'cachesize':
+      units = '(bytes)'
+    else:
+      units = '(datapoints)'
+  
+    xaxislabel = args.graphxaxis + ' ' + units
+  
+  if args.yaxislabel:
+    # use the user selection
+    yaxislabel = args.yaxislabel
+  else:
+    # autogen a lovely y-axis label
+    if args.graphyaxis == 'gflops':
+      units = 'GFLOPS'
+    yaxislabel = 'Performance (' + units + ')'
+  
+  """
+  display a pretty graph
+  """
+  colors = ['k','y','m','c','r','b','g']
+  
+  for thisPlot in dataForAllPlots:
+    getattr(pylab, plottype)(thisPlot.xdata, thisPlot.ydata, '{}.-'.format(colors.pop()), label=thisPlot.label, **plotkwargs)
+  
+  if len(dataForAllPlots) > 1:
+    pylab.legend(loc='best')
+  
+  pylab.title(title)
+  pylab.xlabel(xaxislabel)
+  pylab.ylabel(yaxislabel)
+  pylab.grid(True)
+  
+  if args.outputFilename == None:
+    # if no pdf output is requested, spit the graph to the screen . . .
+    pylab.show()
+  else:
+    # . . . otherwise, gimme gimme pdf
+    #pdf = PdfPages(args.outputFilename)
+    #pdf.savefig()
+    #pdf.close()
+    pylab.savefig(args.outputFilename,dpi=(1024/8))
+######### plotFromDataFile() Function to plot from data file ends #########
+
+
+
+######## "main" program begins #####
+"""
+define and parse parameters
+"""
+xaxisvalues = ['sizem','sizen','sizek']  # values accepted by -x/--x_axis
+yaxisvalues = ['gflops']  # values accepted by -y/--y_axis
+plotvalues = ['lda','ldb','ldc','sizek','device','label','order','transa','transb','function','library']  # values accepted by --plot
+# plotFromDataFile() reads plotvalues and the parsed args as module-level globals
+
+
+parser = argparse.ArgumentParser(description='Plot performance of the clblas\
+    library. clblas.plotPerformance.py reads in data tables from clblas.\
+    measurePerformance.py and plots their values')
+fileOrDb = parser.add_mutually_exclusive_group(required=True)  # -d/--datafile is the only member, so it is effectively mandatory
+fileOrDb.add_argument('-d', '--datafile',
+  dest='datafile', action='append', default=None, required=False,
+  help='indicate a file to use as input. must be in the format output by\
+  clblas.measurePerformance.py. may be used multiple times to indicate\
+  multiple input files. e.g., -d cypressOutput.txt -d caymanOutput.txt')
+parser.add_argument('-x', '--x_axis',
+  dest='graphxaxis', default=None, choices=xaxisvalues, required=True,
+  help='indicate which value will be represented on the x axis. problemsize\
+      is defined as x*y*z*batchsize')  # NOTE(review): 'problemsize' is not among xaxisvalues - help text looks stale
+parser.add_argument('-y', '--y_axis',
+  dest='graphyaxis', default='gflops', choices=yaxisvalues,
+  help='indicate which value will be represented on the y axis')
+parser.add_argument('--plot',
+  dest='plot', default=None, choices=plotvalues,
+  help='indicate which of {} should be used to differentiate multiple plots.\
+      this will be chosen automatically if not specified'.format(plotvalues))
+parser.add_argument('--title',
+  dest='graphtitle', default=None,
+  help='the desired title for the graph generated by this execution. if\
+      GRAPHTITLE contains any spaces, it must be entered in \"double quotes\".\
+      if this option is not specified, the title will be autogenerated')
+parser.add_argument('--x_axis_label',
+  dest='xaxislabel', default=None,
+  help='the desired label for the graph\'s x-axis. if XAXISLABEL contains\
+      any spaces, it must be entered in \"double quotes\". if this option\
+      is not specified, the x-axis label will be autogenerated')
+parser.add_argument('--x_axis_scale',
+  dest='xaxisscale', default=None, choices=['linear','log2','log10'],
+  help='the desired scale for the graph\'s x-axis. if nothing is specified,\
+      it will be selected automatically')
+parser.add_argument('--y_axis_label',
+  dest='yaxislabel', default=None,
+  help='the desired label for the graph\'s y-axis. if YAXISLABEL contains any\
+      spaces, it must be entered in \"double quotes\". if this option is not\
+      specified, the y-axis label will be autogenerated')
+parser.add_argument('--outputfile',
+  dest='outputFilename', default=None,
+  help='name of the file to output graphs. Supported formats: emf, eps, pdf, png, ps, raw, rgba, svg, svgz.')
+
+args = parser.parse_args()
+# every -d occurrence was appended to args.datafile; plot all of those files on one graph
+if args.datafile != None:
+  plotFromDataFile()
+else:  # NOTE(review): looks unreachable - the required group makes argparse exit first when -d is absent
+  print "Atleast specify if you want to use text files or database for plotting graphs. Use -h or --help option for more details"
+  quit()
+
diff --git a/src/targetver.h b/src/targetver.h
new file mode 100644
index 0000000..3b0dda2
--- /dev/null
+++ b/src/targetver.h
@@ -0,0 +1,29 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#pragma once
+
+// The following macros define the minimum required platform.  The minimum required platform
+// is the earliest version of Windows, Internet Explorer etc. that has the necessary features to run
+// your application.  The macros work by enabling all features available on platform versions up to and
+// including the version specified.
+
+// Modify the following defines if you have to target a platform prior to the ones specified below.
+// Refer to MSDN for the latest info on corresponding values for different platforms.
+#ifndef _WIN32_WINNT            // Specifies that the minimum required platform is Windows Vista.
+#define _WIN32_WINNT 0x0600     // Change this to the appropriate value to target other versions of Windows.
+#endif
+
diff --git a/src/tests/BasicRoutines.cpp b/src/tests/BasicRoutines.cpp
new file mode 100644
index 0000000..04767d3
--- /dev/null
+++ b/src/tests/BasicRoutines.cpp
@@ -0,0 +1,102 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+class BasicRoutines : public testing::Test {
+protected:
+    BasicRoutines()
+        : platform(0), device(0), context(NULL), queue(NULL)
+    {
+    }
+
+    virtual ~BasicRoutines() {}
+
+    // Query one platform and one GPU device on it, then create a context and
+    // a command queue for that device. A failed step ends SetUp() right there.
+    virtual void SetUp() {
+        cl_int err;
+        ASSERT_EQ(CL_SUCCESS, clGetPlatformIDs(1, &platform, NULL));
+        ASSERT_EQ(CL_SUCCESS,
+            clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL));
+        cl_context_properties props[] = {
+            CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
+        context = clCreateContext(props, 1, &device, NULL, NULL, &err);
+        ASSERT_EQ(CL_SUCCESS, err) << "clCreateContext() failed";
+        queue = clCreateCommandQueue(context, device, 0, &err);
+        ASSERT_EQ(CL_SUCCESS, err) << "clCreateCommandQueue() failed";
+    }
+
+    // Release the queue and the context if SetUp() got as far as creating them.
+    virtual void TearDown() {
+        if (queue != NULL)
+            clReleaseCommandQueue(queue);
+        if (context != NULL)
+            clReleaseContext(context);
+    }
+
+    cl_platform_id platform;
+    cl_device_id device;
+    cl_context context;
+    cl_command_queue queue;
+};
+
+TEST_F(BasicRoutines, UsualCodeFlow) {  // setup, prebuild, cleanup, teardown in that order
+    EXPECT_EQ(CL_SUCCESS, clblasSetup());
+    EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_PREBUILD_KERNELS(context));  // uses the fixture's context
+    EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_CLEANUP_KERNELS(context));
+    clblasTeardown();  // return value is not checked here
+}
+
+TEST_F(BasicRoutines, DoubleSetup) {  // a second setup without a teardown in between must fail
+    EXPECT_EQ(CL_SUCCESS, clblasSetup());
+    EXPECT_NE(clblasSetup(), CL_SUCCESS);  // repeated call is expected to return an error code
+    clblasTeardown();
+}
+
+TEST_F(BasicRoutines, MissedSetup) {  // no clblasSetup() call precedes the prebuild in this test
+    EXPECT_NE(AMD_clBLAS_PREBUILD_KERNELS(context), CL_SUCCESS);  // must report an error
+}
+
+TEST_F(BasicRoutines, BadContext) {  // the prebuild call must reject a NULL context
+    EXPECT_EQ(CL_SUCCESS, clblasSetup());
+    EXPECT_NE(AMD_clBLAS_PREBUILD_KERNELS(NULL), CL_SUCCESS);  // NULL stands in for an invalid context
+    clblasTeardown();
+}
+
+TEST_F(BasicRoutines, TwoContexts) {
+    cl_int err;
+    cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_context anotherContext;
+
+    EXPECT_EQ(CL_SUCCESS, clblasSetup());
+    // second context on the same platform and device as the fixture's one
+    props[1] = (cl_context_properties)platform;
+    anotherContext = clCreateContext(props, 1, &device, NULL, NULL, &err);
+    ASSERT_EQ(CL_SUCCESS, err) << "Need a context";
+    ASSERT_NE(context, anotherContext) << "Contexts must be different";
+
+    EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_PREBUILD_KERNELS(context));
+    EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_PREBUILD_KERNELS(anotherContext));
+    EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_CLEANUP_KERNELS(context));
+    EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_CLEANUP_KERNELS(anotherContext));
+
+    // release the context created here; the fixture's own one is released in TearDown()
+    clReleaseContext(anotherContext);
+    clblasTeardown();
+}
diff --git a/src/tests/BlasBase.cpp b/src/tests/BlasBase.cpp
new file mode 100644
index 0000000..8590599
--- /dev/null
+++ b/src/tests/BlasBase.cpp
@@ -0,0 +1,525 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>
+#include <iostream>
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <BlasBase.h>
+
+namespace clMath {
+
+BlasBase* BlasBase::getInstance()
+{
+    // Function-local singleton: constructed (and SetUp() run) on first call.
+    static BlasBase instance;
+
+    // A NULL result tells the caller that no OpenCL context is available.
+    BlasBase* const self = instance.initialized() ? &instance : NULL;
+    return self;
+}
+
+BlasBase::BlasBase()
+    : platform_(0), primaryDevice_(0), additionalDevice_(0), context_(NULL),
+    useNumCommandQueues_(false), numCommandQueues_(1),
+    useAlpha_(false), useBeta_(false), useSeed_(false),
+    useM_(false), useN_(false), useK_(false),
+    M_(0), N_(0), K_(0),
+    useIncX_(false), useIncY_(false),
+    incX_(0), incY_(0),
+    useImages_(false), devType_(CL_DEVICE_TYPE_GPU), imageA_(0), imageB_(0)
+{
+    memset(&alpha_, 0, sizeof(alpha_));
+    memset(&beta_, 0, sizeof(beta_));
+    memset(commandQueues_, 0, sizeof(commandQueues_));
+    // NOTE(review): devName_ is not explicitly initialized, yet SetUp() reads it.
+    SetUp();
+}
+
+BlasBase::~BlasBase()
+{
+    /*
+     * TearDown() is disabled because of trouble when a test run is interrupted
+     * with CTRL-C on Windows: after that key press the OpenCL runtime is
+     * destroyed before the destructors of global objects are called.
+     */
+#if 0
+    TearDown();
+#endif
+}
+
+/*
+ * Enumerate the OpenCL platforms of the system.
+ *
+ * On success *platforms points to a new[]-allocated array that the caller
+ * must delete[]; in every other case it is set to NULL, so an unconditional
+ * delete[] by the caller is always safe. The return value is the number of
+ * platforms, or 0 if there are none or a query failed. *error, when non-NULL,
+ * receives the status of the last clGetPlatformIDs() call that was made.
+ */
+cl_int
+BlasBase::getPlatforms(cl_platform_id **platforms, cl_int *error)
+{
+    cl_uint nrPlatforms = 0;
+    cl_int err;
+
+    /* Callers delete[] the array unconditionally, so never leave it unset. */
+    *platforms = NULL;
+
+    err = clGetPlatformIDs(0, NULL, &nrPlatforms);
+    if ((err == CL_SUCCESS) && (nrPlatforms != 0)) {
+        *platforms = new cl_platform_id[nrPlatforms];
+        err = clGetPlatformIDs(nrPlatforms, *platforms, NULL);
+        if (err != CL_SUCCESS) {
+            /* Free the array itself, not the caller's pointer variable. */
+            delete[] *platforms;
+            *platforms = NULL;
+        }
+    }
+
+    if (error != NULL) {
+        *error = err;
+    }
+    return (err == CL_SUCCESS) ? nrPlatforms : 0;
+}
+
+/*
+ * Pick an OpenCL device: the one whose CL_DEVICE_NAME equals `name`, or, when
+ * `name` is NULL, a device of the given type, preferring AMD as the vendor.
+ * If there is no AMD device of that type, the first device of another vendor
+ * is taken, but only from the platform already stored in platform_ (if any),
+ * so that an additional device shares the platform of the primary one.
+ * Returns NULL if nothing fits; the status of a failing clGetDeviceIDs() call
+ * is stored in *error (when non-NULL). May update platform_.
+ */
+cl_device_id
+BlasBase::getDevice(cl_device_type type, const char* name,
+                       cl_int *error)
+{
+    cl_int err;
+    cl_uint nrDevices, nrPlatforms, i, p;
+    cl_device_id *devices, result = NULL;
+    size_t sz;
+    char *str;
+    cl_platform_id *platforms = NULL, selPlatform = NULL;  /* NULL: safe to delete[] */
+    cl_device_info devInfo;
+
+    nrPlatforms = getPlatforms(&platforms, &err);
+
+    if (error != NULL) {
+        *error = CL_SUCCESS;
+    }
+
+    if (name == NULL) {
+        name = "Advanced Micro Devices, Inc.";
+        devInfo = CL_DEVICE_VENDOR;
+    }
+    else {
+        devInfo = CL_DEVICE_NAME;
+        type = CL_DEVICE_TYPE_ALL;
+    }
+
+    for (p = 0; p < nrPlatforms; p++) {
+        cl_platform_id platform = platforms[p];
+        err = clGetDeviceIDs(platform, type, 0, NULL, &nrDevices);
+        if (err == CL_DEVICE_NOT_FOUND) {
+            continue;
+        }
+        if (err != CL_SUCCESS) {
+            if (error != NULL) {
+                *error = err;
+            }
+            delete[] platforms;     /* do not leak the platform list */
+            return NULL;
+        }
+        if (nrDevices == 0) {
+            continue;               /* nothing here; try the next platform */
+        }
+
+        devices = new cl_device_id[nrDevices];
+        err = clGetDeviceIDs(platform, type, nrDevices, devices, NULL);
+        if (err != CL_SUCCESS) {
+            if (error != NULL) {
+                *error = err;
+            }
+            delete[] devices;
+            delete[] platforms;
+            return NULL;
+        }
+
+        for (i = 0; i < nrDevices; i++) {
+            err = clGetDeviceInfo(devices[i], devInfo, 0, NULL, &sz);
+            if (err != CL_SUCCESS) {
+                continue;
+            }
+            str = new char[sz + 1];
+            memset(str, 0, sz + 1);
+            err = clGetDeviceInfo(devices[i], devInfo, sz, str, NULL);
+            if (err != CL_SUCCESS) {
+                delete[] str;
+                continue;
+            }
+            if ((devInfo == CL_DEVICE_VENDOR) && (result == NULL) &&
+                ((platform_ == NULL) || (platform == platform_))) {
+                result = devices[i];    /* fallback if no vendor match follows */
+                selPlatform = platform;
+            }
+            printf("---- %s\n", str);
+            if (strcmp(str, name) == 0) {
+                platform_ = platform;
+                result = devices[i];
+                delete[] str;
+                break;
+            }
+            delete[] str;
+        }
+        delete[] devices;
+        devices = NULL;
+    }
+
+    if (platform_ == NULL) {
+        platform_ = selPlatform;
+    }
+
+    delete[] platforms;
+    return result;
+}
+
+void
+BlasBase::SetUp()
+{
+    cl_int err = CL_SUCCESS;
+    cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_uint i = 1;  // number of devices placed into the context
+    cl_uint addDevQueueIdx = MAX_COMMAND_QUEUES;    // out of range: no queue uses the additional device
+    cl_device_id devices[2] = {NULL, NULL};
+    // Selects the device(s), creates context and command queues, then calls clblasSetup().
+    primaryDevice_ = getDevice(devType_, devName_, &err);
+    if ((err != CL_SUCCESS) || (primaryDevice_ == NULL)) {    // fall back to the first platform returned
+        ASSERT_EQ(CL_SUCCESS, clGetPlatformIDs(1, &platform_, NULL));
+        ASSERT_EQ(CL_SUCCESS,
+            clGetDeviceIDs(platform_, devType_, 1, &primaryDevice_, NULL));
+    }
+
+    devices[0] = primaryDevice_;
+
+#if !defined(TEST_WITH_SINGLE_DEVICE)
+    cl_device_type addDevType;
+    // The additional device is of the opposite type (CPU for GPU and vice versa).
+    if (MAX_COMMAND_QUEUES > 1) {
+        addDevType = (devType_ == CL_DEVICE_TYPE_GPU) ? CL_DEVICE_TYPE_CPU :
+                                                    CL_DEVICE_TYPE_GPU;
+        additionalDevice_ = getDevice(addDevType, NULL, NULL);
+        if (additionalDevice_ != NULL) {
+            addDevQueueIdx = (MAX_COMMAND_QUEUES <= 3) ?
+                (MAX_COMMAND_QUEUES - 1) : 2;
+            devices[1] = additionalDevice_;
+            i = 2;
+        }
+    }
+#endif  /* !TEST_WITH_SINGLE_DEVICE */
+
+    props[1] = (cl_context_properties)platform_;
+
+    context_ = clCreateContext(props, i, devices, NULL, NULL, &err);
+    ASSERT_EQ(CL_SUCCESS, err) << "clCreateContext() failed";
+	#ifdef DEBUG_CONTEXT
+	printf("SetUp: Created context %p\n", context_);
+	#endif
+	printf("SetUp: about to create command queues\n");
+    for (i = 0; i < MAX_COMMAND_QUEUES; i++) {
+        cl_device_id dev;
+        // Only the slot addDevQueueIdx is bound to the additional device.
+        dev = (i == addDevQueueIdx) ? additionalDevice_ : primaryDevice_;
+        commandQueues_[i] = clCreateCommandQueue(context_, dev,
+            0 /*CL_QUEUE_PROFILING_ENABLE*/, &err);
+        ASSERT_EQ(CL_SUCCESS, err) << "clCreateCommandQueue() failed";
+    }
+
+    ASSERT_EQ(CL_SUCCESS, clblasSetup());
+}
+
+void
+BlasBase::TearDown()
+{
+    // Release every command queue slot, then the context.
+    for (cl_uint q = 0; q < MAX_COMMAND_QUEUES; q++) {
+        clReleaseCommandQueue(commandQueues_[q]);
+    }
+    numCommandQueues_ = 1;
+
+    if (context_ != NULL) {
+        clReleaseContext(context_);
+        context_ = NULL;
+    }
+
+    // Forget both devices; with context_ cleared, initialized() is false now.
+    additionalDevice_ = NULL;
+    primaryDevice_ = NULL;
+    clblasTeardown();
+}
+
+bool
+BlasBase::initialized()
+{
+    return (NULL != context_);  // true once a context has been created
+}
+
+bool
+BlasBase::setDeviceType(cl_device_type* devType, const char* devName)
+{
+    // NOTE(review): if devName_ is a plain const char*, names are compared by address.
+    if ((devName_ == devName) && (devType_ == *devType)) {
+        return true;
+    }
+    devType_ = *devType;
+    devName_ = devName;
+    if (initialized()) {
+        TearDown();     // rebuild context and queues for the new selection
+        SetUp();
+        *devType = devType_;
+        return initialized();
+    }
+    return true;
+}
+
+/* Creates a buffer of matrSize + off bytes and, if data is given, uploads it. */
+cl_mem
+BlasBase::createEnqueueBuffer(
+    const void *data,
+    size_t matrSize,
+    size_t off,
+    cl_mem_flags mode)
+{
+    cl_int status;
+    cl_mem buffer;
+
+	#ifdef DEBUG_CONTEXT
+	cl_uint refcnt;
+	printf("BLASBASE: createEnqBuff - Querying context %p\n", context_);
+	if (clGetContextInfo(context_, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &refcnt, NULL) != CL_SUCCESS)
+	{
+		printf("BLASBASE: clGetContextInfo FAILED\n");
+	} else {
+		printf("BLASBASE: REFCNT = %u\n", refcnt);
+	}
+	#endif
+    // Without host data, or if creation failed, hand back the result as is.
+    buffer = clCreateBuffer(context_, mode, matrSize + off, NULL, &status);
+    if ((data == NULL) || (status != CL_SUCCESS)) {
+        return buffer;
+    }
+
+    // Blocking write of matrSize bytes at byte offset off, once per active queue.
+    for (cl_uint q = 0; q < numCommandQueues_; q++) {
+        status = clEnqueueWriteBuffer(commandQueues_[q], buffer, CL_TRUE,
+                                      off, matrSize, data, 0, NULL, NULL);
+        if (status != CL_SUCCESS) {
+            clReleaseMemObject(buffer);
+            return NULL;
+        }
+    }
+    return buffer;
+}
+
+/*
+ * Tell whether the primary device can compute in double precision: either it
+ * reports a non-zero preferred vector width for double, or its extension
+ * list contains the cl_amd_fp64 token. Any failing query yields false.
+ */
+bool
+BlasBase::isDevSupportDoublePrecision(void)
+{
+    static const char amdFp64[] = "cl_amd_fp64";
+    const size_t tokLen = sizeof(amdFp64) - 1;
+    cl_int err;
+    cl_uint v;
+    size_t len;
+    char *extensions, *s;
+    bool found = false;
+
+    /* Check for cl_khr_fp64 by way of the preferred double vector width */
+    err = clGetDeviceInfo(primaryDevice_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
+            sizeof(cl_uint), &v, NULL);
+    if (err != CL_SUCCESS) {
+        return false;
+    }
+    if (v != 0) {
+        return true;
+    }
+
+    /* Otherwise look for cl_amd_fp64 in the extension list */
+    err = clGetDeviceInfo(primaryDevice_, CL_DEVICE_EXTENSIONS, 0, NULL, &len);
+    if (err != CL_SUCCESS) {
+        return false;
+    }
+
+    extensions = new char[len + 1];
+    extensions[len] = '\0';     /* terminated whatever the runtime returns */
+    err = clGetDeviceInfo(primaryDevice_, CL_DEVICE_EXTENSIONS, len, extensions, NULL);
+    if (err == CL_SUCCESS) {
+        /* Accept whole tokens only and keep scanning past longer names. */
+        for (s = strstr(extensions, amdFp64); (s != NULL) && !found;
+             s = strstr(s + tokLen, amdFp64)) {
+            found = ((s == extensions) || (s[-1] == ' ')) &&
+                    ((s[tokLen] == ' ') || (s[tokLen] == '\0'));
+        }
+    }
+
+    delete[] extensions;
+    return found;
+}
+
+void
+BlasBase::removeScratchImages(void)
+{
+    /*
+     * Currently a no-op: the cleanup that used to live here is disabled.
+     * It called clblasRemoveScratchImage() on imageB_ and then on imageA_,
+     * each only when the respective handle was set.
+     * scratchImageWidth() and scratchImageHeight() still query imageA_.
+     */
+}
+
+size_t
+BlasBase::scratchImageWidth(void)
+{
+    size_t width = 0;   /* defined result even if the query below fails */
+    /* Ask the OpenCL runtime for the width of scratch image A. */
+    clGetImageInfo(reinterpret_cast<cl_mem>(imageA_), CL_IMAGE_WIDTH,
+                   sizeof(width), &width, NULL);
+    return width;
+}
+
+size_t
+BlasBase::scratchImageHeight(void)
+{
+    size_t height = 0;  /* defined result even if the query below fails */
+    /* Ask the OpenCL runtime for the height of scratch image A. */
+    clGetImageInfo(reinterpret_cast<cl_mem>(imageA_), CL_IMAGE_HEIGHT,
+                   sizeof(height), &height, NULL);
+
+    return height;
+}
+
+cl_ulong
+BlasBase::maxMemAllocSize(void)
+{
+    cl_ulong limit = 0, other;
+    cl_int status;
+
+    // Allocation limit of the primary device.
+    status = clGetDeviceInfo(primaryDevice_, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                             sizeof(limit), &limit, NULL);
+    if ((status != CL_SUCCESS) || (additionalDevice_ == NULL)) {
+        return limit;
+    }
+    // With an additional device, the smaller of the two limits is reported.
+    status = clGetDeviceInfo(additionalDevice_, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                             sizeof(other), &other, NULL);
+    if ((status == CL_SUCCESS) && (other < limit)) {
+        limit = other;
+    }
+    return limit;
+}
+
+cl_ulong
+BlasBase::availGlobalMemSize(int primAdd)
+{
+    cl_ulong gmemSize = 0;  /* defined result even if the query below fails */
+    cl_device_id dev;
+
+    /* Non-zero primAdd selects the additional device, zero the primary one. */
+    dev = (primAdd) ? additionalDevice_ : primaryDevice_;
+    clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(gmemSize),
+                    &gmemSize, NULL);
+    return gmemSize;
+}
+
+void
+BlasBase::printDevInfoStr(cl_device_info param, const char *paramName,
+                          int primAdd)
+{
+    char text[4096];
+    cl_device_id target = (primAdd) ? additionalDevice_ : primaryDevice_;
+    // Prints "<paramName>: <value>"; stays silent when the query fails.
+    if (clGetDeviceInfo(target, param, sizeof(text), text, NULL) != CL_SUCCESS) {
+        return;
+    }
+    std::cout << paramName << ": " << text << std::endl;
+}
+
+void
+BlasBase::printEnvInfo(void)
+{
+    cl_ulong memSize;
+    int i;
+    // Prints device, platform, library and driver details to std::cout.
+    if (primaryDevice_ == NULL) {
+        return;     // nothing to report without a primary device
+    }
+
+    cl_uint libMajor, libMinor, libPatch;
+    clblasGetVersion( &libMajor, &libMinor, &libPatch );
+
+    std::cout << std::endl << "Test environment:" << std::endl << std::endl;
+    // Pass 0 describes the primary device, pass 1 the additional one (if any).
+    for (i = 0; i < 2; i++) {
+        if (additionalDevice_ != NULL) {
+            if (!i) {
+                std::cout << "PRIMARY DEVICE (used in all cases):" << std::endl;
+            }
+            else {
+                std::cout << "ADDITIONAL DEVICE (used only in cases with "
+                             "multiple command queues to cover cases with "
+                             "problem distribution among command queues "
+                             "belonging to different devices):" << std::endl;
+            }
+        }
+        else if (i) {   // no additional device: skip the second pass
+            break;
+        }
+
+        printDevInfoStr(CL_DEVICE_NAME, "Device name", i);
+        printDevInfoStr(CL_DEVICE_VENDOR, "Device vendor", i);
+        std::cout << "Platform (bit): ";
+#if defined( _WIN32 )
+        std::cout << "Windows ";
+    #if defined( _WIN64 )
+            std::cout << "(x64)" << std::endl;
+    #else
+            std::cout << "(x32)" << std::endl;
+    #endif
+#else   // every non-Windows build is reported as "Linux"
+        std::cout << "Linux" << std::endl;
+#endif
+        std::cout << "clblas version: " << libMajor << "." << libMinor << "."
+            << libPatch << std::endl;
+        printDevInfoStr(CL_DRIVER_VERSION, "Driver version", i);
+        printDevInfoStr(CL_DEVICE_VERSION, "Device version", i);
+        memSize = availGlobalMemSize(i);
+        std::cout << "Global mem size: " << memSize / (1024 * 1024) <<
+                     " MB" << std::endl;
+
+        std::cout << "---------------------------------------------------------"
+                  << std::endl << std::endl;
+    }
+}
+
+}   // namespace
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
new file mode 100644
index 0000000..1f0e07c
--- /dev/null
+++ b/src/tests/CMakeLists.txt
@@ -0,0 +1,450 @@
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
+
+set(SRC_COMMON
+    cmdline.c
+    common.cpp
+    clBLAS-wrapper.cpp
+    BlasBase.cpp
+)
+
+set(SRC_COMMON_TIMER
+    timer.c
+)
+
+# group of sources with reference implementation stuff
+set (SRC_COMMON_REFIMPL
+    blas.c
+    blas-cblas.c
+    blas-wrapper.cpp
+)
+
+set(SRC_CORR
+    correctness/blas-lapack.c
+    correctness/BlasBase-corr.cpp
+    correctness/corr-gemm.cpp
+    correctness/corr-trmm.cpp
+    correctness/corr-trsm.cpp
+    correctness/corr-gemv.cpp
+    correctness/corr-symv.cpp
+    correctness/corr-spmv.cpp
+    correctness/corr-syr2k.cpp
+    correctness/corr-syrk.cpp
+    correctness/corr-trmv.cpp
+    correctness/corr-tpmv.cpp
+    correctness/corr-trsv.cpp
+	correctness/corr-symm.cpp
+	correctness/corr-gemm2.cpp
+	correctness/corr-ger.cpp
+	correctness/corr-gerc.cpp
+    correctness/corr-her.cpp
+	correctness/corr-her2.cpp
+	correctness/corr-syr.cpp
+    correctness/corr-spr.cpp
+	correctness/corr-syr2.cpp
+	correctness/corr-hemv.cpp
+	correctness/corr-hpmv.cpp
+	correctness/corr-hemm.cpp
+	correctness/corr-herk.cpp
+    correctness/corr-tpsv.cpp
+    correctness/corr-hpr.cpp
+	correctness/corr-hpr2.cpp
+	correctness/corr-spr2.cpp
+	correctness/corr-gbmv.cpp
+    correctness/corr-hbmv.cpp
+	correctness/corr-tbmv.cpp
+    correctness/corr-tbsv.cpp
+    correctness/corr-sbmv.cpp
+    correctness/corr-her2k.cpp
+    correctness/corr-scal.cpp
+	correctness/corr-swap.cpp
+	correctness/corr-copy.cpp
+    correctness/corr-axpy.cpp
+	correctness/corr-dot.cpp
+    correctness/corr-dotc.cpp
+    correctness/corr-rotg.cpp
+    correctness/corr-rotm.cpp
+	correctness/corr-rot.cpp
+    correctness/corr-rotmg.cpp
+    correctness/corr-nrm2.cpp
+    correctness/corr-asum.cpp
+    correctness/corr-iamax.cpp
+    correctness/test-correctness.cpp
+    correctness/tcase-filter.cpp
+)
+
+set(SRC_PERF
+    performance/PerformanceRecorder.cpp
+    performance/PerformanceTest.cpp
+    performance/TrxmPerformanceTest.cpp
+    performance/BlasBase-perf.cpp
+    performance/perf-gemm.cpp
+    performance/perf-gemm2.cpp
+    performance/perf-gemv.cpp
+    performance/perf-syr2k.cpp
+    performance/perf-syrk.cpp
+    performance/perf-symv.cpp
+    performance/perf-spmv.cpp
+    performance/perf-trmm.cpp
+    performance/perf-trsm.cpp
+    performance/perf-trmv.cpp
+    performance/perf-tpmv.cpp
+    performance/perf-trsv.cpp
+    performance/perf-symm.cpp
+    performance/perf-ger.cpp
+    performance/perf-gerc.cpp
+    performance/perf-syr.cpp
+    performance/perf-spr.cpp
+	performance/perf-her.cpp
+	performance/perf-her2.cpp
+    performance/perf-syr2.cpp
+	performance/perf-hemm.cpp
+    performance/perf-hemv.cpp
+    performance/perf-hpmv.cpp
+    performance/perf-herk.cpp
+	performance/perf-tpsv.cpp
+    performance/perf-hpr.cpp
+	performance/perf-hpr2.cpp
+    performance/perf-spr2.cpp
+    performance/perf-sbmv.cpp
+    performance/perf-gbmv.cpp
+    performance/perf-hbmv.cpp
+    performance/perf-tbmv.cpp
+    performance/perf-tbsv.cpp
+    performance/perf-her2k.cpp
+    performance/perf-scal.cpp
+    performance/perf-swap.cpp
+	performance/perf-copy.cpp
+    performance/perf-axpy.cpp
+	performance/perf-dot.cpp
+    performance/perf-dotc.cpp
+    performance/perf-rotg.cpp
+    performance/perf-rotm.cpp
+	performance/perf-rot.cpp
+    performance/perf-rotmg.cpp
+    performance/perf-nrm2.cpp
+    performance/perf-asum.cpp
+    performance/perf-iamax.cpp
+	performance/test-performance.cpp
+)
+
+set(SRC_FUNC
+   functional/func-error.cpp
+   functional/func-event.cpp
+   functional/func-thread.cpp
+   functional/func-queue.cpp
+   #functional/func-images.cpp
+   functional/test-functional.cpp
+   functional/BlasBase-func.cpp
+)
+
+set(TESTS_HEADERS
+    ${clBLAS_SOURCE_DIR}/clBLAS.h
+    ${clBLAS_SOURCE_DIR}/clBLAS-complex.h
+	${clBLAS_SOURCE_DIR}/include/cltypes.h
+	${clBLAS_SOURCE_DIR}/include/defbool.h
+    include/blas-internal.h
+    include/blas-cblas.h
+    include/blas-wrapper.h
+    include/clBLAS-wrapper.h
+    include/cmdline.h
+    include/BlasBase.h
+    include/common.h
+    include/BlasBase.h
+    include/gemm.h
+    include/trmm.h
+    include/tpmv.h
+    include/trsm.h
+    include/gemv.h
+    include/symv.h
+    include/spmv.h
+    include/syr2k.h
+    include/syrk.h
+    include/trmv.h
+    include/trsv.h
+	include/symm.h
+	include/ger.h
+    include/gerc.h
+    include/syr.h
+    include/spr.h
+    include/her.h
+	include/her2.h
+    include/syr2.h
+	include/hemv.h
+	include/hpmv.h
+	include/hemm.h
+	include/herk.h
+    include/tpsv.h
+    include/hpr.h
+	include/hpr2.h
+    include/spr2.h
+    include/gbmv.h
+    include/hbmv.h
+    include/tbmv.h
+    include/tbsv.h
+	include/copy.h
+    include/sbmv.h
+    include/dot.h
+    include/dotc.h
+    include/her2k.h
+    include/scal.h
+	include/swap.h
+    include/axpy.h
+    include/rotg.h
+    include/rotm.h
+	include/rot.h
+    include/asum.h
+    include/rotmg.h
+    include/nrm2.h
+    include/iamax.h
+	include/blas-math.h
+    include/blas-random.h
+    include/matrix.h
+    include/timer.h
+)
+
+set(CORR_HEADERS
+    correctness/blas-lapack.h
+    correctness/trsm-delta.h
+    correctness/tcase-filter.h
+    correctness/delta.h
+	correctness/trsv-delta.h
+)
+
+set(PERF_HEADERS
+    performance/PerformanceTest.h
+    performance/PerformanceRecorder.h
+)
+
+set(FUNC_HEADERS
+   functional/func.h
+)
+
+# Setup Visual Studio file tabs
+source_group(correctness FILES ${SRC_CORR} ${CORR_HEADERS})
+source_group(performance FILES ${SRC_PERF} ${PERF_HEADERS})
+source_group(functional  FILES ${SRC_FUNC} ${FUNC_HEADERS})
+
+# FIXME: it's a temporary solution to workaround segfault in clGetProgramInfo()
+# at paramVal = CL_PROGRAM_BINARIES and several devices in the context
+add_definitions( -DTEST_WITH_SINGLE_DEVICE )
+
+# Having problems on build server, compiling gtest headers with -pedantic; disabling detection of long long
+# http://code.google.com/p/googletest/issues/detail?id=334
+if( CMAKE_COMPILER_IS_GNUCXX )
+	add_definitions( -Wno-long-long )
+endif( )
+
+if( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" )
+	message( STATUS "Detected PGI Fortran compiler." )
+
+	# By default, -Mipa=fast is used, and this does not mix well with the cl compiler
+	string( REPLACE "-Mipa=fast" "" CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE}" )
+	# The input above is quoted: unquoted, an empty flag string would drop the argument
+	# In windows, dynamically link to the C runtime, and tell fortran linker to not include default main subroutine
+	if( WIN32 )
+		set( CMAKE_EXE_LINKER_FLAGS "-Bdynamic -Mnostartup ${CMAKE_EXE_LINKER_FLAGS}" )
+	endif( )
+endif( )
+
+# Library with functions for time measurement. In Windows they are included automatically
+if(UNIX)
+    set(TIME_LIBRARY "rt")
+    set(THREAD_LIBRARY "pthread")
+endif()
+
+# This logic supports the build server, if it compiles the runtime separately from the test programs
+# It stitches together a path to a previously built static library, based on our 'make install' logic
+# Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else
+get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
+if( LIB64 )
+	set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib64" )
+else( )
+	set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib32" )
+endif( )
+
+if( WIN32 )
+	set( runtime.library "${runtime.library}/import/clBLAS${CMAKE_STATIC_LIBRARY_SUFFIX}" )
+else( )
+	set( runtime.library "${runtime.library}/${CMAKE_SHARED_LIBRARY_PREFIX}clBLAS${CMAKE_SHARED_LIBRARY_SUFFIX}" )
+endif( )
+
+if( GTEST_FOUND )
+    if( CORR_TEST_WITH_ACML AND ACML_FOUND )
+		include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
+			${clBLAS_SOURCE_DIR} ${ACML_INCLUDE_DIRS}
+			${clBLAS_SOURCE_DIR}/tests/include  ${clBLAS_SOURCE_DIR}/include)
+
+		add_definitions(-DCORR_TEST_WITH_ACML)
+	    
+	    add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
+									    ${CORR_HEADERS} ${TESTS_HEADERS})
+
+	    add_executable(test-medium ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
+							      ${CORR_HEADERS} ${TESTS_HEADERS})
+	    set_target_properties(test-medium PROPERTIES COMPILE_DEFINITIONS MEDIUM_TESTS)
+
+	    add_executable(test-short ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
+							      ${CORR_HEADERS} ${TESTS_HEADERS})
+	    set_target_properties(test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS)
+
+	    # The build server builds the library with gcc 4.1.2 to support Red Hat 5.5, but the test programs must be built with 
+	    # gcc > 4.3.2 to support ACML.  
+	    # If the runtime is being built by the project, use it, otherwise link to a runtime library specified in the install prefix
+	    if( BUILD_RUNTIME )
+		    target_link_libraries(test-correctness ${ACML_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+		    target_link_libraries(test-medium ${ACML_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+		    target_link_libraries(test-short ${ACML_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+	    else( )
+		    target_link_libraries(test-correctness
+			    ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library})
+		    target_link_libraries(test-medium
+			    ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library})
+		    target_link_libraries(test-short
+			    ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library})
+	    endif( )
+	else( )
+		# Link against the netlib reference library
+		include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
+			${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include)
+
+		add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
+						${CORR_HEADERS} ${TESTS_HEADERS})
+
+		add_executable(test-medium ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
+					  ${CORR_HEADERS} ${TESTS_HEADERS})
+		set_target_properties( test-medium PROPERTIES COMPILE_DEFINITIONS MEDIUM_TESTS )
+
+		add_executable(test-short ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
+					  ${CORR_HEADERS} ${TESTS_HEADERS})
+		set_target_properties( test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS )
+
+		if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
+			set_target_properties( test-correctness PROPERTIES LINKER_LANGUAGE Fortran )
+			set_target_properties( test-medium PROPERTIES LINKER_LANGUAGE Fortran )
+			set_target_properties( test-short PROPERTIES LINKER_LANGUAGE Fortran )
+		endif( )
+		
+	    if( BUILD_RUNTIME )
+			if( NETLIB_FOUND )
+				target_link_libraries(test-correctness ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-medium ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-short ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+			else( )  # no netlib: link the libraries listed in the BLAS_LIBRARIES variable
+				target_link_libraries(test-correctness ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-medium ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-short ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+			endif( )
+		else( )
+			if( NETLIB_FOUND )
+				target_link_libraries(test-correctness ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-medium ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-short ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+			else( )  # no netlib: link the libraries listed in the BLAS_LIBRARIES variable
+				target_link_libraries(test-correctness ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-medium ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-short ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+			endif( )
+		endif( )
+    endif( )
+    
+    set_property( TARGET test-correctness PROPERTY FOLDER "Test")
+    set_property( TARGET test-medium PROPERTY FOLDER "Test")
+    set_property( TARGET test-short PROPERTY FOLDER "Test")
+
+    if( TARGET_PLATFORM EQUAL 64 )
+	    # CPack configuration; include the executable into the package
+	    install( TARGETS test-correctness test-medium test-short
+			    RUNTIME DESTINATION bin64
+			    LIBRARY DESTINATION lib64
+			    ARCHIVE DESTINATION lib64/import
+			    )
+    else()
+	    # CPack configuration; include the executable into the package
+	    install( TARGETS test-correctness test-medium test-short
+			    RUNTIME DESTINATION bin32
+			    LIBRARY DESTINATION lib32
+			    ARCHIVE DESTINATION lib32/import
+			    )
+    endif()
+
+	if( ACML_FOUND )
+		include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
+			${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include  ${clBLAS_SOURCE_DIR}/include)
+
+		add_definitions(-DPERF_TEST_WITH_ACML)
+		include_directories(${ACML_INCLUDE_DIRS})
+		add_executable(test-performance ${SRC_PERF} ${SRC_COMMON}
+			${SRC_COMMON_TIMER} ${PERF_HEADERS} ${TESTS_HEADERS}
+			${SRC_COMMON_REFIMPL})
+		target_link_libraries(test-performance ${ACML_LIBRARIES})
+
+
+		if( BUILD_RUNTIME )
+			target_link_libraries(test-performance ${GTEST_LIBRARIES} ${TIME_LIBRARY} clBLAS)
+		else()
+			target_link_libraries( test-performance ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${OPENCL_LIBRARIES} ${runtime.library} )
+		endif()
+
+        set_property( TARGET test-performance PROPERTY FOLDER "Test")
+
+		if( TARGET_PLATFORM EQUAL 64 )
+			# CPack configuration; include the executable into the package
+			install( TARGETS test-performance
+					RUNTIME DESTINATION bin64
+					LIBRARY DESTINATION lib64
+					ARCHIVE DESTINATION lib64/import
+					)
+		else()
+			# CPack configuration; include the executable into the package
+			install( TARGETS test-performance
+					RUNTIME DESTINATION bin32
+					LIBRARY DESTINATION lib32
+					ARCHIVE DESTINATION lib32/import
+					)
+		endif()
+	endif()
+
+	include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
+		${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include )
+
+	add_executable(test-functional ${SRC_FUNC} ${SRC_COMMON} ${SRC_COMMON_TIMER}
+								  ${FUNC_HEADERS} ${TESTS_HEADERS})
+								  
+	if( BUILD_RUNTIME )
+		target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} clBLAS)
+	else()
+		target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} ${OPENCL_LIBRARIES} ${runtime.library} )
+	endif()
+
+    set_property( TARGET test-functional PROPERTY FOLDER "Test")
+
+	if( TARGET_PLATFORM EQUAL 64 )
+		# CPack configuration; include the executable into the package
+		install( TARGETS test-functional
+				RUNTIME DESTINATION bin64
+				LIBRARY DESTINATION lib64
+				ARCHIVE DESTINATION lib64/import
+				)
+	else()
+		# CPack configuration; include the executable into the package
+		install( TARGETS test-functional
+				RUNTIME DESTINATION bin32
+				LIBRARY DESTINATION lib32
+				ARCHIVE DESTINATION lib32/import
+				)
+	endif()
+endif()
diff --git a/src/tests/blas-cblas.c b/src/tests/blas-cblas.c
new file mode 100644
index 0000000..6f7e30c
--- /dev/null
+++ b/src/tests/blas-cblas.c
@@ -0,0 +1,57 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <blas-cblas.h>
+
+
+complex
+compose_complex(float x, float y)
+{
+    const complex packed = { x, y };    /* x, y fill the first two members */
+    return packed;
+}
+
+float
+complex_real(complex z)
+{
+    return z.real;  /* the `real` member of the by-value argument */
+}
+
+float
+complex_imag(complex z)
+{
+    return z.imag;  /* the `imag` member of the by-value argument */
+}
+
+doublecomplex
+compose_doublecomplex(double x, double y)
+{
+    const doublecomplex packed = { x, y };  /* x, y fill the first two members */
+    return packed;
+}
+
+double
+doublecomplex_real(doublecomplex z)
+{
+    return z.real;  /* the `real` member of the by-value argument */
+}
+
+double
+doublecomplex_imag(doublecomplex z)
+{
+    return z.imag;  /* the `imag` member of the by-value argument */
+}
diff --git a/src/tests/blas-wrapper.cpp b/src/tests/blas-wrapper.cpp
new file mode 100644
index 0000000..befb9fa
--- /dev/null
+++ b/src/tests/blas-wrapper.cpp
@@ -0,0 +1,2462 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <clBLAS.h>
+
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+
+// float overload of gemv.
+// Forwards every argument, unchanged and in order, to blasSgemv.
+void ::clMath::blas::gemv(
+    clblasOrder order, clblasTranspose transA,
+    size_t M, size_t N,
+    float alpha, const float *A, size_t lda,
+    const float *X, int incx,
+    float beta,
+    float *Y, int incy)
+{
+    blasSgemv(order, transA,
+        M, N,
+        alpha, A, lda,
+        X, incx,
+        beta,
+        Y, incy);
+}
+
+// double overload of gemv.
+// Forwards every argument, unchanged and in order, to blasDgemv.
+void ::clMath::blas::gemv(
+    clblasOrder order, clblasTranspose transA,
+    size_t M, size_t N,
+    double alpha, const double *A, size_t lda,
+    const double *X, int incx,
+    double beta,
+    double *Y, int incy)
+{
+    blasDgemv(order, transA,
+        M, N,
+        alpha, A, lda,
+        X, incx,
+        beta,
+        Y, incy);
+}
+
+// FloatComplex overload of gemv.
+// Forwards every argument, unchanged and in order, to blasCgemv.
+void ::clMath::blas::gemv(
+    clblasOrder order, clblasTranspose transA,
+    size_t M, size_t N,
+    FloatComplex alpha, const FloatComplex *A, size_t lda,
+    const FloatComplex *X, int incx,
+    FloatComplex beta,
+    FloatComplex *Y, int incy)
+{
+    blasCgemv(order, transA,
+        M, N,
+        alpha, A, lda,
+        X, incx,
+        beta,
+        Y, incy);
+}
+
+// DoubleComplex overload of gemv.
+// Forwards every argument, unchanged and in order, to blasZgemv.
+void ::clMath::blas::gemv(
+    clblasOrder order, clblasTranspose transA,
+    size_t M, size_t N,
+    DoubleComplex alpha, const DoubleComplex *A, size_t lda,
+    const DoubleComplex *X, int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y, int incy)
+{
+    blasZgemv(order, transA,
+        M, N,
+        alpha, A, lda,
+        X, incx,
+        beta,
+        Y, incy);
+}
+
+// float overload of symv: forwards every argument, in order, to blasSsymv.
+void ::clMath::blas::symv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha, const float *A, size_t lda,
+    const float *X, int incx,
+    float beta,
+    float *Y, int incy)
+{
+    blasSsymv(order, uplo,
+        N,
+        alpha, A, lda,
+        X, incx,
+        beta,
+        Y, incy);
+}
+
+// double overload of symv: forwards every argument, in order, to blasDsymv.
+void ::clMath::blas::symv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha, const double *A, size_t lda,
+    const double *X, int incx,
+    double beta,
+    double *Y, int incy)
+{
+    blasDsymv(order, uplo,
+        N,
+        alpha, A, lda,
+        X, incx,
+        beta,
+        Y, incy);
+}
+
+// float overload of gemm.
+// Forwards every argument, unchanged and in order, to blasSgemm.
+void ::clMath::blas::gemm(
+    clblasOrder order,
+    clblasTranspose transA, clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    float alpha,
+    const float *A, size_t lda,
+    const float *B, size_t ldb,
+    float beta, float *C, size_t ldc)
+{
+    blasSgemm(order,
+        transA, transB,
+        M, N, K,
+        alpha,
+        A, lda,
+        B, ldb,
+        beta, C, ldc);
+}
+
+// double overload of gemm.
+// Forwards every argument, unchanged and in order, to blasDgemm.
+void ::clMath::blas::gemm(
+    clblasOrder order,
+    clblasTranspose transA, clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    double alpha,
+    const double *A, size_t lda,
+    const double *B, size_t ldb,
+    double beta, double *C, size_t ldc)
+{
+    blasDgemm(order,
+        transA, transB,
+        M, N, K,
+        alpha,
+        A, lda,
+        B, ldb,
+        beta, C, ldc);
+}
+
+// FloatComplex overload of gemm.
+// Forwards every argument, unchanged and in order, to blasCgemm.
+void ::clMath::blas::gemm(
+    clblasOrder order,
+    clblasTranspose transA, clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A, size_t lda,
+    const FloatComplex *B, size_t ldb,
+    FloatComplex beta, FloatComplex *C, size_t ldc)
+{
+    blasCgemm(order,
+        transA, transB,
+        M, N, K,
+        alpha,
+        A, lda,
+        B, ldb,
+        beta, C, ldc);
+}
+
+// DoubleComplex overload of gemm.
+// Forwards every argument, unchanged and in order, to blasZgemm.
+void ::clMath::blas::gemm(
+    clblasOrder order,
+    clblasTranspose transA, clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A, size_t lda,
+    const DoubleComplex *B, size_t ldb,
+    DoubleComplex beta, DoubleComplex *C, size_t ldc)
+{
+    blasZgemm(order,
+        transA, transB,
+        M, N, K,
+        alpha,
+        A, lda,
+        B, ldb,
+        beta, C, ldc);
+}
+
+// float overload of trmm.
+// Forwards every argument, unchanged and in order, to blasStrmm.
+void ::clMath::blas::trmm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    float alpha, const float *A, size_t lda,
+    float *B, size_t ldb)
+{
+    blasStrmm(order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha, A, lda,
+        B, ldb);
+}
+
+// double overload of trmm.
+// Forwards every argument, unchanged and in order, to blasDtrmm.
+void ::clMath::blas::trmm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    double alpha, const double *A, size_t lda,
+    double *B, size_t ldb)
+{
+    blasDtrmm(order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha, A, lda,
+        B, ldb);
+}
+
+// FloatComplex overload of trmm.
+// Forwards every argument, unchanged and in order, to blasCtrmm.
+void ::clMath::blas::trmm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    FloatComplex alpha, const FloatComplex *A, size_t lda,
+    FloatComplex *B, size_t ldb)
+{
+    blasCtrmm(order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha, A, lda,
+        B, ldb);
+}
+
+// DoubleComplex overload of trmm.
+// Forwards every argument, unchanged and in order, to blasZtrmm.
+void ::clMath::blas::trmm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    DoubleComplex alpha, const DoubleComplex *A, size_t lda,
+    DoubleComplex *B, size_t ldb)
+{
+    blasZtrmm(order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha, A, lda,
+        B, ldb);
+}
+
+// float overload of trsm.
+// Forwards every argument, unchanged and in order, to blasStrsm.
+void ::clMath::blas::trsm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    float alpha, const float *A, size_t lda,
+    float *B, size_t ldb)
+{
+    blasStrsm(order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha, A, lda,
+        B, ldb);
+}
+
+// double overload of trsm.
+// Forwards every argument, unchanged and in order, to blasDtrsm.
+void ::clMath::blas::trsm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    double alpha, const double *A, size_t lda,
+    double *B, size_t ldb)
+{
+    blasDtrsm(order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha, A, lda,
+        B, ldb);
+}
+
+// FloatComplex overload of trsm.
+// Forwards every argument, unchanged and in order, to blasCtrsm.
+void ::clMath::blas::trsm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    FloatComplex alpha, const FloatComplex *A, size_t lda,
+    FloatComplex *B, size_t ldb)
+{
+    blasCtrsm(order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha, A, lda,
+        B, ldb);
+}
+
+// DoubleComplex overload of trsm.
+// Forwards every argument, unchanged and in order, to blasZtrsm.
+void ::clMath::blas::trsm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    DoubleComplex alpha, const DoubleComplex *A, size_t lda,
+    DoubleComplex *B, size_t ldb)
+{
+    blasZtrsm(order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha, A, lda,
+        B, ldb);
+}
+
+// float overload of syr2k: forwards every argument, in order, to blasSsyr2k.
+void ::clMath::blas::syr2k(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    float alpha,
+    const float *A, size_t lda,
+    const float *B, size_t ldb,
+    float beta, float *C, size_t ldc)
+{
+    blasSsyr2k(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        B, ldb,
+        beta, C, ldc);
+}
+
+// double overload of syr2k: forwards every argument, in order, to blasDsyr2k.
+void ::clMath::blas::syr2k(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    double alpha,
+    const double *A, size_t lda,
+    const double *B, size_t ldb,
+    double beta, double *C, size_t ldc)
+{
+    blasDsyr2k(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        B, ldb,
+        beta, C, ldc);
+}
+
+// FloatComplex overload of syr2k: forwards every argument, in order, to blasCsyr2k.
+void ::clMath::blas::syr2k(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A, size_t lda,
+    const FloatComplex *B, size_t ldb,
+    FloatComplex beta, FloatComplex *C, size_t ldc)
+{
+    blasCsyr2k(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        B, ldb,
+        beta, C, ldc);
+}
+
+// DoubleComplex overload of syr2k: forwards every argument, in order, to blasZsyr2k.
+void ::clMath::blas::syr2k(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A, size_t lda,
+    const DoubleComplex *B, size_t ldb,
+    DoubleComplex beta, DoubleComplex *C, size_t ldc)
+{
+    blasZsyr2k(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        B, ldb,
+        beta, C, ldc);
+}
+
+// float overload of syrk: forwards every argument, in order, to blasSsyrk.
+void ::clMath::blas::syrk(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    float alpha,
+    const float *A, size_t lda,
+    float beta, float *C, size_t ldc)
+{
+    blasSsyrk(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        beta, C, ldc);
+}
+
+// double overload of syrk: forwards every argument, in order, to blasDsyrk.
+void ::clMath::blas::syrk(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    double alpha,
+    const double *A, size_t lda,
+    double beta, double *C, size_t ldc)
+{
+    blasDsyrk(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        beta, C, ldc);
+}
+
+// FloatComplex overload of syrk: forwards every argument, in order, to blasCsyrk.
+void ::clMath::blas::syrk(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A, size_t lda,
+    FloatComplex beta, FloatComplex *C, size_t ldc)
+{
+    blasCsyrk(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        beta, C, ldc);
+}
+
+// DoubleComplex overload of syrk: forwards every argument, in order, to blasZsyrk.
+void ::clMath::blas::syrk(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A, size_t lda,
+    DoubleComplex beta, DoubleComplex *C, size_t ldc)
+{
+    blasZsyrk(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        beta, C, ldc);
+}
+
+// float overload of trmv: forwards every argument, in order, to blasStrmv.
+void ::clMath::blas::trmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    float *A, size_t offa, size_t lda,
+    float *X, size_t offx, int incx)
+{
+    blasStrmv(order,
+        uplo,
+        transA, diag,
+        N,
+        A, offa, lda,
+        X, offx, incx);
+}
+
+// double overload of trmv: forwards every argument, in order, to blasDtrmv.
+void ::clMath::blas::trmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    double *A, size_t offa, size_t lda,
+    double *X, size_t offx, int incx)
+{
+    blasDtrmv(order,
+        uplo,
+        transA, diag,
+        N,
+        A, offa, lda,
+        X, offx, incx);
+}
+
+// FloatComplex overload of trmv: forwards every argument, in order, to blasCtrmv.
+void ::clMath::blas::trmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *X, size_t offx, int incx)
+{
+    blasCtrmv(order,
+        uplo,
+        transA, diag,
+        N,
+        A, offa, lda,
+        X, offx, incx);
+}
+
+// DoubleComplex overload of trmv.
+// Forwards every argument, unchanged and in order, to blasZtrmv.
+void ::clMath::blas::trmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *X, size_t offx, int incx)
+{
+    blasZtrmv(order,
+        uplo,
+        transA, diag,
+        N,
+        A, offa, lda,
+        X, offx, incx);
+}
+
+//TPMV
+// float overload of tpmv.
+// Forwards every argument, unchanged and in order, to blasStpmv.
+void ::clMath::blas::tpmv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    float *AP, size_t offa,
+    float *X, size_t offx, int incx)
+{
+    blasStpmv(order, uplo,
+        transA, diag,
+        N,
+        AP, offa,
+        X, offx, incx);
+}
+
+// double overload of tpmv.
+// Forwards every argument, unchanged and in order, to blasDtpmv.
+void ::clMath::blas::tpmv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    double *AP, size_t offa,
+    double *X, size_t offx, int incx)
+{
+    blasDtpmv(order, uplo,
+        transA, diag,
+        N,
+        AP, offa,
+        X, offx, incx);
+}
+
+// FloatComplex overload of tpmv.
+// Forwards every argument, unchanged and in order, to blasCtpmv.
+void ::clMath::blas::tpmv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    FloatComplex *AP, size_t offa,
+    FloatComplex *X, size_t offx, int incx)
+{
+    blasCtpmv(order, uplo,
+        transA, diag,
+        N,
+        AP, offa,
+        X, offx, incx);
+}
+
+// DoubleComplex overload of tpmv.
+// Forwards every argument, unchanged and in order, to blasZtpmv.
+// The wrapper itself does no other work.
+void ::clMath::blas::tpmv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    DoubleComplex *AP, size_t offa,
+    DoubleComplex *X, size_t offx, int incx)
+{
+    blasZtpmv(order, uplo,
+        transA, diag,
+        N,
+        AP, offa,
+        X, offx, incx);
+}
+
+
+// float overload of trsv: forwards every argument, in order, to blasStrsv.
+void ::clMath::blas::trsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    float *A, size_t offa, size_t lda,
+    float *X, size_t offx, int incx)
+{
+    blasStrsv(order,
+        uplo,
+        transA, diag,
+        N,
+        A, offa, lda,
+        X, offx, incx);
+}
+
+// double overload of trsv: forwards every argument, in order, to blasDtrsv.
+void ::clMath::blas::trsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    double *A, size_t offa, size_t lda,
+    double *X, size_t offx, int incx)
+{
+    blasDtrsv(order,
+        uplo,
+        transA, diag,
+        N,
+        A, offa, lda,
+        X, offx, incx);
+}
+
+// FloatComplex overload of trsv: forwards every argument, in order, to blasCtrsv.
+void ::clMath::blas::trsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *X, size_t offx, int incx)
+{
+    blasCtrsv(order,
+        uplo,
+        transA, diag,
+        N,
+        A, offa, lda,
+        X, offx, incx);
+}
+
+// DoubleComplex overload of trsv.
+// Forwards every argument, unchanged and in order, to blasZtrsv.
+void ::clMath::blas::trsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *X, size_t offx, int incx)
+{
+    blasZtrsv(order,
+        uplo,
+        transA, diag,
+        N,
+        A, offa, lda,
+        X, offx, incx);
+}
+
+// float overload of tpsv.
+// Forwards every argument, unchanged and in order, to blasStpsv.
+void ::clMath::blas::tpsv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    float *A, size_t offa,
+    float *X, size_t offx, int incx)
+{
+    blasStpsv(order, uplo,
+        transA, diag,
+        N,
+        A, offa,
+        X, offx, incx);
+}
+
+// double overload of tpsv.
+// Forwards every argument, unchanged and in order, to blasDtpsv.
+void ::clMath::blas::tpsv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    double *A, size_t offa,
+    double *X, size_t offx, int incx)
+{
+    blasDtpsv(order, uplo,
+        transA, diag,
+        N,
+        A, offa,
+        X, offx, incx);
+}
+
+// FloatComplex overload of tpsv.
+// Forwards every argument, unchanged and in order, to blasCtpsv.
+void ::clMath::blas::tpsv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    FloatComplex *A, size_t offa,
+    FloatComplex *X, size_t offx, int incx)
+{
+    blasCtpsv(order, uplo,
+        transA, diag,
+        N,
+        A, offa,
+        X, offx, incx);
+}
+
+// DoubleComplex overload of tpsv.
+// Forwards every argument, unchanged and in order, to blasZtpsv.
+// The wrapper itself does no other work.
+void ::clMath::blas::tpsv(
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t N,
+    DoubleComplex *A, size_t offa,
+    DoubleComplex *X, size_t offx, int incx)
+{
+    blasZtpsv(order, uplo,
+        transA, diag,
+        N,
+        A, offa,
+        X, offx, incx);
+}
+
+// float overload of symm.
+// Forwards every argument, unchanged and in order, to blasSsymm.
+void ::clMath::blas::symm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    size_t M, size_t N,
+    float alpha,
+    float *A, size_t offa, size_t lda,
+    float *B, size_t offb, size_t ldb,
+    float beta,
+    float *C, size_t offc, size_t ldc)
+{
+    blasSsymm(order,
+        side, uplo,
+        M, N,
+        alpha,
+        A, offa, lda,
+        B, offb, ldb,
+        beta,
+        C, offc, ldc);
+}
+
+// double overload of symm.
+// Forwards every argument, unchanged and in order, to blasDsymm.
+void ::clMath::blas::symm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    size_t M, size_t N,
+    double alpha,
+    double *A, size_t offa, size_t lda,
+    double *B, size_t offb, size_t ldb,
+    double beta,
+    double *C, size_t offc, size_t ldc)
+{
+    blasDsymm(order,
+        side, uplo,
+        M, N,
+        alpha,
+        A, offa, lda,
+        B, offb, ldb,
+        beta,
+        C, offc, ldc);
+}
+
+// FloatComplex overload of symm.
+// Forwards every argument, unchanged and in order, to blasCsymm.
+void ::clMath::blas::symm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    size_t M, size_t N,
+    FloatComplex alpha,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *B, size_t offb, size_t ldb,
+    FloatComplex beta,
+    FloatComplex *C, size_t offc, size_t ldc)
+{
+    blasCsymm(order,
+        side, uplo,
+        M, N,
+        alpha,
+        A, offa, lda,
+        B, offb, ldb,
+        beta,
+        C, offc, ldc);
+}
+
+// DoubleComplex overload of symm.
+// Forwards every argument, unchanged and in order, to blasZsymm.
+void ::clMath::blas::symm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    size_t M, size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *B, size_t offb, size_t ldb,
+    DoubleComplex beta,
+    DoubleComplex *C, size_t offc, size_t ldc)
+{
+    blasZsymm(order,
+        side, uplo,
+        M, N,
+        alpha,
+        A, offa, lda,
+        B, offb, ldb,
+        beta,
+        C, offc, ldc);
+}
+
+// float overload of ger.
+// Forwards every argument, unchanged and in order, to blasSger.
+// The wrapper itself does no other work.
+void ::clMath::blas::ger(
+    clblasOrder order,
+    size_t M, size_t N,
+    float alpha,
+    float *x, size_t offx, int incx,
+    float *y, size_t offy, int incy,
+    float *A, size_t offa, size_t lda)
+{
+    blasSger(order,
+        M, N,
+        alpha,
+        x, offx, incx,
+        y, offy, incy,
+        A, offa, lda);
+}
+
+
+// double overload of ger.
+// Forwards every argument, unchanged and in order, to blasDger.
+// The wrapper itself does no other work.
+void ::clMath::blas::ger(
+    clblasOrder order,
+    size_t M, size_t N,
+    double alpha,
+    double *x, size_t offx, int incx,
+    double *y, size_t offy, int incy,
+    double *A, size_t offa, size_t lda)
+{
+    blasDger(order,
+        M, N,
+        alpha,
+        x, offx, incx,
+        y, offy, incy,
+        A, offa, lda);
+}
+
+
+// FloatComplex overload of ger.
+// Forwards every argument, unchanged and in order, to blasCgeru
+// (note the routine called is the "geru" one, not a "ger" one).
+void ::clMath::blas::ger(
+    clblasOrder order,
+    size_t M, size_t N,
+    FloatComplex alpha,
+    FloatComplex *x, size_t offx, int incx,
+    FloatComplex *y, size_t offy, int incy,
+    FloatComplex *A, size_t offa, size_t lda)
+{
+    blasCgeru(order,
+        M, N,
+        alpha,
+        x, offx, incx,
+        y, offy, incy,
+        A, offa, lda);
+}
+
+
+// DoubleComplex overload of ger.
+// Forwards every argument, unchanged and in order, to blasZgeru
+// (note the routine called is the "geru" one, not a "ger" one).
+void ::clMath::blas::ger(
+    clblasOrder order,
+    size_t M, size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *x, size_t offx, int incx,
+    DoubleComplex *y, size_t offy, int incy,
+    DoubleComplex *A, size_t offa, size_t lda)
+{
+    blasZgeru(order,
+        M, N,
+        alpha,
+        x, offx, incx,
+        y, offy, incy,
+        A, offa, lda);
+}
+
+// FloatComplex overload of gerc.
+// Forwards every argument, unchanged and in order, to blasCgerc.
+// The wrapper itself does no other work.
+void ::clMath::blas::gerc(
+    clblasOrder order,
+    size_t M, size_t N,
+    FloatComplex alpha,
+    FloatComplex *x, size_t offx, int incx,
+    FloatComplex *y, size_t offy, int incy,
+    FloatComplex *A, size_t offa, size_t lda)
+{
+    blasCgerc(order,
+        M, N,
+        alpha,
+        x, offx, incx,
+        y, offy, incy,
+        A, offa, lda);
+}
+
+
+// DoubleComplex overload of gerc.
+// Forwards every argument, unchanged and in order, to blasZgerc.
+// The wrapper itself does no other work.
+void ::clMath::blas::gerc(
+    clblasOrder order,
+    size_t M, size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *x, size_t offx, int incx,
+    DoubleComplex *y, size_t offy, int incy,
+    DoubleComplex *A, size_t offa, size_t lda)
+{
+    blasZgerc(order,
+        M, N,
+        alpha,
+        x, offx, incx,
+        y, offy, incy,
+        A, offa, lda);
+}
+
+
+
+// float overload of syr.
+// Forwards every argument, unchanged and in order, to blasSsyr.
+void ::clMath::blas::syr(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha,
+    float *X, size_t offx, int incx,
+    float *A, size_t offa, size_t lda)
+{
+    blasSsyr(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        A, offa, lda);
+}
+
+// double overload of syr.
+// Forwards every argument, unchanged and in order, to blasDsyr.
+void ::clMath::blas::syr(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha,
+    double *X, size_t offx, int incx,
+    double *A, size_t offa, size_t lda)
+{
+    blasDsyr(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        A, offa, lda);
+}
+
+//SPR
+// float overload of spr: forwards every argument, in order, to blasSspr.
+void ::clMath::blas::spr(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha,
+    float *X, size_t offx, int incx,
+    float *AP, size_t offa)
+{
+    blasSspr(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        AP, offa);
+}
+
+// double overload of spr: forwards every argument, in order, to blasDspr.
+void ::clMath::blas::spr(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha,
+    double *X, size_t offx, int incx,
+    double *AP, size_t offa)
+{
+    blasDspr(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        AP, offa);
+}
+
+// FloatComplex overload of her (alpha is a plain float).
+// Forwards every argument, unchanged and in order, to blasCher.
+void ::clMath::blas::her(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha,
+    FloatComplex *x, size_t offx, int incx,
+    FloatComplex *A, size_t offa, size_t lda)
+{
+    blasCher(order, uplo,
+        N,
+        alpha,
+        x, offx, incx,
+        A, offa, lda);
+}
+
+
+// DoubleComplex overload of her (alpha is a plain double).
+// Forwards every argument, unchanged and in order, to blasZher.
+void ::clMath::blas::her(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha,
+    DoubleComplex *x, size_t offx, int incx,
+    DoubleComplex *A, size_t offa, size_t lda)
+{
+    blasZher(order, uplo,
+        N,
+        alpha,
+        x, offx, incx,
+        A, offa, lda);
+}
+
+
+// float overload of syr2.
+// Forwards every argument, unchanged and in order, to blasSsyr2.
+// The wrapper itself does no other work.
+void ::clMath::blas::syr2(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha,
+    float *X, size_t offx, int incx,
+    float *Y, size_t offy, int incy,
+    float *A, size_t offa, size_t lda)
+{
+    blasSsyr2(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        A, offa, lda);
+}
+
+// double overload of syr2.
+// Forwards every argument, unchanged and in order, to blasDsyr2.
+// The wrapper itself does no other work.
+void ::clMath::blas::syr2(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha,
+    double *X, size_t offx, int incx,
+    double *Y, size_t offy, int incy,
+    double *A, size_t offa, size_t lda)
+{
+    blasDsyr2(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        A, offa, lda);
+}
+
+//HER2
+// FloatComplex overload of her2.
+// Forwards every argument, unchanged and in order, to blasCher2.
+// The wrapper itself does no other work.
+void ::clMath::blas::her2(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex *X, size_t offx, int incx,
+    FloatComplex *Y, size_t offy, int incy,
+    FloatComplex *A, size_t offa, size_t lda)
+{
+    blasCher2(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        A, offa, lda);
+}
+
+// DoubleComplex overload of her2.
+// Forwards every argument, unchanged and in order, to blasZher2.
+// The wrapper itself does no other work.
+void ::clMath::blas::her2(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *X, size_t offx, int incx,
+    DoubleComplex *Y, size_t offy, int incy,
+    DoubleComplex *A, size_t offa, size_t lda)
+{
+    blasZher2(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        A, offa, lda);
+}
+
+
+// FloatComplex overload of hemv.
+// Forwards every argument, unchanged and in order, to blasChemv.
+void ::clMath::blas::hemv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *X, size_t offx, int incx,
+    FloatComplex beta,
+    FloatComplex *Y, size_t offy, int incy)
+{
+    blasChemv(order, uplo,
+        N,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+// DoubleComplex overload of hemv.
+// Forwards every argument, unchanged and in order, to blasZhemv.
+void ::clMath::blas::hemv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *X, size_t offx, int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y, size_t offy, int incy)
+{
+    blasZhemv(order, uplo,
+        N,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+//HEMM
+// FloatComplex overload of hemm.
+// Forwards every argument, unchanged and in order, to blasChemm.
+void ::clMath::blas::hemm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    size_t M, size_t N,
+    FloatComplex alpha,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *B, size_t offb, size_t ldb,
+    FloatComplex beta,
+    FloatComplex *C, size_t offc, size_t ldc)
+{
+    blasChemm(order,
+        side, uplo,
+        M, N,
+        alpha,
+        A, offa, lda,
+        B, offb, ldb,
+        beta,
+        C, offc, ldc);
+}
+
+// DoubleComplex overload of hemm.
+// Forwards every argument, unchanged and in order, to blasZhemm.
+void ::clMath::blas::hemm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    size_t M, size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *B, size_t offb, size_t ldb,
+    DoubleComplex beta,
+    DoubleComplex *C, size_t offc, size_t ldc)
+{
+    blasZhemm(order,
+        side, uplo,
+        M, N,
+        alpha,
+        A, offa, lda,
+        B, offb, ldb,
+        beta,
+        C, offc, ldc);
+}
+
+// FloatComplex overload of herk: forwards every argument, in order, to blasCherk.
+void ::clMath::blas::herk(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    float alpha,
+    const FloatComplex *A, size_t lda,
+    float beta, FloatComplex *C, size_t ldc)
+{
+    blasCherk(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        beta, C, ldc);
+}
+
+// DoubleComplex overload of herk: forwards every argument, in order, to blasZherk.
+void ::clMath::blas::herk(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    double alpha,
+    const DoubleComplex *A, size_t lda,
+    double beta, DoubleComplex *C, size_t ldc)
+{
+    blasZherk(order,
+        uplo, transA,
+        N, K,
+        alpha,
+        A, lda,
+        beta, C, ldc);
+}
+
+
+// float overload of spmv: forwards every argument, in order, to blasSspmv.
+void ::clMath::blas::spmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const float *A, size_t offa,
+    const float *X, size_t offx, int incx,
+    float beta,
+    float *Y, size_t offy, int incy)
+{
+    blasSspmv(order, uplo,
+        N,
+        alpha,
+        A, offa,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+// double overload of spmv: forwards every argument, in order, to blasDspmv.
+void ::clMath::blas::spmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const double *A, size_t offa,
+    const double *X, size_t offx, int incx,
+    double beta,
+    double *Y, size_t offy, int incy)
+{
+    blasDspmv(order, uplo,
+        N,
+        alpha,
+        A, offa,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+// FloatComplex overload of hpmv: forwards every argument, in order, to blasChpmv.
+void ::clMath::blas::hpmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex *A, size_t offa,
+    FloatComplex *X, size_t offx, int incx,
+    FloatComplex beta,
+    FloatComplex *Y, size_t offy, int incy)
+{
+    blasChpmv(order, uplo,
+        N,
+        alpha,
+        A, offa,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+// DoubleComplex overload of hpmv: forwards every argument, in order, to blasZhpmv.
+void ::clMath::blas::hpmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *A, size_t offa,
+    DoubleComplex *X, size_t offx, int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y, size_t offy, int incy)
+{
+    blasZhpmv(order, uplo,
+        N,
+        alpha,
+        A, offa,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+// FloatComplex overload of hpr: forwards every argument, in order, to blasChpr.
+void ::clMath::blas::hpr(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha,
+    FloatComplex *x, size_t offx, int incx,
+    FloatComplex *AP, size_t offa)
+{
+    blasChpr(order, uplo,
+        N,
+        alpha,
+        x, offx, incx,
+        AP, offa);
+}
+
+
+// DoubleComplex overload of hpr: forwards every argument, in order, to blasZhpr.
+void ::clMath::blas::hpr(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha,
+    DoubleComplex *x, size_t offx, int incx,
+    DoubleComplex *AP, size_t offa)
+{
+    blasZhpr(order, uplo,
+        N,
+        alpha,
+        x, offx, incx,
+        AP, offa);
+}
+
+// float overload of spr2.
+// Forwards every argument, unchanged and in order, to blasSspr2.
+void ::clMath::blas::spr2(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha,
+    float *X, size_t offx, int incx,
+    float *Y, size_t offy, int incy,
+    float *AP, size_t offa)
+{
+    blasSspr2(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        AP, offa);
+}
+
+// double overload of spr2.
+// Forwards every argument, unchanged and in order, to blasDspr2.
+void ::clMath::blas::spr2(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha,
+    double *X, size_t offx, int incx,
+    double *Y, size_t offy, int incy,
+    double *AP, size_t offa)
+{
+    blasDspr2(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        AP, offa);
+}
+
+// FloatComplex overload of hpr2.
+// Forwards every argument, unchanged and in order, to blasChpr2.
+void ::clMath::blas::hpr2(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex *X, size_t offx, int incx,
+    FloatComplex *Y, size_t offy, int incy,
+    FloatComplex *AP, size_t offa)
+{
+    blasChpr2(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        AP, offa);
+}
+
+// DoubleComplex overload of hpr2.
+// Forwards every argument, unchanged and in order, to blasZhpr2.
+void ::clMath::blas::hpr2(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *X, size_t offx, int incx,
+    DoubleComplex *Y, size_t offy, int incy,
+    DoubleComplex *AP, size_t offa)
+{
+    blasZhpr2(order, uplo,
+        N,
+        alpha,
+        X, offx, incx,
+        Y, offy, incy,
+        AP, offa);
+}
+
+// float overload of gbmv.
+// Forwards every argument, unchanged and in order, to blasSgbmv.
+void clMath::blas::gbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M, size_t N,
+    size_t KL, size_t KU,
+    float alpha,
+    float *A, size_t offa, size_t lda,
+    float *X, size_t offx, int incx,
+    float beta,
+    float *Y, size_t offy, int incy)
+{
+    blasSgbmv(order,
+        trans,
+        M, N,
+        KL, KU,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+// double overload of gbmv.
+// Forwards every argument, unchanged and in order, to blasDgbmv.
+void clMath::blas::gbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M, size_t N,
+    size_t KL, size_t KU,
+    double alpha,
+    double *A, size_t offa, size_t lda,
+    double *X, size_t offx, int incx,
+    double beta,
+    double *Y, size_t offy, int incy)
+{
+    blasDgbmv(order,
+        trans,
+        M, N,
+        KL, KU,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+// FloatComplex overload of gbmv.
+// Forwards every argument, unchanged and in order, to blasCgbmv.
+void clMath::blas::gbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M, size_t N,
+    size_t KL, size_t KU,
+    FloatComplex alpha,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *X, size_t offx, int incx,
+    FloatComplex beta,
+    FloatComplex *Y, size_t offy, int incy)
+{
+    blasCgbmv(order,
+        trans,
+        M, N,
+        KL, KU,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+
+// DoubleComplex overload of gbmv.
+// Forwards every argument, unchanged and in order, to blasZgbmv.
+void clMath::blas::gbmv(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M, size_t N,
+    size_t KL, size_t KU,
+    DoubleComplex alpha,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *X, size_t offx, int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y, size_t offy, int incy)
+{
+    blasZgbmv(order,
+        trans,
+        M, N,
+        KL, KU,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy);
+}
+//TBMV
+
+// float overload of tbmv: a pure pass-through to blasStbmv.
+void clMath::blas::tbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    float *A,
+    size_t offa,
+    size_t lda,
+    float *X,
+    size_t offx,
+    int incx)
+{
+    blasStbmv(order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx);
+}
+
+// double overload of tbmv: a pure pass-through to blasDtbmv.
+void clMath::blas::tbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    double *A,
+    size_t offa,
+    size_t lda,
+    double *X,
+    size_t offx,
+    int incx)
+{
+    blasDtbmv(order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx);
+}
+
+// FloatComplex overload of tbmv: a pure pass-through to blasCtbmv.
+void clMath::blas::tbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    FloatComplex *A,
+    size_t offa,
+    size_t lda,
+    FloatComplex *X,
+    size_t offx,
+    int incx)
+{
+    blasCtbmv(order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx);
+}
+
+// DoubleComplex overload of tbmv: a pure pass-through to blasZtbmv.
+void clMath::blas::tbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    DoubleComplex *A,
+    size_t offa,
+    size_t lda,
+    DoubleComplex *X,
+    size_t offx,
+    int incx)
+{
+    blasZtbmv(order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx);
+}
+
+//SBMV
+
+// float overload of sbmv: a pure pass-through to blasSsbmv.
+void clMath::blas::sbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    float alpha,
+    float *A,
+    size_t offa,
+    size_t lda,
+    float *X,
+    size_t offx,
+    int incx,
+    float beta,
+    float *Y,
+    size_t offy,
+    int incy)
+{
+    blasSsbmv(order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy);
+}
+
+// double overload of sbmv: a pure pass-through to blasDsbmv.
+void clMath::blas::sbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    double alpha,
+    double *A,
+    size_t offa,
+    size_t lda,
+    double *X,
+    size_t offx,
+    int incx,
+    double beta,
+    double *Y,
+    size_t offy,
+    int incy)
+{
+    blasDsbmv(order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy);
+}
+
+//HBMV
+
+// FloatComplex overload of hbmv: a pure pass-through to blasChbmv.
+void clMath::blas::hbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    FloatComplex *A,
+    size_t offa,
+    size_t lda,
+    FloatComplex *X,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    FloatComplex *Y,
+    size_t offy,
+    int incy)
+{
+    blasChbmv(order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy);
+}
+
+// DoubleComplex overload of hbmv: a pure pass-through to blasZhbmv.
+void clMath::blas::hbmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    DoubleComplex *A,
+    size_t offa,
+    size_t lda,
+    DoubleComplex *X,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y,
+    size_t offy,
+    int incy)
+{
+    blasZhbmv(order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy);
+}
+
+//TBSV
+
+// float overload of tbsv: a pure pass-through to blasStbsv.
+void clMath::blas::tbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    float *A,
+    size_t offa,
+    size_t lda,
+    float *X,
+    size_t offx,
+    int incx)
+{
+    blasStbsv(order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx);
+}
+
+// double overload of tbsv: a pure pass-through to blasDtbsv.
+void clMath::blas::tbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    double *A,
+    size_t offa,
+    size_t lda,
+    double *X,
+    size_t offx,
+    int incx)
+{
+    blasDtbsv(order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx);
+}
+
+// FloatComplex overload of tbsv: a pure pass-through to blasCtbsv.
+void clMath::blas::tbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    FloatComplex *A,
+    size_t offa,
+    size_t lda,
+    FloatComplex *X,
+    size_t offx,
+    int incx)
+{
+    blasCtbsv(order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx);
+}
+
+// DoubleComplex overload of tbsv: a pure pass-through to blasZtbsv.
+void clMath::blas::tbsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    DoubleComplex *A,
+    size_t offa,
+    size_t lda,
+    DoubleComplex *X,
+    size_t offx,
+    int incx)
+{
+    blasZtbsv(order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx);
+}
+
+// FloatComplex overload of her2k.
+// Pure pass-through: each argument reaches blasCher2k unchanged, in the
+// order it was received.
+void ::clMath::blas::her2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A, size_t offa, size_t lda,
+    const FloatComplex *B, size_t offb, size_t ldb,
+    float beta,
+    FloatComplex *C, size_t offc, size_t ldc)
+{
+    blasCher2k(order, uplo, transA, N, K,
+               alpha,
+               A, offa, lda,
+               B, offb, ldb,
+               beta,
+               C, offc, ldc);
+}
+
+// DoubleComplex overload of her2k.
+// Pure pass-through: each argument reaches blasZher2k unchanged, in the
+// order it was received.
+void ::clMath::blas::her2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A, size_t offa, size_t lda,
+    const DoubleComplex *B, size_t offb, size_t ldb,
+    double beta,
+    DoubleComplex *C, size_t offc, size_t ldc)
+{
+    blasZher2k(order, uplo, transA, N, K,
+               alpha,
+               A, offa, lda,
+               B, offb, ldb,
+               beta,
+               C, offc, ldc);
+}
+
+//copy
+// float overload of copy.
+void ::clMath::blas::copy(
+    size_t N,
+    float *X,
+    size_t offx,
+    int incx,
+    float *Y,
+    size_t offy,
+    int incy)
+{
+    // Pure pass-through: the arguments reach blasScopy unchanged.
+    blasScopy(N, X, offx, incx, Y, offy, incy);
+}
+
+// double overload of copy.
+void ::clMath::blas::copy(
+    size_t N,
+    double *X,
+    size_t offx,
+    int incx,
+    double *Y,
+    size_t offy,
+    int incy)
+{
+    // Pure pass-through: the arguments reach blasDcopy unchanged.
+    blasDcopy(N, X, offx, incx, Y, offy, incy);
+}
+
+// FloatComplex overload of copy.
+void ::clMath::blas::copy(
+    size_t N,
+    FloatComplex *X,
+    size_t offx,
+    int incx,
+    FloatComplex *Y,
+    size_t offy,
+    int incy)
+{
+    // Pure pass-through: the arguments reach blasCcopy unchanged.
+    blasCcopy(N, X, offx, incx, Y, offy, incy);
+}
+
+// DoubleComplex overload of copy.
+void ::clMath::blas::copy(
+    size_t N,
+    DoubleComplex *X,
+    size_t offx,
+    int incx,
+    DoubleComplex *Y,
+    size_t offy,
+    int incy)
+{
+    // Pure pass-through: the arguments reach blasZcopy unchanged.
+    blasZcopy(N, X, offx, incx, Y, offy, incy);
+}
+
+
+//swap
+
+// float overload of swap.
+void clMath::blas::swap(
+    size_t N,
+    float *X,
+    size_t offBX,
+    int incx,
+    float *Y,
+    size_t offCY,
+    int incy)
+{
+    // Pure pass-through: the arguments reach blasSswap unchanged.
+    blasSswap(N, X, offBX, incx, Y, offCY, incy);
+}
+
+// double overload of swap.
+void clMath::blas::swap(
+    size_t N,
+    double *X,
+    size_t offBX,
+    int incx,
+    double *Y,
+    size_t offCY,
+    int incy)
+{
+    // Pure pass-through: the arguments reach blasDswap unchanged.
+    blasDswap(N, X, offBX, incx, Y, offCY, incy);
+}
+
+// FloatComplex overload of swap.
+void clMath::blas::swap(
+    size_t N,
+    FloatComplex *X,
+    size_t offBX,
+    int incx,
+    FloatComplex *Y,
+    size_t offCY,
+    int incy)
+{
+    // Pure pass-through: the arguments reach blasCswap unchanged.
+    blasCswap(N, X, offBX, incx, Y, offCY, incy);
+}
+
+// DoubleComplex overload of swap.
+void clMath::blas::swap(
+    size_t N,
+    DoubleComplex *X,
+    size_t offBX,
+    int incx,
+    DoubleComplex *Y,
+    size_t offCY,
+    int incy)
+{
+    // Pure pass-through: the arguments reach blasZswap unchanged.
+    blasZswap(N, X, offBX, incx, Y, offCY, incy);
+}
+
+// float overload of scal: forwards N, alpha, X, offx and incx to blasSscal.
+void ::clMath::blas::scal(
+    bool is_css_zds,
+    size_t N,
+    float alpha,
+    float *X,
+    size_t offx,
+    int incx)
+{
+    (void)is_css_zds;   // unused here (only the complex overloads branch on it); the void cast replaces a self-assignment
+    blasSscal(N, alpha, X, offx, incx);
+}
+
+// double overload of scal: forwards N, alpha, X, offx and incx to blasDscal.
+void ::clMath::blas::scal(
+    bool is_css_zds,
+    size_t N,
+    double alpha,
+    double *X,
+    size_t offx,
+    int incx)
+{
+    (void)is_css_zds;   // unused here (only the complex overloads branch on it); the void cast replaces a self-assignment
+    blasDscal(N, alpha, X, offx, incx);
+}
+
+// FloatComplex overload of scal: is_css_zds selects blasCsscal (given CREAL(alpha)) over blasCscal.
+void ::clMath::blas::scal(
+    bool is_css_zds,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex *X,
+    size_t offx,
+    int incx)
+{
+    if (!is_css_zds) {
+        blasCscal(N, alpha, X, offx, incx);
+        return;
+    }
+    blasCsscal(N, CREAL(alpha), X, offx, incx);
+}
+
+// DoubleComplex overload of scal: is_css_zds selects blasZdscal (given CREAL(alpha)) over blasZscal.
+void ::clMath::blas::scal(
+    bool is_css_zds,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex *X,
+    size_t offx,
+    int incx)
+{
+    if (!is_css_zds) {
+        blasZscal(N, alpha, X, offx, incx);
+        return;
+    }
+    blasZdscal(N, CREAL(alpha), X, offx, incx);
+}
+
+//DOT
+// float overload of dot.
+float clMath::blas::dot(
+    size_t N,
+    float *X,
+    size_t offx,
+    int incx,
+    float *Y,
+    size_t offy,
+    int incy)
+{
+    // Whatever blasSdot returns is handed straight back to the caller.
+    return blasSdot(N, X, offx, incx, Y, offy, incy);
+}
+
+// double overload of dot.
+double clMath::blas::dot(
+    size_t N,
+    double *X,
+    size_t offx,
+    int incx,
+    double *Y,
+    size_t offy,
+    int incy)
+{
+    // Whatever blasDdot returns is handed straight back to the caller.
+    return blasDdot(N, X, offx, incx, Y, offy, incy);
+}
+
+// FloatComplex overload of dot; this one is routed to blasCdotu.
+FloatComplex clMath::blas::dot(
+    size_t N,
+    FloatComplex *X,
+    size_t offx,
+    int incx,
+    FloatComplex *Y,
+    size_t offy,
+    int incy)
+{
+    // Whatever blasCdotu returns is handed straight back to the caller.
+    return blasCdotu(N, X, offx, incx, Y, offy, incy);
+}
+
+// DoubleComplex overload of dot; this one is routed to blasZdotu.
+DoubleComplex clMath::blas::dot(
+    size_t N,
+    DoubleComplex *X,
+    size_t offx,
+    int incx,
+    DoubleComplex *Y,
+    size_t offy,
+    int incy)
+{
+    // Whatever blasZdotu returns is handed straight back to the caller.
+    return blasZdotu(N, X, offx, incx, Y, offy, incy);
+}
+
+//ASUM
+
+// float overload of asum.
+float clMath::blas::asum(
+    size_t N,
+    float *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasSasum returns is handed straight back to the caller.
+    return blasSasum(N, X, offx, incx);
+}
+
+// double overload of asum.
+double clMath::blas::asum(
+    size_t N,
+    double *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasDasum returns is handed straight back to the caller.
+    return blasDasum(N, X, offx, incx);
+}
+
+// FloatComplex overload of asum; the result type is float.
+float clMath::blas::asum(
+    size_t N,
+    FloatComplex *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasScasum returns is handed straight back to the caller.
+    return blasScasum(N, X, offx, incx);
+}
+
+// DoubleComplex overload of asum; the result type is double.
+double clMath::blas::asum(
+    size_t N,
+    DoubleComplex *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasDzasum returns is handed straight back to the caller.
+    return blasDzasum(N, X, offx, incx);
+}
+
+//DOTC
+// FloatComplex overload of dotc.
+FloatComplex clMath::blas::dotc(
+    size_t N,
+    FloatComplex *X,
+    size_t offx,
+    int incx,
+    FloatComplex *Y,
+    size_t offy,
+    int incy)
+{
+    // Whatever blasCdotc returns is handed straight back to the caller.
+    return blasCdotc(N, X, offx, incx, Y, offy, incy);
+}
+
+// DoubleComplex overload of dotc.
+DoubleComplex clMath::blas::dotc(
+    size_t N,
+    DoubleComplex *X,
+    size_t offx,
+    int incx,
+    DoubleComplex *Y,
+    size_t offy,
+    int incy)
+{
+    // Whatever blasZdotc returns is handed straight back to the caller.
+    return blasZdotc(N, X, offx, incx, Y, offy, incy);
+}
+
+
+
+
+//axpy calls
+// float overload of axpy: a pure pass-through to blasSaxpy.
+void clMath::blas::axpy(
+    size_t N,
+    float alpha,
+    const float *X,
+    size_t offBX,
+    int incx,
+    float *Y,
+    size_t offCY,
+    int incy)
+{
+    blasSaxpy(N, alpha, X, offBX, incx, Y, offCY, incy);
+}
+
+// double overload of axpy: a pure pass-through to blasDaxpy.
+void clMath::blas::axpy(
+    size_t N,
+    double alpha,
+    const double *X,
+    size_t offBX,
+    int incx,
+    double *Y,
+    size_t offCY,
+    int incy)
+{
+    blasDaxpy(N, alpha, X, offBX, incx, Y, offCY, incy);
+}
+
+// FloatComplex overload of axpy: a pure pass-through to blasCaxpy.
+void clMath::blas::axpy(
+    size_t N,
+    FloatComplex alpha,
+    const FloatComplex *X,
+    size_t offBX,
+    int incx,
+    FloatComplex *Y,
+    size_t offCY,
+    int incy)
+{
+    blasCaxpy(N, alpha, X, offBX, incx, Y, offCY, incy);
+}
+
+// DoubleComplex overload of axpy: a pure pass-through to blasZaxpy.
+void clMath::blas::axpy(
+    size_t N,
+    DoubleComplex alpha,
+    const DoubleComplex *X,
+    size_t offBX,
+    int incx,
+    DoubleComplex *Y,
+    size_t offCY,
+    int incy)
+{
+    blasZaxpy(N, alpha, X, offBX, incx, Y, offCY, incy);
+}
+
+// float overload of rotg: a pure pass-through to blasSrotg.
+void clMath::blas::rotg(
+    float *SA,
+    size_t offSA,
+    float *SB,
+    size_t offSB,
+    float *C,
+    size_t offC,
+    float *S,
+    size_t offS)
+{
+    blasSrotg(SA, offSA, SB, offSB, C, offC, S, offS);
+}
+
+// double overload of rotg: a pure pass-through to blasDrotg.
+void clMath::blas::rotg(
+    double *SA,
+    size_t offSA,
+    double *SB,
+    size_t offSB,
+    double *C,
+    size_t offC,
+    double *S,
+    size_t offS)
+{
+    blasDrotg(SA, offSA, SB, offSB, C, offC, S, offS);
+}
+
+// FloatComplex overload of rotg (C is a float pointer here): a pure pass-through to blasCrotg.
+void clMath::blas::rotg(
+    FloatComplex *SA,
+    size_t offSA,
+    FloatComplex *SB,
+    size_t offSB,
+    float *C,
+    size_t offC,
+    FloatComplex *S,
+    size_t offS)
+{
+    blasCrotg(SA, offSA, SB, offSB, C, offC, S, offS);
+}
+
+// DoubleComplex overload of rotg (C is a double pointer here): a pure pass-through to blasZrotg.
+void clMath::blas::rotg(
+    DoubleComplex *SA,
+    size_t offSA,
+    DoubleComplex *SB,
+    size_t offSB,
+    double *C,
+    size_t offC,
+    DoubleComplex *S,
+    size_t offS)
+{
+    blasZrotg(SA, offSA, SB, offSB, C, offC, S, offS);
+}
+
+// float overload of rotmg: a pure pass-through to blasSrotmg.
+void clMath::blas::rotmg(
+    float *D1,
+    size_t offD1,
+    float *D2,
+    size_t offD2,
+    float *X1,
+    size_t offX1,
+    const float *Y1,
+    size_t offY1,
+    float *PARAM,
+    size_t offParam)
+{
+    blasSrotmg(D1, offD1, D2, offD2, X1, offX1,
+               Y1, offY1, PARAM, offParam);
+}
+
+// double overload of rotmg: a pure pass-through to blasDrotmg.
+void clMath::blas::rotmg(
+    double *D1,
+    size_t offD1,
+    double *D2,
+    size_t offD2,
+    double *X1,
+    size_t offX1,
+    const double *Y1,
+    size_t offY1,
+    double *PARAM,
+    size_t offParam)
+{
+    blasDrotmg(D1, offD1, D2, offD2, X1, offX1,
+               Y1, offY1, PARAM, offParam);
+}
+
+// float overload of rotm: a pure pass-through to blasSrotm.
+void clMath::blas::rotm(
+    size_t N,
+    float *X,
+    size_t offx,
+    int incx,
+    float *Y,
+    size_t offy,
+    int incy,
+    float *PARAM,
+    size_t offParam)
+{
+    blasSrotm(N, X, offx, incx,
+              Y, offy, incy, PARAM, offParam);
+}
+
+// double overload of rotm: a pure pass-through to blasDrotm.
+void clMath::blas::rotm(
+    size_t N,
+    double *X,
+    size_t offx,
+    int incx,
+    double *Y,
+    size_t offy,
+    int incy,
+    double *PARAM,
+    size_t offParam)
+{
+    blasDrotm(N, X, offx, incx,
+              Y, offy, incy, PARAM, offParam);
+}
+//rot
+// float overload of rot: a pure pass-through to blasSrot.
+void clMath::blas::rot(
+    size_t N,
+    float *X,
+    size_t offx,
+    int incx,
+    float *Y,
+    size_t offy,
+    int incy,
+    float C,
+    float S)
+{
+    blasSrot(N, X, offx, incx,
+             Y, offy, incy, C, S);
+}
+
+// double overload of rot: a pure pass-through to blasDrot.
+void clMath::blas::rot(
+    size_t N,
+    double *X,
+    size_t offx,
+    int incx,
+    double *Y,
+    size_t offy,
+    int incy,
+    double C,
+    double S)
+{
+    blasDrot(N, X, offx, incx,
+             Y, offy, incy, C, S);
+}
+
+// FloatComplex overload of rot: calls blasCsrot, passing CREAL(C) and CREAL(S) in place of C and S.
+void clMath::blas::rot(
+    size_t N,
+    FloatComplex *X,
+    size_t offx,
+    int incx,
+    FloatComplex *Y,
+    size_t offy,
+    int incy,
+    FloatComplex C,
+    FloatComplex S)
+{
+    blasCsrot(N, X, offx, incx,
+              Y, offy, incy, CREAL(C), CREAL(S));
+}
+
+// DoubleComplex overload of rot: calls blasZdrot, passing CREAL(C) and CREAL(S) in place of C and S.
+void clMath::blas::rot(
+    size_t N,
+    DoubleComplex *X,
+    size_t offx,
+    int incx,
+    DoubleComplex *Y,
+    size_t offy,
+    int incy,
+    DoubleComplex C,
+    DoubleComplex S)
+{
+    blasZdrot(N, X, offx, incx,
+              Y, offy, incy, CREAL(C), CREAL(S));
+}
+
+// float overload of iamax: returns whatever blasiSamax returns.
+int clMath::blas::iamax(
+    size_t N,
+    float *X,
+    size_t offx,
+    int incx)
+{
+    return blasiSamax(N, X, offx, incx);
+}
+
+// double overload of iamax.
+int clMath::blas::iamax(
+    size_t N,
+    double *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasiDamax returns is handed straight back to the caller.
+    return blasiDamax(N, X, offx, incx);
+}
+
+// FloatComplex overload of iamax.
+int clMath::blas::iamax(
+    size_t N,
+    FloatComplex *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasiCamax returns is handed straight back to the caller.
+    return blasiCamax(N, X, offx, incx);
+}
+
+// DoubleComplex overload of iamax.
+int clMath::blas::iamax(
+    size_t N,
+    DoubleComplex *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasiZamax returns is handed straight back to the caller.
+    return blasiZamax(N, X, offx, incx);
+}
+
+
+// float overload of nrm2.
+float clMath::blas::nrm2(
+    size_t N,
+    float *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasSnrm2 returns is handed straight back to the caller.
+    return blasSnrm2(N, X, offx, incx);
+}
+
+// double overload of nrm2.
+double clMath::blas::nrm2(
+    size_t N,
+    double *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasDnrm2 returns is handed straight back to the caller.
+    return blasDnrm2(N, X, offx, incx);
+}
+
+// FloatComplex overload of nrm2; the result type is float.
+float clMath::blas::nrm2(
+    size_t N,
+    FloatComplex *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasScnrm2 returns is handed straight back to the caller.
+    return blasScnrm2(N, X, offx, incx);
+}
+
+// DoubleComplex overload of nrm2; the result type is double.
+double clMath::blas::nrm2(
+    size_t N,
+    DoubleComplex *X,
+    size_t offx,
+    int incx)
+{
+    // Whatever blasDznrm2 returns is handed straight back to the caller.
+    return blasDznrm2(N, X, offx, incx);
+}
diff --git a/src/tests/blas.c b/src/tests/blas.c
new file mode 100644
index 0000000..54a0fbe
--- /dev/null
+++ b/src/tests/blas.c
@@ -0,0 +1,4966 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>         /* abort() */
+#include <stdio.h>          /* fprintf(), stderr */
+
+#include <clBLAS.h>
+#include <blas-internal.h>
+#include <common.h>
+
+#if defined CORR_TEST_WITH_ACML
+#include <acml.h>
+#else
+#include <blas-cblas.h>
+#endif
+
+/* float wrapper around sgemv(); only clblasColumnMajor input is accepted. */
+void blasSgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t lda,
+    const float *X,
+    int incx,
+    float beta,
+    float *Y,
+    int incy)
+{
+    char transCode;
+    int mInt, nInt, ldaInt;
+
+    /* Any other order is reported on stderr and the process aborts. */
+    if (clblasColumnMajor != order) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    transCode = encodeTranspose(transA);
+    mInt = (int)M;
+    nInt = (int)N;
+    ldaInt = (int)lda;
+
+    /* sizes are narrowed to int above; A and X are passed with const cast away */
+    sgemv(transCode, mInt, nInt, alpha, (float*)A, ldaInt, (float*)X, incx, beta, Y, incy);
+}
+
+/* double wrapper around dgemv(); only clblasColumnMajor input is accepted. */
+void blasDgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t lda,
+    const double *X,
+    int incx,
+    double beta,
+    double *Y,
+    int incy)
+{
+    char transCode;
+    int mInt, nInt, ldaInt;
+
+    /* Any other order is reported on stderr and the process aborts. */
+    if (clblasColumnMajor != order) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    transCode = encodeTranspose(transA);
+    mInt = (int)M;
+    nInt = (int)N;
+    ldaInt = (int)lda;
+
+    /* sizes are narrowed to int above; A and X are passed with const cast away */
+    dgemv(transCode, mInt, nInt, alpha, (double*)A, ldaInt, (double*)X, incx, beta, Y, incy);
+}
+
+void
+blasCgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    const FloatComplex *X,
+    int incx,
+    FloatComplex beta,
+    FloatComplex *Y,
+    int incy)
+{   /* FloatComplex wrapper around cgemv(): order check, alpha/beta repackaging, then the call. */
+    char fTransA;
+    int fM, fN;
+    int fLDA;
+    complex *fA, *fX, *fY;
+    complex fAlpha, fBeta;
+#if 0   /* disabled: buffer sizes for the copy-based conversion path further down */
+    size_t sizeA, sizeX, sizeY;
+    size_t i;
+
+    sizeA = lda * N; //column major
+
+    if (transA == clblasNoTrans) {
+        sizeX = (N - 1) * abs(incx) + 1;
+        sizeY = (M - 1) * abs(incy) + 1;
+    } else {
+        sizeX = (M - 1) * abs(incx) + 1;
+        sizeY = (N - 1) * abs(incy) + 1;
+    }
+#endif
+    /* Only clblasColumnMajor is accepted; any other order prints a message and aborts. */
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fTransA = encodeTranspose(transA);
+    fM = (int)M;
+    fN = (int)N;
+    fLDA = (int)lda;
+    /* Rebuild alpha and beta as the callee's complex type from their CREAL/CIMAG parts. */
+    fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha));
+    fBeta = compose_complex(CREAL(beta), CIMAG(beta));
+
+#if 0   /* disabled: allocate and fill converted copies of A, X and Y */
+    fA = (complex*)calloc(sizeA, sizeof(complex));
+    if (fA == NULL) {
+        return;
+    }
+    fX = (complex*)calloc(sizeX, sizeof(complex));
+    if (fX == NULL) {
+        free(fA);
+        return;
+    }
+    fY = (complex*)calloc(sizeY, sizeof(complex));
+    if (fY == NULL) {
+        free(fX);
+        free(fA);
+        return;
+    }
+
+    for (i = 0; i < sizeA; i++) {
+        fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i]));
+    }
+    for (i = 0; i < sizeX; i++) {
+        fX[i] = compose_complex(CREAL(X[i]), CIMAG(X[i]));
+    }
+    for (i = 0; i < sizeY; i++) {
+        fY[i] = compose_complex(CREAL(Y[i]), CIMAG(Y[i]));
+    }
+#else   /* live path: pass the caller's arrays through pointer casts (A and X lose const) */
+    fA = (complex*)A;
+    fX = (complex*)X;
+    fY = (complex*)Y;
+#endif
+    cgemv(fTransA, fM, fN,
+        &fAlpha, fA, fLDA, fX, incx, &fBeta, fY, incy);
+#if 0   /* disabled: copy the result back into Y and release the temporaries */
+    for (i = 0; i < sizeY; i++) {
+        Y[i] = floatComplex(complex_real(fY[i]), complex_imag(fY[i]));
+    }
+    free(fY);
+    free(fX);
+    free(fA);
+#endif
+}
+
+void
+blasZgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    const DoubleComplex *X,
+    int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y,
+    int incy)
+{   /* DoubleComplex wrapper around zgemv(): order check, alpha/beta repackaging, then the call. */
+    char fTransA;
+    int fM, fN;
+    int fLDA;
+    doublecomplex *fA, *fX, *fY;
+    doublecomplex fAlpha, fBeta;
+#if 0   /* disabled: buffer sizes for the copy-based conversion path further down */
+    size_t sizeA, sizeX, sizeY;
+    size_t i;
+
+    sizeA = lda * N; //column major
+
+    if (transA == clblasNoTrans) {
+        sizeX = (N - 1) * abs(incx) + 1;
+        sizeY = (M - 1) * abs(incy) + 1;
+    } else {
+        sizeX = (M - 1) * abs(incx) + 1;
+        sizeY = (N - 1) * abs(incy) + 1;
+    }
+#endif
+    /* Only clblasColumnMajor is accepted; any other order prints a message and aborts. */
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fTransA = encodeTranspose(transA);
+    fM = (int)M;
+    fN = (int)N;
+    fLDA = (int)lda;
+    /* Rebuild alpha and beta as the callee's complex type from their CREAL/CIMAG parts. */
+    fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+#if 0   /* disabled: allocate and fill converted copies of A, X and Y */
+    fA = (doublecomplex*)calloc(sizeA, sizeof(doublecomplex));
+    if (fA == NULL) {
+        return;
+    }
+    fX = (doublecomplex*)calloc(sizeX, sizeof(doublecomplex));
+    if (fX == NULL) {
+        free(fA);
+        return;
+    }
+    fY = (doublecomplex*)calloc(sizeY, sizeof(doublecomplex));
+    if (fY == NULL) {
+        free(fX);
+        free(fA);
+        return;
+    }
+
+    for (i = 0; i < sizeA; i++) {
+        fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i]));
+    }
+    for (i = 0; i < sizeX; i++) {
+        fX[i] = compose_doublecomplex(CREAL(X[i]), CIMAG(X[i]));
+    }
+    for (i = 0; i < sizeY; i++) {
+        fY[i] = compose_doublecomplex(CREAL(Y[i]), CIMAG(Y[i]));
+    }
+#else   /* live path: pass the caller's arrays through pointer casts (A and X lose const) */
+    fA = (doublecomplex*)A;
+    fX = (doublecomplex*)X;
+    fY = (doublecomplex*)Y;
+#endif
+    zgemv(fTransA, fM, fN,
+        &fAlpha, fA, fLDA, fX, incx, &fBeta, fY, incy);
+#if 0   /* disabled: copy the result back into Y and release the temporaries */
+    for (i = 0; i < sizeY; i++) {
+        Y[i] = doubleComplex(
+            doublecomplex_real(fY[i]),
+            doublecomplex_imag(fY[i]));
+    }
+    free(fY);
+    free(fX);
+    free(fA);
+#endif
+}
+
+/* float wrapper around ssymv(); only clblasColumnMajor input is accepted. */
+void blasSsymv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t lda,
+    const float *X,
+    int incx,
+    float beta,
+    float *Y,
+    int incy)
+{
+    char uploCode;
+    int nInt, ldaInt;
+
+    /* Any other order is reported on stderr and the process aborts. */
+    if (clblasColumnMajor != order) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+    nInt = (int)N;
+    ldaInt = (int)lda;
+
+    ssymv(uploCode, nInt, alpha, (float*)A, ldaInt, (float*)X, incx, beta, Y, incy);
+}
+
+/* double wrapper around dsymv(); only clblasColumnMajor input is accepted. */
+void blasDsymv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t lda,
+    const double *X,
+    int incx,
+    double beta,
+    double *Y,
+    int incy)
+{
+    char uploCode;
+    int nInt, ldaInt;
+
+    /* Any other order is reported on stderr and the process aborts. */
+    if (clblasColumnMajor != order) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+    nInt = (int)N;
+    ldaInt = (int)lda;
+
+    dsymv(uploCode, nInt, alpha, (double*)A, ldaInt, (double*)X, incx, beta, Y, incy);
+}
+
+/* float wrapper around sgemm(); only clblasColumnMajor input is accepted. */
+void blasSgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    float alpha,
+    const float *A,
+    size_t lda,
+    const float *B,
+    size_t ldb,
+    float beta,
+    float *C,
+    size_t ldc)
+{
+    char transCodeA, transCodeB;
+    int mInt, nInt, kInt, ldaInt, ldbInt, ldcInt;
+
+    /* Any other order is reported on stderr and the process aborts. */
+    if (clblasColumnMajor != order) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    transCodeA = encodeTranspose(transA);
+    transCodeB = encodeTranspose(transB);
+    mInt = (int)M;
+    nInt = (int)N;
+    kInt = (int)K;
+    ldaInt = (int)lda;
+    ldbInt = (int)ldb;
+    ldcInt = (int)ldc;
+
+    sgemm(transCodeA, transCodeB, mInt, nInt, kInt,
+          alpha, (float*)A, ldaInt, (float*)B, ldbInt, beta, C, ldcInt);
+}
+
+/* double wrapper around dgemm(); only clblasColumnMajor input is accepted. */
+void blasDgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    double alpha,
+    const double *A,
+    size_t lda,
+    const double *B,
+    size_t ldb,
+    double beta,
+    double *C,
+    size_t ldc)
+{
+    char transCodeA, transCodeB;
+    int mInt, nInt, kInt, ldaInt, ldbInt, ldcInt;
+
+    /* Any other order is reported on stderr and the process aborts. */
+    if (clblasColumnMajor != order) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    transCodeA = encodeTranspose(transA);
+    transCodeB = encodeTranspose(transB);
+    mInt = (int)M;
+    nInt = (int)N;
+    kInt = (int)K;
+    ldaInt = (int)lda;
+    ldbInt = (int)ldb;
+    ldcInt = (int)ldc;
+
+    dgemm(transCodeA, transCodeB, mInt, nInt, kInt,
+          alpha, (double*)A, ldaInt, (double*)B, ldbInt, beta, C, ldcInt);
+}
+
+void
+blasCgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    const FloatComplex *B,
+    size_t ldb,
+    FloatComplex beta,
+    FloatComplex *C,
+    size_t ldc)
+{   /* FloatComplex wrapper around cgemm(): order check, alpha/beta repackaging, then the call. */
+    char fTransA, fTransB;
+    int fM, fN, fK;
+    int fLDA, fLDB, fLDC;
+    complex *fA, *fB, *fC;
+    complex fAlpha, fBeta;
+#if 0   /* disabled: matrix extents for the copy-based conversion path further down */
+    size_t ma, ka, nb, kb, mc, nc;
+    size_t i;
+
+    if (transA == clblasNoTrans) {
+        ma = lda;
+        ka = K;
+    } else {
+        ka = lda;
+        ma = M;
+    }
+    if (transB == clblasNoTrans) {
+        kb = ldb;
+        nb = N;
+    } else {
+        nb = ldb;
+        kb = K;
+    }
+    mc = ldc;
+    nc = N;
+#endif
+    /* Only clblasColumnMajor is accepted; any other order prints a message and aborts. */
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fTransA = encodeTranspose(transA);
+    fTransB = encodeTranspose(transB);
+    fM = (int)M;
+    fN = (int)N;
+    fK = (int)K;
+    fLDA = (int)lda;
+    fLDB = (int)ldb;
+    fLDC = (int)ldc;
+    /* Rebuild alpha and beta as the callee's complex type from their CREAL/CIMAG parts. */
+    fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha));
+    fBeta = compose_complex(CREAL(beta), CIMAG(beta));
+#if 0   /* disabled: allocate and fill converted copies of A, B and C */
+    fA = (complex*)calloc(ma * ka, sizeof(complex));
+    if (fA == NULL) {
+        return;
+    }
+    fB = (complex*)calloc(kb * nb, sizeof(complex));
+    if (fB == NULL) {
+        free(fA);
+        return;
+    }
+    fC = (complex*)calloc(mc * nc, sizeof(complex));
+    if (fC == NULL) {
+        free(fB);
+        free(fA);
+        return;
+    }
+
+    for (i = 0; i < ma * ka; i++) {
+        fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i]));
+    }
+    for (i = 0; i < kb * nb; i++) {
+        fB[i] = compose_complex(CREAL(B[i]), CIMAG(B[i]));
+    }
+    for (i = 0; i < mc * nc; i++) {
+        fC[i] = compose_complex(CREAL(C[i]), CIMAG(C[i]));
+    }
+#else   /* live path: pass the caller's arrays through pointer casts (A and B lose const) */
+    fA = (complex*)A;
+    fB = (complex*)B;
+    fC = (complex*)C;
+#endif
+    cgemm(fTransA, fTransB, fM, fN, fK,
+        &fAlpha, fA, fLDA, fB, fLDB, &fBeta, fC, fLDC);
+#if 0   /* disabled: copy the result back into C and release the temporaries */
+    for (i = 0; i < mc * nc; i++) {
+        C[i] = floatComplex(complex_real(fC[i]), complex_imag(fC[i]));
+    }
+    free(fC);
+    free(fB);
+    free(fA);
+#endif
+}
+
+void
+blasZgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    const DoubleComplex *B,
+    size_t ldb,
+    DoubleComplex beta,
+    DoubleComplex *C,
+    size_t ldc)
+{   /* DoubleComplex wrapper around zgemm(): order check, alpha/beta repackaging, then the call. */
+    char fTransA, fTransB;
+    int fM, fN, fK;
+    int fLDA, fLDB, fLDC;
+    doublecomplex *fA, *fB, *fC;
+    doublecomplex fAlpha, fBeta;
+#if 0   /* disabled: matrix extents for the copy-based conversion path further down */
+    size_t ma, ka, nb, kb, mc, nc;
+    size_t i;
+
+    if (transA == clblasNoTrans) {
+        ma = lda;
+        ka = K;
+    } else {
+        ka = lda;
+        ma = M;
+    }
+    if (transB == clblasNoTrans) {
+        kb = ldb;
+        nb = N;
+    } else {
+        nb = ldb;
+        kb = K;
+    }
+    mc = ldc;
+    nc = N;
+#endif
+    /* Only clblasColumnMajor is accepted; any other order prints a message and aborts. */
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fTransA = encodeTranspose(transA);
+    fTransB = encodeTranspose(transB);
+    fM = (int)M;
+    fN = (int)N;
+    fK = (int)K;
+    fLDA = (int)lda;
+    fLDB = (int)ldb;
+    fLDC = (int)ldc;
+    /* Rebuild alpha and beta as the callee's complex type from their CREAL/CIMAG parts. */
+    fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+#if 0   /* disabled: allocate and fill converted copies of A, B and C */
+    fA = (doublecomplex*)calloc(ma * ka, sizeof(doublecomplex));
+    if (fA == NULL) {
+        return;
+    }
+    fB = (doublecomplex*)calloc(kb * nb, sizeof(doublecomplex));
+    if (fB == NULL) {
+        free(fA);
+        return;
+    }
+    fC = (doublecomplex*)calloc(mc * nc, sizeof(doublecomplex));
+    if (fC == NULL) {
+        free(fB);
+        free(fA);
+        return;
+    }
+
+    for (i = 0; i < ma * ka; i++) {
+        fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i]));
+    }
+    for (i = 0; i < kb * nb; i++) {
+        fB[i] = compose_doublecomplex(CREAL(B[i]), CIMAG(B[i]));
+    }
+    for (i = 0; i < mc * nc; i++) {
+        fC[i] = compose_doublecomplex(CREAL(C[i]), CIMAG(C[i]));
+    }
+#else   /* live path: pass the caller's arrays through pointer casts (A and B lose const) */
+    fA = (doublecomplex*)A;
+    fB = (doublecomplex*)B;
+    fC = (doublecomplex*)C;
+#endif
+    zgemm(fTransA, fTransB, fM, fN, fK,
+        &fAlpha, fA, fLDA, fB, fLDB, &fBeta, fC, fLDC);
+#if 0   /* disabled: copy the result back into C and release the temporaries */
+    for (i = 0; i < mc * nc; i++) {
+        C[i] = doubleComplex(doublecomplex_real(fC[i]), doublecomplex_imag(fC[i]));
+    }
+    free(fC);
+    free(fB);
+    free(fA);
+#endif
+}
+
+/*
+ * Single-precision TRMM wrapper around strmm().
+ *
+ * The side/uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers and the size_t dimensions are narrowed to int.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void blasStrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t lda,
+    float *B,
+    size_t ldb)
+{
+    char sideCode, uploCode, transCode, diagCode;
+    int rows = (int)M;
+    int cols = (int)N;
+    int strideA = (int)lda;
+    int strideB = (int)ldb;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    sideCode = encodeSide(side);
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+    diagCode = encodeDiag(diag);
+
+    /* A is passed non-const only to match the strmm() call */
+    strmm(sideCode, uploCode, transCode, diagCode, rows, cols,
+        alpha, (float*)A, strideA, B, strideB);
+}
+
+/*
+ * Double-precision TRMM wrapper around dtrmm().
+ *
+ * The side/uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers and the size_t dimensions are narrowed to int.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void blasDtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t lda,
+    double *B,
+    size_t ldb)
+{
+    char sideCode, uploCode, transCode, diagCode;
+    int rows = (int)M;
+    int cols = (int)N;
+    int strideA = (int)lda;
+    int strideB = (int)ldb;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    sideCode = encodeSide(side);
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+    diagCode = encodeDiag(diag);
+
+    /* A is passed non-const only to match the dtrmm() call */
+    dtrmm(sideCode, uploCode, transCode, diagCode, rows, cols,
+        alpha, (double*)A, strideA, B, strideB);
+}
+
+/*
+ * Single-precision complex TRMM wrapper around ctrmm().
+ *
+ * The side/uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers, alpha is rebuilt as a `complex` value from its real and
+ * imaginary parts, and the size_t dimensions are narrowed to int.
+ * A and B are handed over through pointer casts rather than copies, so
+ * FloatComplex and `complex` are treated as layout-compatible here.
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void blasCtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    FloatComplex *B,
+    size_t ldb)
+{
+    char fSide, fUplo, fTransA, fDiag;
+    int fM = (int)M;
+    int fN = (int)N;
+    int fLDA = (int)lda;
+    int fLDB = (int)ldb;
+    /* operands are reinterpreted in place, no conversion buffers */
+    complex *fA = (complex*)A;
+    complex *fB = (complex*)B;
+    complex fAlpha;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fSide = encodeSide(side);
+    fUplo = encodeUplo(uplo);
+    fTransA = encodeTranspose(transA);
+    fDiag = encodeDiag(diag);
+
+    fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha));
+
+    ctrmm(fSide, fUplo, fTransA, fDiag, fM, fN,
+        &fAlpha, fA, fLDA, fB, fLDB);
+}
+
+/*
+ * Double-precision complex TRMM wrapper around ztrmm().
+ *
+ * The side/uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers, alpha is rebuilt as a doublecomplex value from its real
+ * and imaginary parts, and the size_t dimensions are narrowed to int.
+ * A and B are handed over through pointer casts rather than copies, so
+ * DoubleComplex and doublecomplex are treated as layout-compatible here.
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void blasZtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    DoubleComplex *B,
+    size_t ldb)
+{
+    char fSide, fUplo, fTransA, fDiag;
+    int fM = (int)M;
+    int fN = (int)N;
+    int fLDA = (int)lda;
+    int fLDB = (int)ldb;
+    /* operands are reinterpreted in place, no conversion buffers */
+    doublecomplex *fA = (doublecomplex*)A;
+    doublecomplex *fB = (doublecomplex*)B;
+    doublecomplex fAlpha;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fSide = encodeSide(side);
+    fUplo = encodeUplo(uplo);
+    fTransA = encodeTranspose(transA);
+    fDiag = encodeDiag(diag);
+
+    fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+
+    ztrmm(fSide, fUplo, fTransA, fDiag, fM, fN,
+        &fAlpha, fA, fLDA, fB, fLDB);
+}
+
+/*
+ * Single-precision TRSM wrapper around strsm().
+ *
+ * The side/uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers and the size_t dimensions are narrowed to int.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void blasStrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t lda,
+    float *B,
+    size_t ldb)
+{
+    char sideCode, uploCode, transCode, diagCode;
+    int rows = (int)M;
+    int cols = (int)N;
+    int strideA = (int)lda;
+    int strideB = (int)ldb;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    sideCode = encodeSide(side);
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+    diagCode = encodeDiag(diag);
+
+    /* A is passed non-const only to match the strsm() call */
+    strsm(sideCode, uploCode, transCode, diagCode, rows, cols,
+        alpha, (float*)A, strideA, B, strideB);
+}
+
+/*
+ * Double-precision TRSM wrapper around dtrsm().
+ *
+ * The side/uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers and the size_t dimensions are narrowed to int.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void blasDtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t lda,
+    double *B,
+    size_t ldb)
+{
+    char sideCode, uploCode, transCode, diagCode;
+    int rows = (int)M;
+    int cols = (int)N;
+    int strideA = (int)lda;
+    int strideB = (int)ldb;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    sideCode = encodeSide(side);
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+    diagCode = encodeDiag(diag);
+
+    /* A is passed non-const only to match the dtrsm() call */
+    dtrsm(sideCode, uploCode, transCode, diagCode, rows, cols,
+        alpha, (double*)A, strideA, B, strideB);
+}
+
+/*
+ * Single-precision complex TRSM wrapper around ctrsm().
+ *
+ * The side/uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers, alpha is rebuilt as a `complex` value from its real and
+ * imaginary parts, and the size_t dimensions are narrowed to int.
+ * A and B are handed over through pointer casts rather than copies, so
+ * FloatComplex and `complex` are treated as layout-compatible here.
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void blasCtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    FloatComplex *B,
+    size_t ldb)
+{
+    char fSide, fUplo, fTransA, fDiag;
+    int fM = (int)M;
+    int fN = (int)N;
+    int fLDA = (int)lda;
+    int fLDB = (int)ldb;
+    /* operands are reinterpreted in place, no conversion buffers */
+    complex *fA = (complex*)A;
+    complex *fB = (complex*)B;
+    complex fAlpha;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fSide = encodeSide(side);
+    fUplo = encodeUplo(uplo);
+    fTransA = encodeTranspose(transA);
+    fDiag = encodeDiag(diag);
+
+    fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha));
+
+    ctrsm(fSide, fUplo, fTransA, fDiag, fM, fN,
+        &fAlpha, fA, fLDA, fB, fLDB);
+}
+
+/*
+ * Double-precision complex TRSM wrapper around ztrsm().
+ *
+ * The side/uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers, alpha is rebuilt as a doublecomplex value from its real
+ * and imaginary parts, and the size_t dimensions are narrowed to int.
+ * A and B are handed over through pointer casts rather than copies, so
+ * DoubleComplex and doublecomplex are treated as layout-compatible here.
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void blasZtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    DoubleComplex *B,
+    size_t ldb)
+{
+    char fSide, fUplo, fTransA, fDiag;
+    int fM = (int)M;
+    int fN = (int)N;
+    int fLDA = (int)lda;
+    int fLDB = (int)ldb;
+    /* operands are reinterpreted in place, no conversion buffers */
+    doublecomplex *fA = (doublecomplex*)A;
+    doublecomplex *fB = (doublecomplex*)B;
+    doublecomplex fAlpha;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fSide = encodeSide(side);
+    fUplo = encodeUplo(uplo);
+    fTransA = encodeTranspose(transA);
+    fDiag = encodeDiag(diag);
+
+    fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+
+    ztrsm(fSide, fUplo, fTransA, fDiag, fM, fN,
+        &fAlpha, fA, fLDA, fB, fLDB);
+}
+
+/*
+ * Single-precision SYR2K wrapper around ssyr2k().
+ *
+ * The uplo/transpose arguments are converted to char values by the encode*()
+ * helpers and the size_t dimensions are narrowed to int.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasSsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,
+    const float *A,
+    size_t lda,
+    const float *B,
+    size_t ldb,
+    float beta,
+    float *C,
+    size_t ldc)
+{
+    char uploCode, transCode;
+    int dimN = (int)N;
+    int dimK = (int)K;
+    int strideA = (int)lda;
+    int strideB = (int)ldb;
+    int strideC = (int)ldc;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+
+    /* A and B are passed non-const only to match the ssyr2k() call */
+    ssyr2k(uploCode, transCode, dimN, dimK, alpha,
+        (float*)A, strideA, (float*)B, strideB,
+        beta, C, strideC);
+}
+
+/*
+ * Double-precision SYR2K wrapper around dsyr2k().
+ *
+ * The uplo/transpose arguments are converted to char values by the encode*()
+ * helpers and the size_t dimensions are narrowed to int.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasDsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,
+    const double *A,
+    size_t lda,
+    const double *B,
+    size_t ldb,
+    double beta,
+    double *C,
+    size_t ldc)
+{
+    char uploCode, transCode;
+    int dimN = (int)N;
+    int dimK = (int)K;
+    int strideA = (int)lda;
+    int strideB = (int)ldb;
+    int strideC = (int)ldc;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+
+    /* A and B are passed non-const only to match the dsyr2k() call */
+    dsyr2k(uploCode, transCode, dimN, dimK, alpha,
+        (double*)A, strideA, (double*)B, strideB,
+        beta, C, strideC);
+}
+
+/*
+ * Single-precision complex SYR2K wrapper around csyr2k().
+ *
+ * The uplo/transpose arguments are converted to char values by the encode*()
+ * helpers, alpha and beta are rebuilt as `complex` values from their real and
+ * imaginary parts, and the size_t dimensions are narrowed to int.
+ * A, B and C are handed over through pointer casts rather than copies, so
+ * FloatComplex and `complex` are treated as layout-compatible here.
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasCsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    const FloatComplex *B,
+    size_t ldb,
+    FloatComplex beta,
+    FloatComplex *C,
+    size_t ldc)
+{
+    char fUplo, fTransA;
+    int fN = (int)N;
+    int fK = (int)K;
+    int fLDA = (int)lda;
+    int fLDB = (int)ldb;
+    int fLDC = (int)ldc;
+    /* operands are reinterpreted in place, no conversion buffers */
+    complex *fA = (complex*)A;
+    complex *fB = (complex*)B;
+    complex *fC = (complex*)C;
+    complex fAlpha, fBeta;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fUplo = encodeUplo(uplo);
+    fTransA = encodeTranspose(transA);
+
+    fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha));
+    fBeta = compose_complex(CREAL(beta), CIMAG(beta));
+
+    csyr2k(fUplo, fTransA, fN, fK, &fAlpha, fA, fLDA,
+        fB, fLDB, &fBeta, fC, fLDC);
+}
+
+/*
+ * Double-precision complex SYR2K wrapper around zsyr2k().
+ *
+ * The uplo/transpose arguments are converted to char values by the encode*()
+ * helpers, alpha and beta are rebuilt as doublecomplex values from their real
+ * and imaginary parts, and the size_t dimensions are narrowed to int.
+ * A, B and C are handed over through pointer casts rather than copies, so
+ * DoubleComplex and doublecomplex are treated as layout-compatible here.
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasZsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    const DoubleComplex *B,
+    size_t ldb,
+    DoubleComplex beta,
+    DoubleComplex *C,
+    size_t ldc)
+{
+    char fUplo, fTransA;
+    int fN = (int)N;
+    int fK = (int)K;
+    int fLDA = (int)lda;
+    int fLDB = (int)ldb;
+    int fLDC = (int)ldc;
+    /* operands are reinterpreted in place, no conversion buffers */
+    doublecomplex *fA = (doublecomplex*)A;
+    doublecomplex *fB = (doublecomplex*)B;
+    doublecomplex *fC = (doublecomplex*)C;
+    doublecomplex fAlpha, fBeta;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fUplo = encodeUplo(uplo);
+    fTransA = encodeTranspose(transA);
+
+    fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+
+    zsyr2k(fUplo, fTransA, fN, fK, &fAlpha, fA, fLDA,
+        fB, fLDB, &fBeta, fC, fLDC);
+}
+
+
+/*
+ * Single-precision SYRK wrapper around ssyrk().
+ *
+ * The uplo/transpose arguments are converted to char values by the encode*()
+ * helpers and the size_t dimensions are narrowed to int.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasSsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,
+    const float *A,
+    size_t lda,
+    float beta,
+    float *C,
+    size_t ldc)
+{
+    char uploCode, transCode;
+    int dimN = (int)N;
+    int dimK = (int)K;
+    int strideA = (int)lda;
+    int strideC = (int)ldc;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+
+    /* A is passed non-const only to match the ssyrk() call */
+    ssyrk(uploCode, transCode, dimN, dimK, alpha, (float*)A, strideA,
+        beta, C, strideC);
+}
+
+/*
+ * Double-precision SYRK wrapper around dsyrk().
+ *
+ * The uplo/transpose arguments are converted to char values by the encode*()
+ * helpers and the size_t dimensions are narrowed to int.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasDsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,
+    const double *A,
+    size_t lda,
+    double beta,
+    double *C,
+    size_t ldc)
+{
+    char uploCode, transCode;
+    int dimN = (int)N;
+    int dimK = (int)K;
+    int strideA = (int)lda;
+    int strideC = (int)ldc;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+
+    /* A is passed non-const only to match the dsyrk() call */
+    dsyrk(uploCode, transCode, dimN, dimK, alpha, (double*)A, strideA,
+        beta, C, strideC);
+}
+
+/*
+ * Single-precision complex SYRK wrapper around csyrk().
+ *
+ * The uplo/transpose arguments are converted to char values by the encode*()
+ * helpers, alpha and beta are rebuilt as `complex` values from their real and
+ * imaginary parts, and the size_t dimensions are narrowed to int.
+ * A and C are handed over through pointer casts rather than copies, so
+ * FloatComplex and `complex` are treated as layout-compatible here.
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasCsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    FloatComplex beta,
+    FloatComplex *C,
+    size_t ldc)
+{
+    char fUplo, fTransA;
+    int fN = (int)N;
+    int fK = (int)K;
+    int fLDA = (int)lda;
+    int fLDC = (int)ldc;
+    /* operands are reinterpreted in place, no conversion buffers */
+    complex *fA = (complex*)A;
+    complex *fC = (complex*)C;
+    complex fAlpha, fBeta;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fUplo = encodeUplo(uplo);
+    fTransA = encodeTranspose(transA);
+
+    fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha));
+    fBeta = compose_complex(CREAL(beta), CIMAG(beta));
+
+    csyrk(fUplo, fTransA, fN, fK, &fAlpha, fA, fLDA,
+        &fBeta, fC, fLDC);
+}
+
+/*
+ * Double-precision complex SYRK wrapper around zsyrk().
+ *
+ * The uplo/transpose arguments are converted to char values by the encode*()
+ * helpers, alpha and beta are rebuilt as doublecomplex values from their real
+ * and imaginary parts, and the size_t dimensions are narrowed to int.
+ * A and C are handed over through pointer casts rather than copies, so
+ * DoubleComplex and doublecomplex are treated as layout-compatible here.
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasZsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    DoubleComplex beta,
+    DoubleComplex *C,
+    size_t ldc)
+{
+    char fUplo, fTransA;
+    int fN = (int)N;
+    int fK = (int)K;
+    int fLDA = (int)lda;
+    int fLDC = (int)ldc;
+    /* operands are reinterpreted in place, no conversion buffers */
+    doublecomplex *fA = (doublecomplex*)A;
+    doublecomplex *fC = (doublecomplex*)C;
+    doublecomplex fAlpha, fBeta;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fUplo = encodeUplo(uplo);
+    fTransA = encodeTranspose(transA);
+
+    fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+
+    zsyrk(fUplo, fTransA, fN, fK, &fAlpha, fA, fLDA,
+        &fBeta, fC, fLDC);
+}
+
+/*
+ * Single-precision TRMV wrapper around strmv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers; N and lda are narrowed to int. The matrix and vector
+ * passed on start offa and offx elements into A and X respectively.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasStrmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+    int stride = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    strmv(uploCode, transCode, diagCode, dim, &A[offa], stride, &X[offx], incx);
+}
+
+/*
+ * Double-precision TRMV wrapper around dtrmv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers; N and lda are narrowed to int. The matrix and vector
+ * passed on start offa and offx elements into A and X respectively.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasDtrmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+    int stride = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dtrmv(uploCode, transCode, diagCode, dim, &A[offa], stride, &X[offx], incx);
+}
+
+/*
+ * Single-precision complex TRMV wrapper around ctrmv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers; N and lda are narrowed to int. A and X are cast to
+ * `complex` pointers and then advanced by offa and offx elements, so the
+ * offsets are counted in units of `complex`.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasCtrmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+    int stride = (int)lda;
+    complex *matrix, *vector;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    matrix = ((complex*)A) + offa;
+    vector = ((complex*)X) + offx;
+
+    ctrmv(uploCode, transCode, diagCode, dim, matrix, stride, vector, incx);
+}
+
+/*
+ * Double-precision complex TRMV wrapper around ztrmv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers; N and lda are narrowed to int. A and X are cast to
+ * doublecomplex pointers and then advanced by offa and offx elements, so the
+ * offsets are counted in units of doublecomplex.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasZtrmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+    int stride = (int)lda;
+    doublecomplex *matrix, *vector;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    matrix = ((doublecomplex*)A) + offa;
+    vector = ((doublecomplex*)X) + offx;
+
+    ztrmv(uploCode, transCode, diagCode, dim, matrix, stride, vector, incx);
+}
+
+//TPMV
+
+/*
+ * Single-precision TPMV wrapper around stpmv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers and N is narrowed to int. The matrix and vector passed
+ * on start offa and offx elements into AP and X respectively.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasStpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *AP,
+        size_t offa,
+        float *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    stpmv(uploCode, transCode, diagCode, dim, &AP[offa], &X[offx], incx);
+}
+
+/*
+ * Double-precision TPMV wrapper around dtpmv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers and N is narrowed to int. The matrix and vector passed
+ * on start offa and offx elements into AP and X respectively.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasDtpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *AP,
+        size_t offa,
+        double *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dtpmv(uploCode, transCode, diagCode, dim, &AP[offa], &X[offx], incx);
+}
+
+/*
+ * Single-precision complex TPMV wrapper around ctpmv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers and N is narrowed to int. AP and X are cast to `complex`
+ * pointers and then advanced by offa and offx elements, so the offsets are
+ * counted in units of `complex`.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasCtpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *AP,
+        size_t offa,
+        FloatComplex *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+    complex *packed, *vector;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    packed = ((complex*)AP) + offa;
+    vector = ((complex*)X) + offx;
+
+    ctpmv(uploCode, transCode, diagCode, dim, packed, vector, incx);
+}
+
+/*
+ * Double-precision complex TPMV wrapper around ztpmv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers and N is narrowed to int. AP and X are cast to
+ * doublecomplex pointers and then advanced by offa and offx elements, so the
+ * offsets are counted in units of doublecomplex.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasZtpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *AP,
+        size_t offa,
+        DoubleComplex *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+    doublecomplex *packed, *vector;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    packed = ((doublecomplex*)AP) + offa;
+    vector = ((doublecomplex*)X) + offx;
+
+    ztpmv(uploCode, transCode, diagCode, dim, packed, vector, incx);
+}
+
+
+/*
+ * Single-precision TRSV wrapper around strsv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers; N and lda are narrowed to int. The matrix and vector
+ * passed on start offa and offx elements into A and X respectively.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasStrsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+    int stride = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    strsv(uploCode, transCode, diagCode, dim, &A[offa], stride, &X[offx], incx);
+}
+
+/*
+ * Double-precision TRSV wrapper around dtrsv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers; N and lda are narrowed to int. The matrix and vector
+ * passed on start offa and offx elements into A and X respectively.
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasDtrsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int dim = (int)N;
+    int stride = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dtrsv(uploCode, transCode, diagCode, dim, &A[offa], stride, &X[offx], incx);
+}
+
+/*
+ * Single-precision complex TRSV wrapper around ctrsv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers; N and lda are narrowed to int. A and X are cast to
+ * `complex` pointers and then advanced by offa and offx elements, so the
+ * offsets are counted in units of `complex` and the operands are used in
+ * place (FloatComplex and `complex` are treated as layout-compatible).
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasCtrsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx)
+{
+    char fUplo, fDiag, fTrans;
+    int fN, fLda;
+    complex *fA, *fX;
+
+    fUplo = encodeUplo(uplo);
+    fTrans = encodeTranspose(transA);
+    fDiag = encodeDiag(diag);
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fN = (int)N;
+    fLda = (int)lda;
+
+    /* apply the element offsets once, where the pointers are formed */
+    fA = (complex*)A + offa;
+    fX = (complex*)X + offx;
+
+    ctrsv(fUplo, fTrans, fDiag, fN, fA, fLda, fX, incx);
+}
+
+/*
+ * Double-precision complex TRSV wrapper around ztrsv().
+ *
+ * The uplo/transpose/diag arguments are converted to char values by the
+ * encode*() helpers; N and lda are narrowed to int. A and X are cast to
+ * doublecomplex pointers and then advanced by offa and offx elements, so the
+ * offsets are counted in units of doublecomplex and the operands are used in
+ * place (DoubleComplex and doublecomplex are treated as layout-compatible).
+ *
+ * Only clblasColumnMajor is supported: any other order prints a diagnostic
+ * to stderr and aborts the process.
+ */
+void
+blasZtrsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx)
+{
+    char fUplo, fDiag, fTrans;
+    int fN, fLda;
+    doublecomplex *fA, *fX;
+
+    fUplo = encodeUplo(uplo);
+    fTrans = encodeTranspose(transA);
+    fDiag = encodeDiag(diag);
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    fN = (int)N;
+    fLda = (int)lda;
+
+    /* apply the element offsets once, where the pointers are formed */
+    fA = (doublecomplex*)A + offa;
+    fX = (doublecomplex*)X + offx;
+
+    ztrsv(fUplo, fTrans, fDiag, fN, fA, fLda, fX, incx);
+}
+
+/*
+ * Thin wrapper around `stpsv`.
+ *
+ * uplo, transA and diag are translated with encodeUplo, encodeTranspose and
+ * encodeDiag; N is narrowed to int; A and X are advanced by offa and offx
+ * elements. Any order other than clblasColumnMajor prints a diagnostic to
+ * stderr and aborts.
+ */
+void
+blasStpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    float *A,
+    size_t offa,
+    float *X,
+    size_t offx,
+    int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int n;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    n = (int)N;
+    stpsv(uploCode, transCode, diagCode, n, A + offa, X + offx, incx);
+}
+
+/*
+ * Thin wrapper around `dtpsv`.
+ *
+ * uplo, transA and diag are translated with encodeUplo, encodeTranspose and
+ * encodeDiag; N is narrowed to int; A and X are advanced by offa and offx
+ * elements. Any order other than clblasColumnMajor prints a diagnostic to
+ * stderr and aborts.
+ */
+void
+blasDtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    double *A,
+    size_t offa,
+    double *X,
+    size_t offx,
+    int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int n;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    n = (int)N;
+    dtpsv(uploCode, transCode, diagCode, n, A + offa, X + offx, incx);
+}
+
+/*
+ * Thin wrapper around `ctpsv`.
+ *
+ * uplo, transA and diag are translated with encodeUplo, encodeTranspose and
+ * encodeDiag; N is narrowed to int. A and X are reinterpreted as `complex`
+ * buffers and advanced by offa and offx elements. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasCtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    FloatComplex *A,
+    size_t offa,
+    FloatComplex *X,
+    size_t offx,
+    int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int n;
+    complex *packedA;
+    complex *vecX;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    n = (int)N;
+    packedA = (complex*)A;
+    vecX = (complex*)X;
+
+    ctpsv(uploCode, transCode, diagCode, n, packedA + offa, vecX + offx, incx);
+}
+
+/*
+ * Thin wrapper around `ztpsv`.
+ *
+ * uplo, transA and diag are translated with encodeUplo, encodeTranspose and
+ * encodeDiag; N is narrowed to int. A and X are reinterpreted as
+ * `doublecomplex` buffers and advanced by offa and offx elements. Any order
+ * other than clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasZtpsv(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    DoubleComplex *A,
+    size_t offa,
+    DoubleComplex *X,
+    size_t offx,
+    int incx)
+{
+    char uploCode = encodeUplo(uplo);
+    char transCode = encodeTranspose(transA);
+    char diagCode = encodeDiag(diag);
+    int n;
+    doublecomplex *packedA;
+    doublecomplex *vecX;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    n = (int)N;
+    packedA = (doublecomplex*)A;
+    vecX = (doublecomplex*)X;
+
+    ztpsv(uploCode, transCode, diagCode, n, packedA + offa, vecX + offx, incx);
+}
+
+
+/*
+ * Thin wrapper around `ssymm`.
+ *
+ * side and uplo are translated with encodeSide and encodeUplo; M, N and the
+ * leading dimensions are narrowed to int; A, B and C are advanced by offa,
+ * offb and offc elements. Any order other than clblasColumnMajor prints a
+ * diagnostic to stderr and aborts.
+ */
+void
+blasSsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    float alpha,
+    float* A,
+    size_t offa,
+    size_t lda,
+    float* B,
+    size_t offb,
+    size_t ldb,
+    float beta,
+    float* C,
+    size_t offc,
+    size_t ldc)
+{
+    char sideCode = encodeSide(side);
+    char uploCode = encodeUplo(uplo);
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    int ldB = (int)ldb;
+    int ldC = (int)ldc;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    ssymm(sideCode, uploCode, rows, cols, alpha,
+          A + offa, ldA, B + offb, ldB, beta, C + offc, ldC);
+}
+
+/*
+ * Thin wrapper around `dsymm`.
+ *
+ * side and uplo are translated with encodeSide and encodeUplo; M, N and the
+ * leading dimensions are narrowed to int; A, B and C are advanced by offa,
+ * offb and offc elements. Any order other than clblasColumnMajor prints a
+ * diagnostic to stderr and aborts.
+ */
+void
+blasDsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    double alpha,
+    double* A,
+    size_t offa,
+    size_t lda,
+    double* B,
+    size_t offb,
+    size_t ldb,
+    double beta,
+    double* C,
+    size_t offc,
+    size_t ldc)
+{
+    char sideCode = encodeSide(side);
+    char uploCode = encodeUplo(uplo);
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    int ldB = (int)ldb;
+    int ldC = (int)ldc;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dsymm(sideCode, uploCode, rows, cols, alpha,
+          A + offa, ldA, B + offb, ldB, beta, C + offc, ldC);
+}
+
+/*
+ * Thin wrapper around `csymm`.
+ *
+ * side and uplo are translated with encodeSide and encodeUplo; M, N and the
+ * leading dimensions are narrowed to int. A, B and C are reinterpreted as
+ * `complex` buffers and advanced by offa, offb and offc elements; alpha and
+ * beta are rebuilt with compose_complex and passed by address. Any order
+ * other than clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasCsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex* A,
+    size_t offa,
+    size_t lda,
+    FloatComplex* B,
+    size_t offb,
+    size_t ldb,
+    FloatComplex beta,
+    FloatComplex* C,
+    size_t offc,
+    size_t ldc)
+{
+    char sideCode = encodeSide(side);
+    char uploCode = encodeUplo(uplo);
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    int ldB = (int)ldb;
+    int ldC = (int)ldc;
+    complex *matA = (complex*)A;
+    complex *matB = (complex*)B;
+    complex *matC = (complex*)C;
+    complex alphaF = compose_complex(CREAL(alpha), CIMAG(alpha));
+    complex betaF = compose_complex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    csymm(sideCode, uploCode, rows, cols, &alphaF,
+          matA + offa, ldA, matB + offb, ldB, &betaF, matC + offc, ldC);
+}
+
+/*
+ * Thin wrapper around `zsymm`.
+ *
+ * side and uplo are translated with encodeSide and encodeUplo; M, N and the
+ * leading dimensions are narrowed to int. A, B and C are reinterpreted as
+ * `doublecomplex` buffers and advanced by offa, offb and offc elements;
+ * alpha and beta are rebuilt with compose_doublecomplex and passed by
+ * address. Any order other than clblasColumnMajor prints a diagnostic to
+ * stderr and aborts.
+ */
+void
+blasZsymm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex* A,
+    size_t offa,
+    size_t lda,
+    DoubleComplex* B,
+    size_t offb,
+    size_t ldb,
+    DoubleComplex beta,
+    DoubleComplex* C,
+    size_t offc,
+    size_t ldc)
+{
+    char sideCode = encodeSide(side);
+    char uploCode = encodeUplo(uplo);
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    int ldB = (int)ldb;
+    int ldC = (int)ldc;
+    doublecomplex *matA = (doublecomplex*)A;
+    doublecomplex *matB = (doublecomplex*)B;
+    doublecomplex *matC = (doublecomplex*)C;
+    doublecomplex alphaF = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    doublecomplex betaF = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zsymm(sideCode, uploCode, rows, cols, &alphaF,
+          matA + offa, ldA, matB + offb, ldB, &betaF, matC + offc, ldC);
+}
+
+
+/*
+ * Thin wrapper around `sger`.
+ *
+ * M, N and lda are narrowed to int; x, y and A are advanced by offx, offy
+ * and offa elements. Any order other than clblasColumnMajor prints a
+ * diagnostic to stderr and aborts.
+ */
+void
+blasSger(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    float alpha,
+    float* x,
+    size_t offx,
+    int incx,
+    float* y,
+    size_t offy,
+    int incy,
+    float* A,
+    size_t offa,
+    size_t lda)
+{
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    sger(rows, cols, alpha, x + offx, incx, y + offy, incy, A + offa, ldA);
+}
+
+
+/*
+ * Thin wrapper around `dger`.
+ *
+ * M, N and lda are narrowed to int; x, y and A are advanced by offx, offy
+ * and offa elements. Any order other than clblasColumnMajor prints a
+ * diagnostic to stderr and aborts.
+ */
+void
+blasDger(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    double alpha,
+    double* x,
+    size_t offx,
+    int incx,
+    double* y,
+    size_t offy,
+    int incy,
+    double* A,
+    size_t offa,
+    size_t lda)
+{
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dger(rows, cols, alpha, x + offx, incx, y + offy, incy, A + offa, ldA);
+}
+
+
+/*
+ * Thin wrapper around `cgeru`.
+ *
+ * M, N and lda are narrowed to int. x, y and A are reinterpreted as
+ * `complex` buffers and advanced by offx, offy and offa elements; alpha is
+ * rebuilt with compose_complex and passed by address. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasCgeru(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex* x,
+    size_t offx,
+    int incx,
+    FloatComplex* y,
+    size_t offy,
+    int incy,
+    FloatComplex* A,
+    size_t offa,
+    size_t lda)
+{
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    complex *matA = (complex*)A;
+    complex *vecX = (complex*)x;
+    complex *vecY = (complex*)y;
+    complex alphaF = compose_complex(CREAL(alpha), CIMAG(alpha));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    cgeru(rows, cols, &alphaF, vecX + offx, incx, vecY + offy, incy,
+          matA + offa, ldA);
+}
+
+
+/*
+ * Thin wrapper around `zgeru`.
+ *
+ * M, N and lda are narrowed to int. x, y and A are reinterpreted as
+ * `doublecomplex` buffers and advanced by offx, offy and offa elements;
+ * alpha is rebuilt with compose_doublecomplex and passed by address. Any
+ * order other than clblasColumnMajor prints a diagnostic to stderr and
+ * aborts.
+ */
+void
+blasZgeru(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex* x,
+    size_t offx,
+    int incx,
+    DoubleComplex* y,
+    size_t offy,
+    int incy,
+    DoubleComplex* A,
+    size_t offa,
+    size_t lda)
+{
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    doublecomplex *matA = (doublecomplex*)A;
+    doublecomplex *vecX = (doublecomplex*)x;
+    doublecomplex *vecY = (doublecomplex*)y;
+    doublecomplex alphaF = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zgeru(rows, cols, &alphaF, vecX + offx, incx, vecY + offy, incy,
+          matA + offa, ldA);
+}
+
+
+/*
+ * Thin wrapper around `cgerc`.
+ *
+ * M, N and lda are narrowed to int. x, y and A are reinterpreted as
+ * `complex` buffers and advanced by offx, offy and offa elements; alpha is
+ * rebuilt with compose_complex and passed by address. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasCgerc(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex* x,
+    size_t offx,
+    int incx,
+    FloatComplex* y,
+    size_t offy,
+    int incy,
+    FloatComplex* A,
+    size_t offa,
+    size_t lda)
+{
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    complex *matA = (complex*)A;
+    complex *vecX = (complex*)x;
+    complex *vecY = (complex*)y;
+    complex alphaF = compose_complex(CREAL(alpha), CIMAG(alpha));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    cgerc(rows, cols, &alphaF, vecX + offx, incx, vecY + offy, incy,
+          matA + offa, ldA);
+}
+
+/*
+ * Thin wrapper around `zgerc`.
+ *
+ * M, N and lda are narrowed to int. x, y and A are reinterpreted as
+ * `doublecomplex` buffers and advanced by offx, offy and offa elements;
+ * alpha is rebuilt with compose_doublecomplex and passed by address. Any
+ * order other than clblasColumnMajor prints a diagnostic to stderr and
+ * aborts.
+ */
+void
+blasZgerc(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex* x,
+    size_t offx,
+    int incx,
+    DoubleComplex* y,
+    size_t offy,
+    int incy,
+    DoubleComplex* A,
+    size_t offa,
+    size_t lda)
+{
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    doublecomplex *matA = (doublecomplex*)A;
+    doublecomplex *vecX = (doublecomplex*)x;
+    doublecomplex *vecY = (doublecomplex*)y;
+    doublecomplex alphaF = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zgerc(rows, cols, &alphaF, vecX + offx, incx, vecY + offy, incy,
+          matA + offa, ldA);
+}
+
+/*
+ * Thin wrapper around `ssyr`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int; X and
+ * A are advanced by offx and offa elements. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasSsyr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    float* X,
+    size_t offx,
+    int incx,
+    float* A,
+    size_t offa,
+    size_t lda)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    ssyr(uploCode, n, alpha, X + offx, incx, A + offa, ldA);
+}
+
+
+/*
+ * Thin wrapper around `dsyr`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int; X and
+ * A are advanced by offx and offa elements. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasDsyr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    double* X,
+    size_t offx,
+    int incx,
+    double* A,
+    size_t offa,
+    size_t lda)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dsyr(uploCode, n, alpha, X + offx, incx, A + offa, ldA);
+}
+
+//SPR
+
+/*
+ * Thin wrapper around `sspr`.
+ *
+ * uplo is translated with encodeUplo; N is narrowed to int; X and AP are
+ * advanced by offx and offa elements. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasSspr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    float* X,
+    size_t offx,
+    int incx,
+    float* AP,
+    size_t offa)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    sspr(uploCode, n, alpha, X + offx, incx, AP + offa);
+}
+
+
+/*
+ * Thin wrapper around `dspr`.
+ *
+ * uplo is translated with encodeUplo; N is narrowed to int; X and AP are
+ * advanced by offx and offa elements. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasDspr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    double* X,
+    size_t offx,
+    int incx,
+    double* AP,
+    size_t offa)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dspr(uploCode, n, alpha, X + offx, incx, AP + offa);
+}
+
+
+/*
+ * Thin wrapper around `cher`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int. x and
+ * A are reinterpreted as `complex` buffers and advanced by offx and offa
+ * elements; alpha is a real scalar passed by value. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasCher(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    FloatComplex* x,
+    size_t offx,
+    int incx,
+    FloatComplex* A,
+    size_t offa,
+    size_t lda)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+    complex *matA = (complex*)A;
+    complex *vecX = (complex*)x;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    cher(uploCode, n, alpha, vecX + offx, incx, matA + offa, ldA);
+}
+
+
+/*
+ * Thin wrapper around `zher`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int. x and
+ * A are reinterpreted as `doublecomplex` buffers and advanced by offx and
+ * offa elements; alpha is a real scalar passed by value. Any order other
+ * than clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasZher(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    DoubleComplex* x,
+    size_t offx,
+    int incx,
+    DoubleComplex* A,
+    size_t offa,
+    size_t lda)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+    doublecomplex *matA = (doublecomplex*)A;
+    doublecomplex *vecX = (doublecomplex*)x;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zher(uploCode, n, alpha, vecX + offx, incx, matA + offa, ldA);
+}
+
+/*
+ * Thin wrapper around `ssyr2`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int; X, Y
+ * and A are advanced by offx, offy and offa elements. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasSsyr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    float* X,
+    size_t offx,
+    int incx,
+    float* Y,
+    size_t offy,
+    int incy,
+    float* A,
+    size_t offa,
+    size_t lda)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    ssyr2(uploCode, n, alpha, X + offx, incx, Y + offy, incy, A + offa, ldA);
+}
+
+
+/*
+ * Thin wrapper around `dsyr2`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int; X, Y
+ * and A are advanced by offx, offy and offa elements. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasDsyr2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    double* X,
+    size_t offx,
+    int incx,
+    double* Y,
+    size_t offy,
+    int incy,
+    double* A,
+    size_t offa,
+    size_t lda)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dsyr2(uploCode, n, alpha, X + offx, incx, Y + offy, incy, A + offa, ldA);
+}
+
+//HER2
+/*
+ * Thin wrapper around `cher2`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int. X, Y
+ * and A are reinterpreted as `complex` buffers and advanced by offx, offy
+ * and offa elements; alpha is rebuilt with compose_complex and passed by
+ * address. Any order other than clblasColumnMajor prints a diagnostic to
+ * stderr and aborts.
+ */
+void
+blasCher2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex* X,
+    size_t offx,
+    int incx,
+    FloatComplex* Y,
+    size_t offy,
+    int incy,
+    FloatComplex* A,
+    size_t offa,
+    size_t lda)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+    complex *matA = (complex*)A;
+    complex *vecX = (complex*)X;
+    complex *vecY = (complex*)Y;
+    complex alphaF = compose_complex(CREAL(alpha), CIMAG(alpha));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    cher2(uploCode, n, &alphaF, vecX + offx, incx, vecY + offy, incy,
+          matA + offa, ldA);
+}
+
+/*
+ * Thin wrapper around `zher2`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int. X, Y
+ * and A are reinterpreted as `doublecomplex` buffers and advanced by offx,
+ * offy and offa elements; alpha is rebuilt with compose_doublecomplex and
+ * passed by address. Any order other than clblasColumnMajor prints a
+ * diagnostic to stderr and aborts.
+ */
+void
+blasZher2(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex* X,
+    size_t offx,
+    int incx,
+    DoubleComplex* Y,
+    size_t offy,
+    int incy,
+    DoubleComplex* A,
+    size_t offa,
+    size_t lda)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+    doublecomplex *matA = (doublecomplex*)A;
+    doublecomplex *vecX = (doublecomplex*)X;
+    doublecomplex *vecY = (doublecomplex*)Y;
+    doublecomplex alphaF = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zher2(uploCode, n, &alphaF, vecX + offx, incx, vecY + offy, incy,
+          matA + offa, ldA);
+}
+
+
+
+/*
+ * Thin wrapper around `chemv`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int. A, X
+ * and Y are reinterpreted as `complex` buffers and advanced by offa, offx
+ * and offy elements; alpha and beta are rebuilt with compose_complex and
+ * passed by address. Any order other than clblasColumnMajor prints a
+ * diagnostic to stderr and aborts.
+ */
+void
+blasChemv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex* A,
+    size_t offa,
+    size_t lda,
+    FloatComplex* X,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    FloatComplex* Y,
+    size_t offy,
+    int incy)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+    complex *matA = (complex*)A;
+    complex *vecX = (complex*)X;
+    complex *vecY = (complex*)Y;
+    complex alphaF = compose_complex(CREAL(alpha), CIMAG(alpha));
+    complex betaF = compose_complex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    chemv(uploCode, n, &alphaF, matA + offa, ldA, vecX + offx, incx,
+          &betaF, vecY + offy, incy);
+}
+
+/*
+ * Thin wrapper around `zhemv`.
+ *
+ * uplo is translated with encodeUplo; N and lda are narrowed to int. A, X
+ * and Y are reinterpreted as `doublecomplex` buffers and advanced by offa,
+ * offx and offy elements; alpha and beta are rebuilt with
+ * compose_doublecomplex and passed by address. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasZhemv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex* A,
+    size_t offa,
+    size_t lda,
+    DoubleComplex* X,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    DoubleComplex* Y,
+    size_t offy,
+    int incy)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    int ldA = (int)lda;
+    doublecomplex *matA = (doublecomplex*)A;
+    doublecomplex *vecX = (doublecomplex*)X;
+    doublecomplex *vecY = (doublecomplex*)Y;
+    doublecomplex alphaF = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    doublecomplex betaF = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zhemv(uploCode, n, &alphaF, matA + offa, ldA, vecX + offx, incx,
+          &betaF, vecY + offy, incy);
+}
+
+//HEMM
+/*
+ * Thin wrapper around `chemm`.
+ *
+ * side and uplo are translated with encodeSide and encodeUplo; M, N and the
+ * leading dimensions are narrowed to int. A, B and C are reinterpreted as
+ * `complex` buffers and advanced by offa, offb and offc elements; alpha and
+ * beta are rebuilt with compose_complex and passed by address. Any order
+ * other than clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasChemm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex* A,
+    size_t offa,
+    size_t lda,
+    FloatComplex* B,
+    size_t offb,
+    size_t ldb,
+    FloatComplex beta,
+    FloatComplex* C,
+    size_t offc,
+    size_t ldc)
+{
+    char sideCode = encodeSide(side);
+    char uploCode = encodeUplo(uplo);
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    int ldB = (int)ldb;
+    int ldC = (int)ldc;
+    complex *matA = (complex*)A;
+    complex *matB = (complex*)B;
+    complex *matC = (complex*)C;
+    complex alphaF = compose_complex(CREAL(alpha), CIMAG(alpha));
+    complex betaF = compose_complex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    chemm(sideCode, uploCode, rows, cols, &alphaF,
+          matA + offa, ldA, matB + offb, ldB, &betaF, matC + offc, ldC);
+}
+
+/*
+ * Thin wrapper around `zhemm`.
+ *
+ * side and uplo are translated with encodeSide and encodeUplo; M, N and the
+ * leading dimensions are narrowed to int. A, B and C are reinterpreted as
+ * `doublecomplex` buffers and advanced by offa, offb and offc elements;
+ * alpha and beta are rebuilt with compose_doublecomplex and passed by
+ * address. Any order other than clblasColumnMajor prints a diagnostic to
+ * stderr and aborts.
+ */
+void
+blasZhemm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex* A,
+    size_t offa,
+    size_t lda,
+    DoubleComplex* B,
+    size_t offb,
+    size_t ldb,
+    DoubleComplex beta,
+    DoubleComplex* C,
+    size_t offc,
+    size_t ldc)
+{
+    char sideCode = encodeSide(side);
+    char uploCode = encodeUplo(uplo);
+    int rows = (int)M;
+    int cols = (int)N;
+    int ldA = (int)lda;
+    int ldB = (int)ldb;
+    int ldC = (int)ldc;
+    doublecomplex *matA = (doublecomplex*)A;
+    doublecomplex *matB = (doublecomplex*)B;
+    doublecomplex *matC = (doublecomplex*)C;
+    doublecomplex alphaF = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    doublecomplex betaF = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zhemm(sideCode, uploCode, rows, cols, &alphaF,
+          matA + offa, ldA, matB + offb, ldB, &betaF, matC + offc, ldC);
+}
+
+
+/*
+ * Thin wrapper around `cherk`.
+ *
+ * The column-major check runs first: any other order prints a diagnostic
+ * to stderr and aborts. uplo and transA are then translated with encodeUplo
+ * and encodeTranspose; N, K, lda and ldc are narrowed to int. A and C are
+ * reinterpreted as `complex` buffers (the cast drops the const qualifier on
+ * A) and passed without any offset; alpha and beta are real scalars.
+ */
+void
+blasCherk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,
+    const FloatComplex *A,
+    size_t lda,
+    float beta,
+    FloatComplex *C,
+    size_t ldc)
+{
+    int n = (int)N;
+    int k = (int)K;
+    int ldA = (int)lda;
+    int ldC = (int)ldc;
+    complex *matA = (complex*)A;
+    complex *matC = (complex*)C;
+    char uploCode, transCode;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+
+    cherk(uploCode, transCode, n, k, alpha, matA, ldA, beta, matC, ldC);
+}
+
+/*
+ * Thin wrapper around `zherk`.
+ *
+ * The column-major check runs first: any other order prints a diagnostic
+ * to stderr and aborts. uplo and transA are then translated with encodeUplo
+ * and encodeTranspose; N, K, lda and ldc are narrowed to int. A and C are
+ * reinterpreted as `doublecomplex` buffers (the cast drops the const
+ * qualifier on A) and passed without any offset; alpha and beta are real
+ * scalars.
+ */
+void
+blasZherk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    double beta,
+    DoubleComplex *C,
+    size_t ldc)
+{
+    int n = (int)N;
+    int k = (int)K;
+    int ldA = (int)lda;
+    int ldC = (int)ldc;
+    doublecomplex *matA = (doublecomplex*)A;
+    doublecomplex *matC = (doublecomplex*)C;
+    char uploCode, transCode;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+    transCode = encodeTranspose(transA);
+
+    zherk(uploCode, transCode, n, k, alpha, matA, ldA, beta, matC, ldC);
+}
+
+
+/*
+ * Thin wrapper around `sspmv`.
+ *
+ * The column-major check runs first: any other order prints a diagnostic
+ * to stderr and aborts. uplo is then translated with encodeUplo and N is
+ * narrowed to int. A, X and Y are advanced by offa, offx and offy elements;
+ * the const qualifier on A and X is cast away for the call.
+ */
+void
+blasSspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t offa,
+    const float *X,
+    size_t offx,
+    int incx,
+    float beta,
+    float *Y,
+    size_t offy,
+    int incy)
+{
+    int n = (int)N;
+    char uploCode;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+
+    sspmv(uploCode, n, alpha, (float*)(A + offa), (float*)(X + offx), incx,
+          beta, Y + offy, incy);
+}
+
+/*
+ * Thin wrapper around `dspmv`.
+ *
+ * The column-major check runs first: any other order prints a diagnostic
+ * to stderr and aborts. uplo is then translated with encodeUplo and N is
+ * narrowed to int. A, X and Y are advanced by offa, offx and offy elements;
+ * the const qualifier on A and X is cast away for the call.
+ */
+void
+blasDspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t offa,
+    const double *X,
+    size_t offx,
+    int incx,
+    double beta,
+    double *Y,
+    size_t offy,
+    int incy)
+{
+    int n = (int)N;
+    char uploCode;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploCode = encodeUplo(uplo);
+
+    dspmv(uploCode, n, alpha, (double*)(A + offa), (double*)(X + offx), incx,
+          beta, Y + offy, incy);
+}
+
+/*
+ * Thin wrapper around `chpmv`.
+ *
+ * uplo is translated with encodeUplo; N is narrowed to int. A, X and Y are
+ * reinterpreted as `complex` buffers and advanced by offa, offx and offy
+ * elements; alpha and beta are rebuilt with compose_complex and passed by
+ * address. Any order other than clblasColumnMajor prints a diagnostic to
+ * stderr and aborts.
+ */
+void
+blasChpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    FloatComplex* A,
+    size_t offa,
+    FloatComplex* X,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+    FloatComplex* Y,
+    size_t offy,
+    int incy)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    complex *packedA = (complex*)A;
+    complex *vecX = (complex*)X;
+    complex *vecY = (complex*)Y;
+    complex alphaF = compose_complex(CREAL(alpha), CIMAG(alpha));
+    complex betaF = compose_complex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    chpmv(uploCode, n, &alphaF, packedA + offa, vecX + offx, incx,
+          &betaF, vecY + offy, incy);
+}
+
+/*
+ * Thin wrapper around `zhpmv`.
+ *
+ * uplo is translated with encodeUplo; N is narrowed to int. A, X and Y are
+ * reinterpreted as `doublecomplex` buffers and advanced by offa, offx and
+ * offy elements; alpha and beta are rebuilt with compose_doublecomplex and
+ * passed by address. Any order other than clblasColumnMajor prints a
+ * diagnostic to stderr and aborts.
+ */
+void
+blasZhpmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    DoubleComplex* A,
+    size_t offa,
+    DoubleComplex* X,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    DoubleComplex* Y,
+    size_t offy,
+    int incy)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    doublecomplex *packedA = (doublecomplex*)A;
+    doublecomplex *vecX = (doublecomplex*)X;
+    doublecomplex *vecY = (doublecomplex*)Y;
+    doublecomplex alphaF = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    doublecomplex betaF = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zhpmv(uploCode, n, &alphaF, packedA + offa, vecX + offx, incx,
+          &betaF, vecY + offy, incy);
+}
+
+/*
+ * Thin wrapper around `chpr`.
+ *
+ * uplo is translated with encodeUplo; N is narrowed to int. x and A are
+ * reinterpreted as `complex` buffers and advanced by offx and offa
+ * elements; alpha is a real scalar passed by value. Any order other than
+ * clblasColumnMajor prints a diagnostic to stderr and aborts.
+ */
+void
+blasChpr(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    FloatComplex* x,
+    size_t offx,
+    int incx,
+    FloatComplex* A,
+    size_t offa)
+{
+    char uploCode = encodeUplo(uplo);
+    int n = (int)N;
+    complex *packedA = (complex*)A;
+    complex *vecX = (complex*)x;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    chpr(uploCode, n, alpha, vecX + offx, incx, packedA + offa);
+}
+
+
+// Wrapper around the external `zhpr` routine.
+// N is narrowed to int; x and A are reinterpreted as `doublecomplex` buffers
+// and advanced by offx / offa elements. The real scalar alpha is passed through.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasZhpr(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    double alpha,
+    DoubleComplex* x, size_t offx, int incx,
+    DoubleComplex* A, size_t offa)
+{
+    char uploFlag = encodeUplo( uplo );
+    int nInt = (int) N;
+    doublecomplex *apBuf = ((doublecomplex*) A) + offa;
+    doublecomplex *xBuf = ((doublecomplex*) x) + offx;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zhpr(uploFlag, nInt, alpha, xBuf, incx, apBuf);
+}
+
+// Wrapper around the external `sspr2` routine.
+// N is narrowed to int; X, Y and A are advanced by offx / offy / offa
+// elements; alpha, incx and incy are forwarded unchanged.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasSspr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    float alpha,
+    float* X, size_t offx, int incx,
+    float* Y, size_t offy, int incy,
+    float* A, size_t offa)
+{
+    char uploFlag = encodeUplo( uplo );
+    int nInt = (int) N;
+    float *xBuf = X + offx;
+    float *yBuf = Y + offy;
+    float *apBuf = A + offa;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    sspr2(uploFlag, nInt, alpha, xBuf, incx, yBuf, incy, apBuf);
+}
+
+
+// Wrapper around the external `dspr2` routine.
+// N is narrowed to int; X, Y and A are advanced by offx / offy / offa
+// elements; alpha, incx and incy are forwarded unchanged.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasDspr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    double alpha,
+    double* X, size_t offx, int incx,
+    double* Y, size_t offy, int incy,
+    double* A, size_t offa)
+{
+    char uploFlag = encodeUplo( uplo );
+    int nInt = (int) N;
+    double *xBuf = X + offx;
+    double *yBuf = Y + offy;
+    double *apBuf = A + offa;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dspr2(uploFlag, nInt, alpha, xBuf, incx, yBuf, incy, apBuf);
+}
+
+// Wrapper around the external `chpr2` routine.
+// N is narrowed to int; X, Y and A are reinterpreted as `complex` buffers and
+// advanced by offx / offy / offa elements; alpha is rebuilt with
+// compose_complex and passed by address.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasChpr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    FloatComplex alpha,
+    FloatComplex* X, size_t offx, int incx,
+    FloatComplex* Y, size_t offy, int incy,
+    FloatComplex* A, size_t offa)
+{
+    char uploFlag = encodeUplo( uplo );
+    int nInt = (int) N;
+    complex *apBuf = ((complex*) A) + offa;
+    complex *xBuf = ((complex*) X) + offx;
+    complex *yBuf = ((complex*) Y) + offy;
+    complex alphaVal = compose_complex(CREAL(alpha), CIMAG(alpha));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    chpr2(uploFlag, nInt, &alphaVal, xBuf, incx, yBuf, incy, apBuf);
+}
+
+// Wrapper around the external `zhpr2` routine.
+// N is narrowed to int; X, Y and A are reinterpreted as `doublecomplex`
+// buffers and advanced by offx / offy / offa elements; alpha is rebuilt with
+// compose_doublecomplex and passed by address.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasZhpr2(
+    clblasOrder order, clblasUplo uplo, size_t N,
+    DoubleComplex alpha,
+    DoubleComplex* X, size_t offx, int incx,
+    DoubleComplex* Y, size_t offy, int incy,
+    DoubleComplex* A, size_t offa)
+{
+    char uploFlag = encodeUplo( uplo );
+    int nInt = (int) N;
+    doublecomplex *apBuf = ((doublecomplex*) A) + offa;
+    doublecomplex *xBuf = ((doublecomplex*) X) + offx;
+    doublecomplex *yBuf = ((doublecomplex*) Y) + offy;
+    doublecomplex alphaVal = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zhpr2(uploFlag, nInt, &alphaVal, xBuf, incx, yBuf, incy, apBuf);
+}
+
+// Wrapper around the external `sgbmv` routine.
+// M, N, KL, KU and lda are narrowed to int; A, X and Y are advanced by
+// offa / offx / offy elements; alpha, beta, incx and incy pass through.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasSgbmv(
+    clblasOrder order, clblasTranspose trans,
+    size_t M, size_t N, size_t KL, size_t KU,
+    float alpha,
+    float *A, size_t offa, size_t lda,
+    float *X, size_t offx, int incx,
+    float beta,
+    float *Y, size_t offy, int incy)
+{
+    char transFlag = encodeTranspose(trans);
+    int nInt = (int) N;
+    int mInt = (int) M;
+    int klInt = (int) KL;
+    int kuInt = (int) KU;
+    int ldaInt = (int) lda;
+    float *aBuf = A + offa;
+    float *xBuf = X + offx;
+    float *yBuf = Y + offy;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    sgbmv(transFlag, mInt, nInt, klInt, kuInt, alpha, aBuf, ldaInt,
+          xBuf, incx, beta, yBuf, incy);
+}
+
+// Wrapper around the external `dgbmv` routine.
+// M, N, KL, KU and lda are narrowed to int; A, X and Y are advanced by
+// offa / offx / offy elements; alpha, beta, incx and incy pass through.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasDgbmv(
+    clblasOrder order, clblasTranspose trans,
+    size_t M, size_t N, size_t KL, size_t KU,
+    double alpha,
+    double *A, size_t offa, size_t lda,
+    double *X, size_t offx, int incx,
+    double beta,
+    double *Y, size_t offy, int incy)
+{
+    char transFlag = encodeTranspose(trans);
+    int nInt = (int) N;
+    int mInt = (int) M;
+    int klInt = (int) KL;
+    int kuInt = (int) KU;
+    int ldaInt = (int) lda;
+    double *aBuf = A + offa;
+    double *xBuf = X + offx;
+    double *yBuf = Y + offy;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dgbmv(transFlag, mInt, nInt, klInt, kuInt, alpha, aBuf, ldaInt,
+          xBuf, incx, beta, yBuf, incy);
+}
+
+// Wrapper around the external `cgbmv` routine.
+// M, N, KL, KU and lda are narrowed to int; A, X and Y are offset by
+// offa / offx / offy elements and then reinterpreted as `complex`; alpha and
+// beta are rebuilt with compose_complex and passed by address.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasCgbmv(
+    clblasOrder order, clblasTranspose trans,
+    size_t M, size_t N, size_t KL, size_t KU,
+    FloatComplex alpha,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *X, size_t offx, int incx,
+    FloatComplex beta,
+    FloatComplex *Y, size_t offy, int incy)
+{
+    char transFlag = encodeTranspose(trans);
+    int nInt = (int) N;
+    int mInt = (int) M;
+    int klInt = (int) KL;
+    int kuInt = (int) KU;
+    int ldaInt = (int) lda;
+    complex *aBuf = (complex*) (A + offa);
+    complex *xBuf = (complex*) (X + offx);
+    complex *yBuf = (complex*) (Y + offy);
+    complex alphaVal = compose_complex(CREAL(alpha), CIMAG(alpha));
+    complex betaVal = compose_complex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    cgbmv(transFlag, mInt, nInt, klInt, kuInt, &alphaVal, aBuf, ldaInt,
+          xBuf, incx, &betaVal, yBuf, incy);
+}
+
+// Wrapper around the external `zgbmv` routine.
+// M, N, KL, KU and lda are narrowed to int; A, X and Y are offset by
+// offa / offx / offy elements and then reinterpreted as `doublecomplex`;
+// alpha and beta are rebuilt with compose_doublecomplex and passed by address.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasZgbmv(
+    clblasOrder order, clblasTranspose trans,
+    size_t M, size_t N, size_t KL, size_t KU,
+    DoubleComplex alpha,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *X, size_t offx, int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y, size_t offy, int incy)
+{
+    char transFlag = encodeTranspose(trans);
+    int nInt = (int) N;
+    int mInt = (int) M;
+    int klInt = (int) KL;
+    int kuInt = (int) KU;
+    int ldaInt = (int) lda;
+    doublecomplex *aBuf = (doublecomplex*) (A + offa);
+    doublecomplex *xBuf = (doublecomplex*) (X + offx);
+    doublecomplex *yBuf = (doublecomplex*) (Y + offy);
+    doublecomplex alphaVal = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    doublecomplex betaVal = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zgbmv(transFlag, mInt, nInt, klInt, kuInt, &alphaVal, aBuf, ldaInt,
+          xBuf, incx, &betaVal, yBuf, incy);
+}
+
+
+//TBMV
+
+// Wrapper around the external `stbmv` routine.
+// uplo/trans/diag are translated to their character codes; N, K and lda are
+// narrowed to int; A and X are advanced by offa / offx elements.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasStbmv(
+    clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    float *A, size_t offa, size_t lda,
+    float *X, size_t offx, int incx)
+{
+    char transFlag = encodeTranspose(trans);
+    char uploFlag = encodeUplo(uplo);
+    char diagFlag = encodeDiag(diag);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    float *aBuf = A + offa;
+    float *xBuf = X + offx;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    stbmv(uploFlag, transFlag, diagFlag, nInt, kInt, aBuf, ldaInt, xBuf, incx);
+}
+
+// Wrapper around the external `dtbmv` routine.
+// uplo/trans/diag are translated to their character codes; N, K and lda are
+// narrowed to int; A and X are advanced by offa / offx elements.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasDtbmv(
+    clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    double *A, size_t offa, size_t lda,
+    double *X, size_t offx, int incx)
+{
+    char transFlag = encodeTranspose(trans);
+    char uploFlag = encodeUplo(uplo);
+    char diagFlag = encodeDiag(diag);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    double *aBuf = A + offa;
+    double *xBuf = X + offx;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dtbmv(uploFlag, transFlag, diagFlag, nInt, kInt, aBuf, ldaInt, xBuf, incx);
+}
+// Wrapper around the external `ctbmv` routine.
+// uplo/diag/trans are translated to their character codes; N, K and lda are
+// narrowed to int; A and X are offset by offa / offx elements and then
+// reinterpreted as `complex`.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasCtbmv(
+    clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *X, size_t offx, int incx)
+{
+    char uploFlag = encodeUplo(uplo);
+    char diagFlag = encodeDiag(diag);
+    char transFlag = encodeTranspose(trans);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    complex *aBuf = (complex*) (A + offa);
+    complex *xBuf = (complex*) (X + offx);
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    ctbmv(uploFlag, transFlag, diagFlag, nInt, kInt, aBuf, ldaInt, xBuf, incx);
+}
+
+// Wrapper around the external `ztbmv` routine.
+// uplo/diag/trans are translated to their character codes; N, K and lda are
+// narrowed to int; A and X are offset by offa / offx elements and then
+// reinterpreted as `doublecomplex`.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasZtbmv(
+    clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *X, size_t offx, int incx)
+{
+    char uploFlag = encodeUplo(uplo);
+    char diagFlag = encodeDiag(diag);
+    char transFlag = encodeTranspose(trans);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    doublecomplex *aBuf = (doublecomplex*) (A + offa);
+    doublecomplex *xBuf = (doublecomplex*) (X + offx);
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    ztbmv(uploFlag, transFlag, diagFlag, nInt, kInt, aBuf, ldaInt, xBuf, incx);
+}
+
+
+//SBMV
+
+// Wrapper around the external `ssbmv` routine.
+// N, K and lda are narrowed to int; A, X and Y are advanced by
+// offa / offx / offy elements; alpha, beta, incx and incy pass through.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasSsbmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N, size_t K,
+    float alpha,
+    float *A, size_t offa, size_t lda,
+    float *X, size_t offx, int incx,
+    float beta,
+    float *Y, size_t offy, int incy)
+{
+    char uploFlag = encodeUplo(uplo);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    float *aBuf = A + offa;
+    float *xBuf = X + offx;
+    float *yBuf = Y + offy;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    ssbmv(uploFlag, nInt, kInt, alpha, aBuf, ldaInt, xBuf, incx, beta, yBuf, incy);
+}
+
+// Wrapper around the external `dsbmv` routine.
+// N, K and lda are narrowed to int; A, X and Y are advanced by
+// offa / offx / offy elements; alpha, beta, incx and incy pass through.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasDsbmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N, size_t K,
+    double alpha,
+    double *A, size_t offa, size_t lda,
+    double *X, size_t offx, int incx,
+    double beta,
+    double *Y, size_t offy, int incy)
+{
+    char uploFlag = encodeUplo(uplo);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    double *aBuf = A + offa;
+    double *xBuf = X + offx;
+    double *yBuf = Y + offy;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dsbmv(uploFlag, nInt, kInt, alpha, aBuf, ldaInt, xBuf, incx, beta, yBuf, incy);
+}
+
+//HBMV
+
+// Wrapper around the external `chbmv` routine.
+// N, K and lda are narrowed to int; A, X and Y are offset by
+// offa / offx / offy elements and then reinterpreted as `complex`; alpha and
+// beta are rebuilt with compose_complex and passed by address.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasChbmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *X, size_t offx, int incx,
+    FloatComplex beta,
+    FloatComplex *Y, size_t offy, int incy)
+{
+    char uploFlag = encodeUplo(uplo);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    complex *aBuf = (complex*) (A + offa);
+    complex *xBuf = (complex*) (X + offx);
+    complex *yBuf = (complex*) (Y + offy);
+    complex alphaVal = compose_complex(CREAL(alpha), CIMAG(alpha));
+    complex betaVal = compose_complex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    chbmv(uploFlag, nInt, kInt, &alphaVal, aBuf, ldaInt, xBuf, incx,
+          &betaVal, yBuf, incy);
+}
+
+// Wrapper around the external `zhbmv` routine.
+// N, K and lda are narrowed to int; A, X and Y are offset by
+// offa / offx / offy elements and then reinterpreted as `doublecomplex`;
+// alpha and beta are rebuilt with compose_doublecomplex and passed by address.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasZhbmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *X, size_t offx, int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y, size_t offy, int incy)
+{
+    char uploFlag = encodeUplo(uplo);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    doublecomplex *aBuf = (doublecomplex*) (A + offa);
+    doublecomplex *xBuf = (doublecomplex*) (X + offx);
+    doublecomplex *yBuf = (doublecomplex*) (Y + offy);
+    doublecomplex alphaVal = compose_doublecomplex(CREAL(alpha), CIMAG(alpha));
+    doublecomplex betaVal = compose_doublecomplex(CREAL(beta), CIMAG(beta));
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    zhbmv(uploFlag, nInt, kInt, &alphaVal, aBuf, ldaInt, xBuf, incx,
+          &betaVal, yBuf, incy);
+}
+
+
+//TBSV
+
+// Wrapper around the external `stbsv` routine.
+// uplo/trans/diag are translated to their character codes; N, K and lda are
+// narrowed to int; A and X are advanced by offa / offx elements.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasStbsv(
+    clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    float *A, size_t offa, size_t lda,
+    float *X, size_t offx, int incx)
+{
+    char transFlag = encodeTranspose(trans);
+    char uploFlag = encodeUplo(uplo);
+    char diagFlag = encodeDiag(diag);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    float *aBuf = A + offa;
+    float *xBuf = X + offx;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    stbsv(uploFlag, transFlag, diagFlag, nInt, kInt, aBuf, ldaInt, xBuf, incx);
+}
+
+// Wrapper around the external `dtbsv` routine.
+// uplo/trans/diag are translated to their character codes; N, K and lda are
+// narrowed to int; A and X are advanced by offa / offx elements.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasDtbsv(
+    clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    double *A, size_t offa, size_t lda,
+    double *X, size_t offx, int incx)
+{
+    char transFlag = encodeTranspose(trans);
+    char uploFlag = encodeUplo(uplo);
+    char diagFlag = encodeDiag(diag);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    double *aBuf = A + offa;
+    double *xBuf = X + offx;
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    dtbsv(uploFlag, transFlag, diagFlag, nInt, kInt, aBuf, ldaInt, xBuf, incx);
+}
+
+
+// Wrapper around the external `ctbsv` routine.
+// uplo/diag/trans are translated to their character codes; N, K and lda are
+// narrowed to int; A and X are offset by offa / offx elements and then
+// reinterpreted as `complex`.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasCtbsv(
+    clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    FloatComplex *A, size_t offa, size_t lda,
+    FloatComplex *X, size_t offx, int incx)
+{
+    char uploFlag = encodeUplo(uplo);
+    char diagFlag = encodeDiag(diag);
+    char transFlag = encodeTranspose(trans);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    complex *aBuf = (complex*) (A + offa);
+    complex *xBuf = (complex*) (X + offx);
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    ctbsv(uploFlag, transFlag, diagFlag, nInt, kInt, aBuf, ldaInt, xBuf, incx);
+}
+
+// Wrapper around the external `ztbsv` routine.
+// uplo/diag/trans are translated to their character codes; N, K and lda are
+// narrowed to int; A and X are offset by offa / offx elements and then
+// reinterpreted as `doublecomplex`.
+// Prints a diagnostic and aborts unless order is clblasColumnMajor.
+void
+blasZtbsv(
+    clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    DoubleComplex *A, size_t offa, size_t lda,
+    DoubleComplex *X, size_t offx, int incx)
+{
+    char uploFlag = encodeUplo(uplo);
+    char diagFlag = encodeDiag(diag);
+    char transFlag = encodeTranspose(trans);
+    int nInt = (int) N;
+    int kInt = (int) K;
+    int ldaInt = (int) lda;
+    doublecomplex *aBuf = (doublecomplex*) (A + offa);
+    doublecomplex *xBuf = (doublecomplex*) (X + offx);
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    ztbsv(uploFlag, transFlag, diagFlag, nInt, kInt, aBuf, ldaInt, xBuf, incx);
+}
+
+// Wrapper around the external `cher2k` routine.
+// The order check runs first: anything other than clblasColumnMajor prints a
+// diagnostic and aborts. Otherwise N, K, lda, ldb and ldc are narrowed to int,
+// A, B and C are offset by offa / offb / offc elements and reinterpreted as
+// `complex` (the casts drop the const qualifier from A and B), and the address
+// of the by-value alpha is passed reinterpreted as `complex*`. beta is real
+// and passed through unchanged.
+void
+blasCher2k(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A, size_t offa, size_t lda,
+    const FloatComplex *B, size_t offb, size_t ldb,
+    float beta,
+    FloatComplex *C, size_t offc, size_t ldc)
+{
+    char uploFlag;
+    char transFlag;
+    int nInt = (int)N;
+    int kInt = (int)K;
+    int ldaInt = (int)lda;
+    int ldbInt = (int)ldb;
+    int ldcInt = (int)ldc;
+    complex *aBuf = (complex*)(A+offa);
+    complex *bBuf = (complex*)(B+offb);
+    complex *cBuf = (complex*)(C+offc);
+    complex *alphaPtr = (complex*)(&alpha);
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploFlag = encodeUplo(uplo);
+    transFlag = encodeTranspose(transA);
+
+    cher2k(uploFlag, transFlag, nInt, kInt, alphaPtr, aBuf, ldaInt,
+           bBuf, ldbInt, beta, cBuf, ldcInt);
+}
+
+// Wrapper around the external `zher2k` routine.
+// The order check runs first: anything other than clblasColumnMajor prints a
+// diagnostic and aborts. Otherwise N, K, lda, ldb and ldc are narrowed to int,
+// A, B and C are offset by offa / offb / offc elements and reinterpreted as
+// `doublecomplex` (the casts drop the const qualifier from A and B), and the
+// address of the by-value alpha is passed reinterpreted as `doublecomplex*`.
+// beta is real and passed through unchanged.
+void
+blasZher2k(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A, size_t offa, size_t lda,
+    const DoubleComplex *B, size_t offb, size_t ldb,
+    double beta,
+    DoubleComplex *C, size_t offc, size_t ldc)
+{
+    char uploFlag;
+    char transFlag;
+    int nInt = (int)N;
+    int kInt = (int)K;
+    int ldaInt = (int)lda;
+    int ldbInt = (int)ldb;
+    int ldcInt = (int)ldc;
+    doublecomplex *aBuf = (doublecomplex*)(A+offa);
+    doublecomplex *bBuf = (doublecomplex*)(B+offb);
+    doublecomplex *cBuf = (doublecomplex*)(C+offc);
+    doublecomplex *alphaPtr = (doublecomplex*)(&alpha);
+
+    if (order != clblasColumnMajor) {
+        fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n");
+        abort();
+    }
+
+    uploFlag = encodeUplo(uplo);
+    transFlag = encodeTranspose(transA);
+
+    zher2k(uploFlag, transFlag, nInt, kInt, alphaPtr, aBuf, ldaInt,
+           bBuf, ldbInt, beta, cBuf, ldcInt);
+}
+
+
+//COPY
+
+
+// Wrapper around the external `scopy` routine: narrows N to int and forwards
+// X and Y advanced by offx / offy elements, together with incx and incy.
+void
+blasScopy(size_t N, float *X, size_t offx, int incx,
+          float *Y, size_t offy, int incy)
+{
+    int nInt = (int) N;
+    float *src = X + offx;
+    float *dst = Y + offy;
+
+    scopy(nInt, src, incx, dst, incy);
+}
+
+
+
+// Wrapper around the external `dcopy` routine: narrows N to int and forwards
+// X and Y advanced by offx / offy elements, together with incx and incy.
+void
+blasDcopy(size_t N, double *X, size_t offx, int incx,
+          double *Y, size_t offy, int incy)
+{
+    int nInt = (int) N;
+    double *src = X + offx;
+    double *dst = Y + offy;
+
+    dcopy(nInt, src, incx, dst, incy);
+}
+
+
+// Wrapper around the external `ccopy` routine: narrows N to int, offsets X and
+// Y by offx / offy elements and reinterprets them as `complex` buffers.
+void
+blasCcopy(size_t N, FloatComplex *X, size_t offx, int incx,
+          FloatComplex *Y, size_t offy, int incy)
+{
+    int nInt = (int) N;
+    complex *dst = (complex*) (Y + offy);
+    complex *src = (complex*) (X + offx);
+
+    ccopy(nInt, src, incx, dst, incy);
+}
+
+
+// Wrapper around the external `zcopy` routine: narrows N to int, offsets X and
+// Y by offx / offy elements and reinterprets them as `doublecomplex` buffers.
+void
+blasZcopy(size_t N, DoubleComplex *X, size_t offx, int incx,
+          DoubleComplex *Y, size_t offy, int incy)
+{
+    int nInt = (int) N;
+    doublecomplex *dst = (doublecomplex*) (Y + offy);
+    doublecomplex *src = (doublecomplex*) (X + offx);
+
+    zcopy(nInt, src, incx, dst, incy);
+}
+
+
+//SWAP
+
+// Wrapper around the external `sswap` routine: narrows N to int and forwards
+// X and Y advanced by offBX / offCY elements, together with incx and incy.
+void
+blasSswap(size_t N, float *X, size_t offBX, int incx,
+          float *Y, size_t offCY, int incy)
+{
+    int nInt = (int) N;
+    float *first = X + offBX;
+    float *second = Y + offCY;
+
+    sswap(nInt, first, incx, second, incy);
+}
+
+
+
+// Wrapper around the external `dswap` routine: narrows N to int and forwards
+// X and Y advanced by offBX / offCY elements, together with incx and incy.
+void
+blasDswap(size_t N, double *X, size_t offBX, int incx,
+          double *Y, size_t offCY, int incy)
+{
+    int nInt = (int) N;
+    double *first = X + offBX;
+    double *second = Y + offCY;
+
+    dswap(nInt, first, incx, second, incy);
+}
+
+
+// Wrapper around the external `cswap` routine: narrows N to int, offsets X and
+// Y by offBX / offCY elements and reinterprets them as `complex` buffers.
+void
+blasCswap(size_t N, FloatComplex *X, size_t offBX, int incx,
+          FloatComplex *Y, size_t offCY, int incy)
+{
+    int nInt = (int) N;
+    complex *second = (complex*) (Y + offCY);
+    complex *first = (complex*) (X + offBX);
+
+    cswap(nInt, first, incx, second, incy);
+}
+
+
+// Wrapper around the external `zswap` routine: narrows N to int, offsets X and
+// Y by offBX / offCY elements and reinterprets them as `doublecomplex` buffers.
+void
+blasZswap(size_t N, DoubleComplex *X, size_t offBX, int incx,
+          DoubleComplex *Y, size_t offCY, int incy)
+{
+    int nInt = (int) N;
+    doublecomplex *second = (doublecomplex*) (Y + offCY);
+    doublecomplex *first = (doublecomplex*) (X + offBX);
+
+    zswap(nInt, first, incx, second, incy);
+}
+
+
+// Wrapper around the external `sscal` routine: narrows N to int and forwards
+// alpha, X advanced by offx elements, and incx.
+void
+blasSscal(size_t N, float alpha, float *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    float *xBuf = X + offx;
+
+    sscal(nInt, alpha, xBuf, incx);
+}
+
+// Wrapper around the external `dscal` routine: narrows N to int and forwards
+// alpha, X advanced by offx elements, and incx.
+void
+blasDscal(size_t N, double alpha, double *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    double *xBuf = X + offx;
+
+    dscal(nInt, alpha, xBuf, incx);
+}
+
+// Wrapper around the external `cscal` routine: narrows N to int, passes the
+// address of the by-value alpha reinterpreted as `complex*`, and forwards X
+// offset by offx elements and reinterpreted as a `complex` buffer.
+void
+blasCscal(size_t N, FloatComplex alpha, FloatComplex *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    complex *alphaPtr = (complex*)(&alpha);
+    complex *xBuf = (complex*)(X+offx);
+
+    cscal(nInt, alphaPtr, xBuf, incx);
+}
+
+// Wrapper around the external `zscal` routine: narrows N to int, passes the
+// address of the by-value alpha reinterpreted as `doublecomplex*`, and forwards
+// X offset by offx elements and reinterpreted as a `doublecomplex` buffer.
+void
+blasZscal(size_t N, DoubleComplex alpha, DoubleComplex *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    doublecomplex *alphaPtr = (doublecomplex*)(&alpha);
+    doublecomplex *xBuf = (doublecomplex*)(X+offx);
+
+    zscal(nInt, alphaPtr, xBuf, incx);
+}
+
+// Wrapper around the external `csscal` routine: narrows N to int, passes the
+// real alpha through, and forwards X offset by offx elements and reinterpreted
+// as a `complex` buffer.
+void
+blasCsscal(size_t N, float alpha, FloatComplex *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    complex *xBuf = (complex*)(X+offx);
+
+    csscal(nInt, alpha, xBuf, incx);
+}
+
+// Wrapper around the external `zdscal` routine: narrows N to int, passes the
+// real alpha through, and forwards X offset by offx elements and reinterpreted
+// as a `doublecomplex` buffer.
+void
+blasZdscal(size_t N, double alpha, DoubleComplex *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    doublecomplex *xBuf = (doublecomplex*)(X+offx);
+
+    zdscal(nInt, alpha, xBuf, incx);
+}
+
+//DOT
+// Wrapper around the external `sdot` routine: narrows N to int, forwards X and
+// Y advanced by offx / offy elements, and returns sdot's result unchanged.
+float
+blasSdot(size_t N, float *X, size_t offx, int incx,
+         float *Y, size_t offy, int incy)
+{
+    int nInt = (int)N;
+    float *xBuf = X + offx;
+    float *yBuf = Y + offy;
+
+    return sdot(nInt, xBuf, incx, yBuf, incy);
+}
+
+// Wrapper around the external `ddot` routine: narrows N to int, forwards X and
+// Y advanced by offx / offy elements, and returns ddot's result unchanged.
+double
+blasDdot(size_t N, double *X, size_t offx, int incx,
+         double *Y, size_t offy, int incy)
+{
+    int nInt = (int)N;
+    double *xBuf = X + offx;
+    double *yBuf = Y + offy;
+
+    return ddot(nInt, xBuf, incx, yBuf, incy);
+}
+
+// Wrapper around the external `cdotu` routine.
+// Narrows N to int, offsets X and Y by offx / offy elements and reinterprets
+// them as `complex` buffers, then copies the `.real` / `.imag` fields of the
+// returned value into a FloatComplex through the CREAL / CIMAG accessors.
+FloatComplex
+blasCdotu(size_t N, FloatComplex *X, size_t offx, int incx,
+          FloatComplex *Y, size_t offy, int incy)
+{
+    complex *xBuf = (complex*)(X+offx);
+    complex *yBuf = (complex*)(Y+offy);
+    complex dot = cdotu((int)N, xBuf, incx, yBuf, incy);
+    FloatComplex result;
+
+    CREAL(result) = dot.real;
+    CIMAG(result) = dot.imag;
+    return result;
+}
+
+// Wrapper around the external `zdotu` routine.
+// Narrows N to int, offsets X and Y by offx / offy elements and reinterprets
+// them as `doublecomplex` buffers, then copies the `.real` / `.imag` fields of
+// the returned value into a DoubleComplex through the CREAL / CIMAG accessors.
+DoubleComplex
+blasZdotu(size_t N, DoubleComplex *X, size_t offx, int incx,
+          DoubleComplex *Y, size_t offy, int incy)
+{
+    doublecomplex *xBuf = (doublecomplex*)(X+offx);
+    doublecomplex *yBuf = (doublecomplex*)(Y+offy);
+    doublecomplex dot = zdotu((int)N, xBuf, incx, yBuf, incy);
+    DoubleComplex result;
+
+    CREAL(result) = dot.real;
+    CIMAG(result) = dot.imag;
+    return result;
+}
+
+//ASUM
+// Wrapper around the external `sasum` routine: narrows N to int, forwards X
+// advanced by offx elements, and returns sasum's result unchanged.
+float
+blasSasum(size_t N, float *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    float *xBuf = X + offx;
+
+    return sasum(nInt, xBuf, incx);
+}
+
+// Wrapper around the external `dasum` routine: narrows N to int, forwards X
+// advanced by offx elements, and returns dasum's result unchanged.
+double
+blasDasum(size_t N, double *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    double *xBuf = X + offx;
+
+    return dasum(nInt, xBuf, incx);
+}
+
+// Wrapper around the external `scasum` routine: narrows N to int, offsets X by
+// offx elements and reinterprets it as a `complex` buffer, and returns
+// scasum's result unchanged.
+float
+blasScasum(size_t N, FloatComplex *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    complex *xBuf = (complex*)(X+offx);
+
+    return scasum(nInt, xBuf, incx);
+}
+
+// Wrapper around the external `dzasum` routine: narrows N to int, offsets X by
+// offx elements and reinterprets it as a `doublecomplex` buffer, and returns
+// dzasum's result unchanged.
+double
+blasDzasum(size_t N, DoubleComplex *X, size_t offx, int incx)
+{
+    int nInt = (int)N;
+    doublecomplex *xBuf = (doublecomplex*)(X+offx);
+
+    return dzasum(nInt, xBuf, incx);
+}
+
+//DOTC
+// Wrapper around the external `cdotc` routine.
+// Narrows N to int, offsets X and Y by offx / offy elements and reinterprets
+// them as `complex` buffers, then copies the `.real` / `.imag` fields of the
+// returned value into a FloatComplex through the CREAL / CIMAG accessors.
+FloatComplex
+blasCdotc(size_t N, FloatComplex *X, size_t offx, int incx,
+          FloatComplex *Y, size_t offy, int incy)
+{
+    complex *xBuf = (complex*)(X+offx);
+    complex *yBuf = (complex*)(Y+offy);
+    complex dot = cdotc((int)N, xBuf, incx, yBuf, incy);
+    FloatComplex result;
+
+    CREAL(result) = dot.real;
+    CIMAG(result) = dot.imag;
+    return result;
+}
+
+// Wrapper around the external `zdotc` routine.
+// Narrows N to int, offsets X and Y by offx / offy elements and reinterprets
+// them as `doublecomplex` buffers, then copies the `.real` / `.imag` fields of
+// the returned value into a DoubleComplex through the CREAL / CIMAG accessors.
+DoubleComplex
+blasZdotc(size_t N, DoubleComplex *X, size_t offx, int incx,
+          DoubleComplex *Y, size_t offy, int incy)
+{
+    doublecomplex *xBuf = (doublecomplex*)(X+offx);
+    doublecomplex *yBuf = (doublecomplex*)(Y+offy);
+    doublecomplex dot = zdotc((int)N, xBuf, incx, yBuf, incy);
+    DoubleComplex result;
+
+    CREAL(result) = dot.real;
+    CIMAG(result) = dot.imag;
+    return result;
+}
+
+
+// Wrapper around the external `saxpy` routine: narrows N to int and forwards
+// alpha, X advanced by offBX elements and Y advanced by offCY elements.
+void
+blasSaxpy(size_t N, float alpha,
+          const float *X, size_t offBX, int incx,
+          float *Y, size_t offCY, int incy)
+{
+    int nInt = (int)N;
+    float *xBuf = (float*)(X+offBX);    // cast drops the const qualifier of X
+    float *yBuf = Y + offCY;
+
+    saxpy(nInt, alpha, xBuf, incx, yBuf, incy);
+}
+
+// Wrapper around the external `daxpy` routine: narrows N to int and forwards
+// alpha, X advanced by offBX elements and Y advanced by offCY elements.
+void
+blasDaxpy(size_t N, double alpha,
+          const double *X, size_t offBX, int incx,
+          double *Y, size_t offCY, int incy)
+{
+    int nInt = (int)N;
+    double *xBuf = (double*)(X+offBX);  // cast drops the const qualifier of X
+    double *yBuf = Y + offCY;
+
+    daxpy(nInt, alpha, xBuf, incx, yBuf, incy);
+}
+
+// Wrapper around the external `caxpy` routine: narrows N to int, passes the
+// address of the by-value alpha reinterpreted as `complex*`, and forwards X and
+// Y offset by offBX / offCY elements and reinterpreted as `complex` buffers.
+void
+blasCaxpy(size_t N, FloatComplex alpha,
+          const FloatComplex *X, size_t offBX, int incx,
+          FloatComplex *Y, size_t offCY, int incy)
+{
+    int nInt = (int)N;
+    complex *alphaPtr = (complex*)(&alpha);
+    complex *xBuf = (complex*)(X+offBX);    // cast drops the const qualifier of X
+    complex *yBuf = (complex*)(Y+offCY);
+
+    caxpy(nInt, alphaPtr, xBuf, incx, yBuf, incy);
+}
+
+// Wrapper around the external `zaxpy` routine: narrows N to int, passes the
+// address of the by-value alpha reinterpreted as `doublecomplex*`, and forwards
+// X and Y offset by offBX / offCY elements and reinterpreted as
+// `doublecomplex` buffers.
+void
+blasZaxpy(size_t N, DoubleComplex alpha,
+          const DoubleComplex *X, size_t offBX, int incx,
+          DoubleComplex *Y, size_t offCY, int incy)
+{
+    int nInt = (int)N;
+    doublecomplex *alphaPtr = (doublecomplex*)(&alpha);
+    doublecomplex *xBuf = (doublecomplex*)(X+offBX);    // cast drops the const qualifier of X
+    doublecomplex *yBuf = (doublecomplex*)(Y+offCY);
+
+    zaxpy(nInt, alphaPtr, xBuf, incx, yBuf, incy);
+}
+
+
+//ROTG
+// Wrapper around the external `srotg` routine: forwards each of SA, SB, C and S
+// advanced by its own element offset.
+void
+blasSrotg(float* SA, size_t offSA, float* SB, size_t offSB,
+          float* C, size_t offC, float* S, size_t offS)
+{
+    float *aPtr = SA + offSA;
+    float *bPtr = SB + offSB;
+    float *cPtr = C + offC;
+    float *sPtr = S + offS;
+
+    srotg(aPtr, bPtr, cPtr, sPtr);
+}
+
+// Wrapper around the external `drotg` routine: forwards each of SA, SB, C and S
+// advanced by its own element offset.
+void
+blasDrotg(double* SA, size_t offSA, double* SB, size_t offSB,
+          double* C, size_t offC, double* S, size_t offS)
+{
+    double *aPtr = SA + offSA;
+    double *bPtr = SB + offSB;
+    double *cPtr = C + offC;
+    double *sPtr = S + offS;
+
+    drotg(aPtr, bPtr, cPtr, sPtr);
+}
+
+// Wrapper around the external `crotg` routine: SA, SB and S are offset by their
+// element offsets and reinterpreted as `complex*`; C stays a float pointer
+// advanced by offC.
+void
+blasCrotg(FloatComplex* SA, size_t offSA, FloatComplex* SB, size_t offSB,
+          float* C, size_t offC, FloatComplex* S, size_t offS)
+{
+    complex *aPtr = (complex*)(SA+offSA);
+    complex *bPtr = (complex*)(SB+offSB);
+    float *cPtr = C + offC;
+    complex *sPtr = (complex*)(S+offS);
+
+    crotg(aPtr, bPtr, cPtr, sPtr);
+}
+
+/*
+ * Advance SA, SB, C and S by their respective element offsets and pass
+ * the resulting addresses to zrotg(). The DoubleComplex pointers are
+ * viewed as doublecomplex; C is a plain double pointer and goes as is.
+ */
+void
+blasZrotg(DoubleComplex* SA, size_t offSA,
+          DoubleComplex* SB, size_t offSB,
+          double* C, size_t offC,
+          DoubleComplex* S, size_t offS)
+{
+    zrotg((doublecomplex*)&SA[offSA], (doublecomplex*)&SB[offSB], &C[offC], (doublecomplex*)&S[offS]);
+}
+
+void
+blasSrotmg(float *D1, size_t offD1,
+           float *D2, size_t offD2,
+           float *X1, size_t offX1,
+           const float *Y1, size_t offY1,
+           float *PARAM, size_t offParam)
+{
+    /* Resolve each operand to the element its offset selects. */
+    float *d1 = D1 + offD1;
+    float *d2 = D2 + offD2;
+    float *x1 = X1 + offX1;
+    const float *y1 = Y1 + offY1;
+    float *param = PARAM + offParam;
+
+    // C and fortran interface are different for rotmg..  FIXME
+#if defined CORR_TEST_WITH_ACML
+    /* In this configuration the four scalars are passed by value. */
+    srotmg(*d1, *d2, *x1, *y1, param);
+#else
+    srotmg(d1, d2, x1, y1, param);
+#endif
+}
+
+void
+blasDrotmg(double *D1, size_t offD1,
+           double *D2, size_t offD2,
+           double *X1, size_t offX1,
+           const double *Y1, size_t offY1,
+           double *PARAM, size_t offParam)
+{
+    /* Resolve each operand to the element its offset selects. */
+    double *d1 = D1 + offD1;
+    double *d2 = D2 + offD2;
+    double *x1 = X1 + offX1;
+    const double *y1 = Y1 + offY1;
+    double *param = PARAM + offParam;
+
+    // C and fortran interface are different for rotmg..  FIXME
+#if defined CORR_TEST_WITH_ACML
+    /* In this configuration the four scalars are passed by value. */
+    drotmg(*d1, *d2, *x1, *y1, param);
+#else
+    drotmg(d1, d2, x1, y1, param);
+#endif
+}
+
+/*
+ * Apply the element offsets to X, Y and PARAM and forward to srotm().
+ * N is narrowed to int explicitly, matching the other wrappers in this
+ * file, instead of relying on an implicit size_t -> int conversion.
+ */
+void
+blasSrotm(size_t N,
+          float *X, size_t offx, int incx,
+          float *Y, size_t offy, int incy,
+          float *PARAM, size_t offParam)
+{
+    srotm((int)N, (X+offx), incx, (Y+offy), incy,
+          (PARAM+offParam));
+}
+
+/*
+ * Apply the element offsets to X, Y and PARAM and forward to drotm().
+ * N is narrowed to int explicitly, matching the other wrappers in this
+ * file, instead of relying on an implicit size_t -> int conversion.
+ */
+void
+blasDrotm(size_t N,
+          double *X, size_t offx, int incx,
+          double *Y, size_t offy, int incy,
+          double *PARAM, size_t offParam)
+{
+    drotm((int)N, (X+offx), incx, (Y+offy), incy,
+          (PARAM+offParam));
+}
+//ROT
+
+/*
+ * Apply the element offsets to X and Y and forward to srot() together
+ * with the scalars C and S. N is narrowed to int explicitly, matching the
+ * other wrappers in this file, instead of converting implicitly.
+ */
+void
+blasSrot(size_t N,
+         float *X, size_t offx, int incx,
+         float *Y, size_t offy, int incy,
+         float C, float S)
+{
+    srot((int)N, (X+offx), incx, (Y+offy), incy,
+         C, S);
+}
+
+/*
+ * Apply the element offsets to X and Y and forward to drot() together
+ * with the scalars C and S. N is narrowed to int explicitly, matching the
+ * other wrappers in this file, instead of converting implicitly.
+ */
+void
+blasDrot(size_t N,
+         double *X, size_t offx, int incx,
+         double *Y, size_t offy, int incy,
+         double C, double S)
+{
+    drot((int)N, (X+offx), incx, (Y+offy), incy,
+         C, S);
+}
+
+/*
+ * Apply the element offsets to X and Y, view them as complex, and forward
+ * to csrot() with the float scalars C and S. N is narrowed to int
+ * explicitly, matching the other wrappers in this file.
+ */
+void
+blasCsrot(size_t N,
+          FloatComplex *X, size_t offx, int incx,
+          FloatComplex *Y, size_t offy, int incy,
+          float C, float S)
+{
+    csrot((int)N, (complex*)(X+offx), incx, (complex*)(Y+offy), incy,
+          C, S);
+}
+
+/*
+ * Apply the element offsets to X and Y, view them as doublecomplex, and
+ * forward to zdrot() with the double scalars C and S. N is narrowed to
+ * int explicitly, matching the other wrappers in this file.
+ */
+void
+blasZdrot(size_t N,
+          DoubleComplex *X, size_t offx, int incx,
+          DoubleComplex *Y, size_t offy, int incy,
+          double C, double S)
+{
+    zdrot((int)N, (doublecomplex*)(X+offx), incx, (doublecomplex*)(Y+offy), incy,
+          C, S);
+}
+
+/* Advance X by offx elements and return what isamax() reports for it. */
+int
+blasiSamax(size_t N, float *X, size_t offx, int incx)
+{
+    float *x = X + offx;
+    int count = (int)N;
+
+    return isamax(count, x, incx);
+}
+
+/* Advance X by offx elements and return what idamax() reports for it. */
+int
+blasiDamax(size_t N, double *X, size_t offx, int incx)
+{
+    double *x = X + offx;
+    int count = (int)N;
+
+    return idamax(count, x, incx);
+}
+
+/* Advance X by offx elements, view it as complex, and return icamax(). */
+int
+blasiCamax(size_t N, FloatComplex *X, size_t offx, int incx)
+{
+    complex *x = (complex*)(X + offx);
+    int count = (int)N;
+
+    return icamax(count, x, incx);
+}
+
+/* Advance X by offx elements, view it as doublecomplex, and return izamax(). */
+int
+blasiZamax(size_t N, DoubleComplex *X, size_t offx, int incx)
+{
+    doublecomplex *x = (doublecomplex*)(X + offx);
+    int count = (int)N;
+
+    return izamax(count, x, incx);
+}
+
+/* Advance X by offx elements and return what snrm2() computes for it. */
+float
+blasSnrm2(size_t N, float *X, size_t offx, int incx)
+{
+    float *x = X + offx;
+    int count = (int)N;
+
+    return snrm2(count, x, incx);
+}
+
+/* Advance X by offx elements and return what dnrm2() computes for it. */
+double
+blasDnrm2(size_t N, double *X, size_t offx, int incx)
+{
+    double *x = X + offx;
+    int count = (int)N;
+
+    return dnrm2(count, x, incx);
+}
+
+/* Advance X by offx elements, view it as complex, and return scnrm2(). */
+float
+blasScnrm2(size_t N, FloatComplex *X, size_t offx, int incx)
+{
+    complex *x = (complex*)(X + offx);
+    int count = (int)N;
+
+    return scnrm2(count, x, incx);
+}
+
+/* Advance X by offx elements, view it as doublecomplex, and return dznrm2(). */
+double
+blasDznrm2(size_t N, DoubleComplex *X, size_t offx, int incx)
+{
+    doublecomplex *x = (doublecomplex*)(X + offx);
+    int count = (int)N;
+
+    return dznrm2(count, x, incx);
+}
diff --git a/src/tests/clBLAS-wrapper.cpp b/src/tests/clBLAS-wrapper.cpp
new file mode 100644
index 0000000..1e4261f
--- /dev/null
+++ b/src/tests/clBLAS-wrapper.cpp
@@ -0,0 +1,3463 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <clBLAS.h>
+#include "clBLAS-wrapper.h"
+
+/*
+ * gemv overload whose alpha and beta are float.
+ *
+ * Pure pass-through: every argument is handed to clblasSgemv() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::gemv(
+    clblasOrder order, clblasTranspose transA,
+    size_t M, size_t N,
+    float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    float beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasSgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, offA, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * gemv overload whose alpha and beta are double.
+ *
+ * Pure pass-through: every argument is handed to clblasDgemv() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::gemv(
+    clblasOrder order, clblasTranspose transA,
+    size_t M, size_t N,
+    double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    double beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasDgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, offA, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * gemv overload whose alpha and beta are FloatComplex.
+ *
+ * Pure pass-through: every argument is handed to clblasCgemv() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::gemv(
+    clblasOrder order, clblasTranspose transA,
+    size_t M, size_t N,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    FloatComplex beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasCgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, offA, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * gemv overload whose alpha and beta are DoubleComplex.
+ *
+ * Pure pass-through: every argument is handed to clblasZgemv() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::gemv(
+    clblasOrder order, clblasTranspose transA,
+    size_t M, size_t N,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    DoubleComplex beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasZgemv(
+        order, transA,
+        M, N,
+        alpha,
+        A, offA, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+// SYMV wrappers
+/*
+ * symv overload whose alpha and beta are float.
+ *
+ * Pure pass-through: the arguments go to clblasSsymv() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::symv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    float beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasSsymv(
+        order, uplo,
+        N,
+        alpha,
+        A, offA, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * symv overload whose alpha and beta are double.
+ *
+ * Pure pass-through: the arguments go to clblasDsymv() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::symv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    double beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasDsymv(
+        order, uplo,
+        N,
+        alpha,
+        A, offA, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * gemm overload whose alpha and beta are float.
+ *
+ * Pure pass-through: every argument is handed to clblasSgemm() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::gemm(
+    clblasOrder order,
+    clblasTranspose transA, clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    float beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasSgemm(
+        order,
+        transA, transB,
+        M, N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * gemm overload whose alpha and beta are double.
+ *
+ * Pure pass-through: every argument is handed to clblasDgemm() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::gemm(
+    clblasOrder order,
+    clblasTranspose transA, clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    double beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasDgemm(
+        order,
+        transA, transB,
+        M, N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * gemm overload whose alpha and beta are FloatComplex.
+ *
+ * Pure pass-through: every argument is handed to clblasCgemm() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::gemm(
+    clblasOrder order,
+    clblasTranspose transA, clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    FloatComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasCgemm(
+        order,
+        transA, transB,
+        M, N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * gemm overload whose alpha and beta are DoubleComplex.
+ *
+ * Pure pass-through: every argument is handed to clblasZgemm() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::gemm(
+    clblasOrder order,
+    clblasTranspose transA, clblasTranspose transB,
+    size_t M, size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    DoubleComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasZgemm(
+        order,
+        transA, transB,
+        M, N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+#undef GEMMV2_VISIBLE // GEMM2 is not exported.
+
+/*
+ * gemm2 overload whose alpha and beta are float.
+ *
+ * With GEMMV2_VISIBLE defined, calls clblasSgemmV2() when offA, offB and
+ * offC are all zero and clblasSgemmExV2() otherwise, and returns that
+ * call's status. Without it, no library call is made and
+ * clblasNotImplemented is returned.
+ */
+clblasStatus
+clMath::clblas::gemm2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    float alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    float beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret = clblasNotImplemented;
+
+#ifdef GEMMV2_VISIBLE //If GEMM2 is visible
+
+    if (!(offA || offB || offC)) {
+        ret = clblasSgemmV2(order, transA, transB, M, N, K, alpha, A, lda,
+                             B, ldb, beta, C, ldc, numCommandQueues,
+                             commandQueues, numEventsInWaitList, eventWaitList,
+                             events);
+    }
+    else {
+        ret = clblasSgemmExV2(order, transA, transB, M, N, K, alpha, A, offA,
+                               lda, B, offB, ldb, beta, C, offC, ldc,
+                               numCommandQueues, commandQueues,
+                               numEventsInWaitList, eventWaitList, events);
+    }
+#else
+    // Nothing is called in this configuration. Every parameter is marked
+    // as used with a void cast rather than a self-assignment, because some
+    // compilers warn about assigning a variable to itself.
+    (void)order; (void)transA; (void)transB;
+    (void)M; (void)N; (void)K;
+    (void)alpha; (void)beta;
+    (void)A; (void)offA; (void)lda;
+    (void)B; (void)offB; (void)ldb;
+    (void)C; (void)offC; (void)ldc;
+    (void)numCommandQueues;
+    (void)commandQueues;
+    (void)numEventsInWaitList;
+    (void)eventWaitList;
+    (void)events;
+#endif
+
+    return ret;
+}
+
+/*
+ * gemm2 overload whose alpha and beta are double.
+ *
+ * With GEMMV2_VISIBLE defined, calls clblasDgemmV2() when offA, offB and
+ * offC are all zero and clblasDgemmExV2() otherwise, and returns that
+ * call's status. Without it, no library call is made and
+ * clblasNotImplemented is returned.
+ */
+clblasStatus
+clMath::clblas::gemm2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    double alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    double beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret = clblasNotImplemented;
+
+#ifdef GEMMV2_VISIBLE
+    if (!(offA || offB || offC)) {
+        ret = clblasDgemmV2(order, transA, transB, M, N, K, alpha, A, lda,
+                             B, ldb, beta, C, ldc, numCommandQueues,
+                             commandQueues, numEventsInWaitList, eventWaitList,
+                             events);
+    }
+    else {
+        ret = clblasDgemmExV2(order, transA, transB, M, N, K, alpha, A, offA,
+                               lda, B, offB, ldb, beta, C, offC, ldc,
+                               numCommandQueues, commandQueues,
+                               numEventsInWaitList, eventWaitList, events);
+    }
+#else
+    // Nothing is called in this configuration. Every parameter is marked
+    // as used with a void cast rather than a self-assignment, because some
+    // compilers warn about assigning a variable to itself.
+    (void)order; (void)transA; (void)transB;
+    (void)M; (void)N; (void)K;
+    (void)alpha; (void)beta;
+    (void)A; (void)offA; (void)lda;
+    (void)B; (void)offB; (void)ldb;
+    (void)C; (void)offC; (void)ldc;
+    (void)numCommandQueues;
+    (void)commandQueues;
+    (void)numEventsInWaitList;
+    (void)eventWaitList;
+    (void)events;
+#endif
+
+    return ret;
+}
+
+/*
+ * gemm2 overload whose alpha and beta are FloatComplex.
+ *
+ * With GEMMV2_VISIBLE defined, calls clblasCgemmV2() when offA, offB and
+ * offC are all zero and clblasCgemmExV2() otherwise, and returns that
+ * call's status. Without it, no library call is made and
+ * clblasNotImplemented is returned.
+ */
+clblasStatus
+clMath::clblas::gemm2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    FloatComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret = clblasNotImplemented;
+
+#ifdef GEMMV2_VISIBLE
+    if (!(offA || offB || offC)) {
+        ret = clblasCgemmV2(order, transA, transB, M, N, K, alpha, A, lda,
+                             B, ldb, beta, C, ldc, numCommandQueues,
+                             commandQueues, numEventsInWaitList, eventWaitList,
+                             events);
+    }
+    else {
+        ret = clblasCgemmExV2(order, transA, transB, M, N, K, alpha, A, offA,
+                               lda, B, offB, ldb, beta, C, offC, ldc,
+                               numCommandQueues, commandQueues,
+                               numEventsInWaitList, eventWaitList, events);
+    }
+#else
+    // Nothing is called in this configuration. Every parameter is marked
+    // as used with a void cast rather than a self-assignment, because some
+    // compilers warn about assigning a variable to itself.
+    (void)order; (void)transA; (void)transB;
+    (void)M; (void)N; (void)K;
+    (void)alpha; (void)beta;
+    (void)A; (void)offA; (void)lda;
+    (void)B; (void)offB; (void)ldb;
+    (void)C; (void)offC; (void)ldc;
+    (void)numCommandQueues;
+    (void)commandQueues;
+    (void)numEventsInWaitList;
+    (void)eventWaitList;
+    (void)events;
+#endif
+    return ret;
+}
+
+/*
+ * gemm2 overload whose alpha and beta are DoubleComplex.
+ *
+ * With GEMMV2_VISIBLE defined, calls clblasZgemmV2() when offA, offB and
+ * offC are all zero and clblasZgemmExV2() otherwise, and returns that
+ * call's status. Without it, no library call is made and
+ * clblasNotImplemented is returned.
+ */
+clblasStatus
+clMath::clblas::gemm2(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    const cl_mem B,
+    size_t offB,
+    size_t ldb,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret = clblasNotImplemented;
+
+#ifdef GEMMV2_VISIBLE
+    if (!(offA || offB || offC)) {
+        ret = clblasZgemmV2(order, transA, transB, M, N, K, alpha, A, lda,
+                             B, ldb, beta, C, ldc, numCommandQueues,
+                             commandQueues, numEventsInWaitList, eventWaitList,
+                             events);
+    }
+    else {
+        ret = clblasZgemmExV2(order, transA, transB, M, N, K, alpha, A, offA,
+                               lda, B, offB, ldb, beta, C, offC, ldc,
+                               numCommandQueues, commandQueues,
+                               numEventsInWaitList, eventWaitList, events);
+    }
+
+#else
+    // Nothing is called in this configuration. Every parameter is marked
+    // as used with a void cast rather than a self-assignment, because some
+    // compilers warn about assigning a variable to itself.
+    (void)order; (void)transA; (void)transB;
+    (void)M; (void)N; (void)K;
+    (void)alpha; (void)beta;
+    (void)A; (void)offA; (void)lda;
+    (void)B; (void)offB; (void)ldb;
+    (void)C; (void)offC; (void)ldc;
+    (void)numCommandQueues;
+    (void)commandQueues;
+    (void)numEventsInWaitList;
+    (void)eventWaitList;
+    (void)events;
+#endif
+
+    return ret;
+}
+
+/*
+ * trmm overload whose alpha is float.
+ *
+ * Pure pass-through: the arguments go to clblasStrmm() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::trmm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem B, size_t offB, size_t ldb,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasStrmm(
+        order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * trmm overload whose alpha is double.
+ *
+ * Pure pass-through: the arguments go to clblasDtrmm() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::trmm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem B, size_t offB, size_t ldb,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasDtrmm(
+        order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * trmm overload whose alpha is FloatComplex.
+ *
+ * Pure pass-through: the arguments go to clblasCtrmm() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::trmm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem B, size_t offB, size_t ldb,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasCtrmm(
+        order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * trmm overload whose alpha is DoubleComplex.
+ *
+ * Pure pass-through: the arguments go to clblasZtrmm() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::trmm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem B, size_t offB, size_t ldb,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasZtrmm(
+        order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * trsm overload whose alpha is float.
+ *
+ * Pure pass-through: the arguments go to clblasStrsm() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::trsm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem B, size_t offB, size_t ldb,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasStrsm(
+        order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * trsm overload whose alpha is double.
+ *
+ * Pure pass-through: the arguments go to clblasDtrsm() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::trsm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem B, size_t offB, size_t ldb,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasDtrsm(
+        order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * trsm overload whose alpha is FloatComplex.
+ *
+ * Pure pass-through: the arguments go to clblasCtrsm() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::trsm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem B, size_t offB, size_t ldb,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasCtrsm(
+        order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * trsm overload whose alpha is DoubleComplex.
+ *
+ * Pure pass-through: the arguments go to clblasZtrsm() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::trsm(
+    clblasOrder order,
+    clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    cl_mem B, size_t offB, size_t ldb,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasZtrsm(
+        order,
+        side, uplo,
+        transA, diag,
+        M, N,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * syr2k overload whose alpha and beta are float.
+ *
+ * Pure pass-through: the arguments go to clblasSsyr2k() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::syr2k(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    float beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasSsyr2k(
+        order,
+        uplo, transAB,
+        N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * syr2k overload whose alpha and beta are double.
+ *
+ * Pure pass-through: the arguments go to clblasDsyr2k() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::syr2k(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    double beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasDsyr2k(
+        order,
+        uplo, transAB,
+        N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * syr2k overload whose alpha and beta are FloatComplex.
+ *
+ * Pure pass-through: the arguments go to clblasCsyr2k() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::syr2k(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    FloatComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    FloatComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasCsyr2k(
+        order,
+        uplo, transAB,
+        N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * syr2k overload whose alpha and beta are DoubleComplex.
+ *
+ * Pure pass-through: the arguments go to clblasZsyr2k() in the order
+ * received, and the status that call produces is returned unmodified.
+ */
+clblasStatus
+clMath::clblas::syr2k(
+    clblasOrder order,
+    clblasUplo uplo, clblasTranspose transAB,
+    size_t N, size_t K,
+    DoubleComplex alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    const cl_mem B, size_t offB, size_t ldb,
+    DoubleComplex beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasZsyr2k(
+        order,
+        uplo, transAB,
+        N, K,
+        alpha,
+        A, offA, lda,
+        B, offB, ldb,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * syrk overload whose alpha and beta are float.
+ *
+ * Pure pass-through: every argument is handed to clblasSsyrk() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::syrk(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    float alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    float beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasSsyrk(
+        order, uplo, transA,
+        N, K,
+        alpha,
+        A, offA, lda,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+/*
+ * syrk overload whose alpha and beta are double.
+ *
+ * Pure pass-through: every argument is handed to clblasDsyrk() in the
+ * order it was received, and the status that call produces is returned
+ * to the caller unmodified.
+ */
+clblasStatus
+clMath::clblas::syrk(
+    clblasOrder order, clblasUplo uplo, clblasTranspose transA,
+    size_t N, size_t K,
+    double alpha,
+    const cl_mem A, size_t offA, size_t lda,
+    double beta,
+    cl_mem C, size_t offC, size_t ldc,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasDsyrk(
+        order, uplo, transA,
+        N, K,
+        alpha,
+        A, offA, lda,
+        beta,
+        C, offC, ldc,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList,
+        events);
+}
+
+clblasStatus
+clMath::clblas::syrk(  // FloatComplex overload: thin pass-through to clblasCsyrk
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    FloatComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret;
+
+    ret = clblasCsyrk(order, uplo, transA, N, K, alpha, A, offA, lda,  // every parameter is forwarded unchanged, in declaration order
+                            beta, C, offC, ldc, numCommandQueues,
+                            commandQueues, numEventsInWaitList,
+                            eventWaitList, events);
+
+    return ret;  // status from clblasCsyrk, returned as-is
+}
+
+clblasStatus
+clMath::clblas::syrk(  // DoubleComplex overload: thin pass-through to clblasZsyrk
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret;
+
+    ret = clblasZsyrk(order, uplo, transA, N, K, alpha, A, offA, lda,  // every parameter is forwarded unchanged, in declaration order
+                            beta, C, offC, ldc, numCommandQueues,
+                            commandQueues, numEventsInWaitList,
+                            eventWaitList, events);
+
+    return ret;  // status from clblasZsyrk, returned as-is
+}
+
+clblasStatus
+clMath::clblas::trmv(  // run-time dispatcher: picks clblas{S,D,C,Z}trmv from `type`
+	DataType type,  // used only to choose the callee; never forwarded
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,  // handed to the callee untouched, right after incx
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+	{
+		switch(type)  // each case returns the callee's status directly
+		{
+			case TYPE_FLOAT:
+				return clblasStrmv(
+					order, uplo, trans, diag, N, A, offa, lda, X,
+					offx, incx, scratchBuff,
+					numCommandQueues,commandQueues, numEventsInWaitList,
+					eventWaitList, events);
+
+			case TYPE_DOUBLE:
+				return clblasDtrmv(
+					order, uplo, trans, diag, N, A, offa, lda, X,
+					offx, incx, scratchBuff,
+					numCommandQueues, commandQueues, numEventsInWaitList,
+					eventWaitList, events);
+
+			case TYPE_COMPLEX_FLOAT:
+				return clblasCtrmv(
+					order, uplo, trans, diag, N, A, offa, lda, X,
+					offx, incx, scratchBuff,
+					numCommandQueues, commandQueues, numEventsInWaitList,
+					eventWaitList, events);
+
+			case TYPE_COMPLEX_DOUBLE:
+				return clblasZtrmv(
+					order, uplo, trans, diag, N, A, offa, lda, X,
+					offx, incx, scratchBuff,
+					numCommandQueues, commandQueues, numEventsInWaitList,
+					eventWaitList, events);
+
+			default:  // any DataType value not listed above
+				return 	clblasInvalidValue;  // no clBLAS routine is called on this path
+		}
+	}
+
+clblasStatus
+clMath::clblas::trsv(  // run-time dispatcher: picks clblas{S,D,C,Z}trsv from `type`
+    DataType type,  // used only to choose the callee; never forwarded
+	clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+	{
+	switch(type)  // each case returns the callee's status directly
+        {
+            case TYPE_FLOAT:
+                return clblasStrsv(
+                    order, uplo, trans, diag, N, A, offa, lda, X,
+                    offx, incx, numCommandQueues,
+                    commandQueues, numEventsInWaitList,
+                    eventWaitList, events);
+
+            case TYPE_DOUBLE:
+                return clblasDtrsv(
+                    order, uplo, trans, diag, N, A, offa, lda, X,
+                    offx, incx, numCommandQueues,
+                    commandQueues, numEventsInWaitList,
+                    eventWaitList, events);
+
+            case TYPE_COMPLEX_FLOAT:
+                return clblasCtrsv(
+                    order, uplo, trans, diag, N, A, offa, lda, X,
+                    offx, incx,numCommandQueues,
+                    commandQueues, numEventsInWaitList,
+                    eventWaitList, events);
+
+            case TYPE_COMPLEX_DOUBLE:
+                return clblasZtrsv(
+                    order, uplo, trans, diag, N, A, offa, lda, X,
+                    offx, incx, numCommandQueues,
+                    commandQueues, numEventsInWaitList,
+                    eventWaitList, events);
+
+            default:  // any DataType value not listed above
+                return  clblasInvalidValue;  // no clBLAS routine is called on this path
+        }
+    }
+
+clblasStatus
+clMath::clblas::tpsv(  // run-time dispatcher: picks clblas{S,D,C,Z}tpsv from `type`
+    DataType type,  // used only to choose the callee; never forwarded
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem A,
+    size_t offa,  // note: unlike trsv, this signature has no lda parameter
+    cl_mem X,
+    size_t offx,
+    int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+    switch(type)  // each case returns the callee's status directly
+        {
+            case TYPE_FLOAT:
+                return clblasStpsv(
+                    order, uplo, trans, diag, N, A, offa, X,
+                    offx, incx, numCommandQueues,
+                    commandQueues, numEventsInWaitList,
+                    eventWaitList, events);
+
+            case TYPE_DOUBLE:
+                return clblasDtpsv(
+                    order, uplo, trans, diag, N, A, offa, X,
+                    offx, incx, numCommandQueues,
+                    commandQueues, numEventsInWaitList,
+                    eventWaitList, events);
+
+            case TYPE_COMPLEX_FLOAT:
+                return clblasCtpsv(
+                    order, uplo, trans, diag, N, A, offa, X,
+                    offx, incx,numCommandQueues,
+                    commandQueues, numEventsInWaitList,
+                    eventWaitList, events);
+
+            case TYPE_COMPLEX_DOUBLE:
+                return clblasZtpsv(
+                    order, uplo, trans, diag, N, A, offa, X,
+                    offx, incx, numCommandQueues,
+                    commandQueues, numEventsInWaitList,
+                    eventWaitList, events);
+
+            default:  // any DataType value not listed above
+                return  clblasInvalidValue;  // no clBLAS routine is called on this path
+        }
+    }
+
+clblasStatus
+clMath::clblas::symm(  // float overload: thin pass-through to clblasSsymm
+	clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    float alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    float beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+	{
+		return clblasSsymm( order, side, uplo, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+    							A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+    							numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+	}
+
+
+clblasStatus
+clMath::clblas::symm(  // double overload: thin pass-through to clblasDsymm
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    double alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    double beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasDsymm( order, side, uplo, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::symm(  // FloatComplex overload: thin pass-through to clblasCsymm
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    FloatComplex beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasCsymm( order, side, uplo, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::symm(  // DoubleComplex overload: thin pass-through to clblasZsymm
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZsymm( order, side, uplo, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::ger(  // float overload: thin pass-through to clblasSger
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+        {
+                return clblasSger( order, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                        X, offx, incx, Y, offy, incy, A, offa, lda,
+                                        numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        }
+
+
+clblasStatus
+clMath::clblas::ger(  // double overload: thin pass-through to clblasDger
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasDger( order, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, Y, offy, incy, A, offa, lda,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::ger(  // FloatComplex overload: forwards to clblasCgeru (the "geru" entry point)
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasCgeru( order, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, Y, offy, incy, A, offa, lda,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::ger(  // DoubleComplex overload: forwards to clblasZgeru (the "geru" entry point)
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZgeru( order, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, Y, offy, incy, A, offa, lda,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::gerc(  // FloatComplex overload: thin pass-through to clblasCgerc
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasCgerc( order, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, Y, offy, incy, A, offa, lda,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::gerc(  // DoubleComplex overload: thin pass-through to clblasZgerc
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZgerc( order, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, Y, offy, incy, A, offa, lda,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::syr(  // float overload: thin pass-through to clblasSsyr
+	clblasOrder order,
+	clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+	cl_uint numCommandQueues,
+	cl_command_queue *commandQueues,
+	cl_uint numEventsInWaitList,
+	const cl_event *eventWaitList,
+	cl_event *events)
+	{
+		return clblasSsyr( order, uplo, N, alpha, X, offx, incx, A, offa, lda,  // parameters forwarded unchanged; callee's status is the result
+								numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+	}
+
+clblasStatus
+clMath::clblas::syr(  // double overload: thin pass-through to clblasDsyr
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasDsyr( order, uplo, N, alpha, X, offx, incx, A, offa, lda,  // parameters forwarded unchanged; callee's status is the result
+                              numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+//SPR
+
+clblasStatus
+clMath::clblas::her(  // overload selected by a float alpha: thin pass-through to clblasCher
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,  // plain float here even though the callee is the "C" routine
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasCher( order, uplo, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, A, offa, lda,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::her(  // overload selected by a double alpha: thin pass-through to clblasZher
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,  // plain double here even though the callee is the "Z" routine
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZher( order, uplo, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, A, offa, lda,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::syr2(  // float overload: thin pass-through to clblasSsyr2
+	clblasOrder order,
+	clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+	cl_uint numCommandQueues,
+	cl_command_queue *commandQueues,
+	cl_uint numEventsInWaitList,
+	const cl_event *eventWaitList,
+	cl_event *events)
+	{
+		return clblasSsyr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,  // parameters forwarded unchanged; callee's status is the result
+								numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+	}
+
+clblasStatus
+clMath::clblas::syr2(  // double overload: thin pass-through to clblasDsyr2
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasDsyr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,  // parameters forwarded unchanged; callee's status is the result
+                              numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::her2(  // FloatComplex overload: thin pass-through to clblasCher2
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasCher2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,  // parameters forwarded unchanged; callee's status is the result
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::her2(  // DoubleComplex overload: thin pass-through to clblasZher2
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem A,
+    size_t offa,
+    size_t lda,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZher2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda,  // parameters forwarded unchanged; callee's status is the result
+                              numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::hemv(  // FloatComplex overload: thin pass-through to clblasChemv
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    FloatComplex beta,
+        cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasChemv( order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy,  // parameters forwarded unchanged; callee's status is the result
+                              numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::hemv(  // DoubleComplex overload: thin pass-through to clblasZhemv
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    DoubleComplex beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZhemv( order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy,  // parameters forwarded unchanged; callee's status is the result
+                              numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+//HEMM
+clblasStatus
+clMath::clblas::hemm(  // FloatComplex overload: thin pass-through to clblasChemm
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+    size_t ldb,
+    FloatComplex beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasChemm( order, side, uplo, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::hemm(  // DoubleComplex overload: thin pass-through to clblasZhemm
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem A,
+    size_t offa,
+    size_t lda,
+    const cl_mem B,
+    size_t offb,
+	 size_t ldb,
+    DoubleComplex beta,
+    cl_mem C,
+    size_t offc,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZhemm( order, side, uplo, M, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                A, offa, lda, B, offb, ldb, beta, C, offc, ldc,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::herk(  // overload selected by float alpha/beta: thin pass-through to clblasCherk
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,  // plain float here even though the callee is the "C" routine
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    float beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+	clblasStatus ret;
+
+	ret = clblasCherk(order, uplo, transA, N, K, alpha, A, offA, lda,  // every parameter is forwarded unchanged, in declaration order
+						beta, C, offC, ldc, numCommandQueues,
+                        commandQueues, numEventsInWaitList,
+                        eventWaitList, events);
+
+	return ret;  // status from clblasCherk, returned as-is
+}
+
+clblasStatus
+clMath::clblas::herk(  // overload selected by double alpha/beta: thin pass-through to clblasZherk
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,  // plain double here even though the callee is the "Z" routine
+    const cl_mem A,
+    size_t offA,
+    size_t lda,
+    double beta,
+    cl_mem C,
+    size_t offC,
+    size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret;
+
+    ret = clblasZherk(order, uplo, transA, N, K, alpha, A, offA, lda,  // every parameter is forwarded unchanged, in declaration order
+                        beta, C, offC, ldc, numCommandQueues,
+                        commandQueues, numEventsInWaitList,
+                        eventWaitList, events);
+
+    return ret;  // status from clblasZherk, returned as-is
+}
+
+
+clblasStatus
+clMath::clblas::tpmv(  // run-time dispatcher: picks clblas{S,D,C,Z}tpmv from `type`
+	DataType type,  // used only to choose the callee; never forwarded
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose trans,
+    clblasDiag diag,
+    size_t N,
+    const cl_mem AP,
+    size_t offa,  // note: unlike trmv, this signature has no lda parameter
+    cl_mem X,
+    size_t offx,
+    int incx,
+	cl_mem scratchBuff,  // handed to the callee untouched, right after incx
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+	{
+		switch(type)  // each case returns the callee's status directly
+		{
+			case TYPE_FLOAT:
+				return clblasStpmv(
+					order, uplo, trans, diag, N, AP, offa, X,
+					offx, incx, scratchBuff,
+					numCommandQueues,commandQueues, numEventsInWaitList,
+					eventWaitList, events);
+
+			case TYPE_DOUBLE:
+				return clblasDtpmv(
+					order, uplo, trans, diag, N, AP, offa, X,
+					offx, incx, scratchBuff,
+					numCommandQueues, commandQueues, numEventsInWaitList,
+					eventWaitList, events);
+
+			case TYPE_COMPLEX_FLOAT:
+				return clblasCtpmv(
+					order, uplo, trans, diag, N, AP, offa, X,
+					offx, incx, scratchBuff,
+					numCommandQueues, commandQueues, numEventsInWaitList,
+					eventWaitList, events);
+
+			case TYPE_COMPLEX_DOUBLE:
+				return clblasZtpmv(
+					order, uplo, trans, diag, N, AP, offa, X,
+					offx, incx, scratchBuff,
+					numCommandQueues, commandQueues, numEventsInWaitList,
+					eventWaitList, events);
+
+			default:  // any DataType value not listed above
+				return 	clblasInvalidValue;  // no clBLAS routine is called on this path
+		}
+	}
+
+
+clblasStatus
+clMath::clblas::spmv(  // cl_float overload: thin pass-through to clblasSspmv
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_float alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_float beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret;
+
+    ret =  clblasSspmv(order, uplo, N, alpha, AP, offa, X, offx, incx,  // every parameter is forwarded unchanged, in declaration order
+                          beta, Y, offy, incy, numCommandQueues,
+                          commandQueues, numEventsInWaitList, eventWaitList, events);
+
+    return ret;  // status from clblasSspmv, returned as-is
+}
+
+clblasStatus
+clMath::clblas::spmv(  // cl_double overload: thin pass-through to clblasDspmv
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    cl_double alpha,
+    const cl_mem AP,
+    size_t offa,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_double beta,
+    cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    clblasStatus ret;
+
+    ret = clblasDspmv(order, uplo, N, alpha, AP, offa, X, offx, incx, beta,  // every parameter is forwarded unchanged, in declaration order
+                         Y, offy, incy, numCommandQueues, commandQueues,
+                         numEventsInWaitList, eventWaitList, events);
+    return ret;  // status from clblasDspmv, returned as-is
+}
+
+clblasStatus
+clMath::clblas::hpmv(  // FloatComplex overload: thin pass-through to clblasChpmv
+      clblasOrder order,
+      clblasUplo uplo,
+      size_t N,
+      FloatComplex alpha,
+      const cl_mem AP,
+      size_t offa,
+      const cl_mem X,
+      size_t offx,
+      int incx,
+      FloatComplex beta,
+      cl_mem Y,
+      size_t offy,
+      int incy,
+      cl_uint numCommandQueues,
+      cl_command_queue *commandQueues,
+      cl_uint numEventsInWaitList,
+      const cl_event *eventWaitList,
+      cl_event *events)
+{
+
+        return clblasChpmv(order, uplo, N, alpha, AP, offa,  // parameters forwarded unchanged; callee's status is the result
+                                    X, offx, incx, beta, Y, offy, incy,
+                                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+clblasStatus
+clMath::clblas::hpmv(  // DoubleComplex overload: thin pass-through to clblasZhpmv
+      clblasOrder order,
+      clblasUplo uplo,
+      size_t N,
+      DoubleComplex alpha,
+      const cl_mem AP,
+      size_t offa,
+      const cl_mem X,
+      size_t offx,
+      int incx,
+      DoubleComplex beta,
+      cl_mem Y,
+      size_t offy,
+      int incy,
+      cl_uint numCommandQueues,
+      cl_command_queue *commandQueues,
+      cl_uint numEventsInWaitList,
+      const cl_event *eventWaitList,
+      cl_event *events)
+{
+
+        return clblasZhpmv(order, uplo, N, alpha, AP, offa,  // parameters forwarded unchanged; callee's status is the result
+                                    X, offx, incx, beta, Y, offy, incy,
+                                    numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events );
+}
+
+
+
+clblasStatus
+clMath::clblas::spr(  // float overload: thin pass-through to clblasSspr
+	clblasOrder order,
+	clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+	cl_uint numCommandQueues,
+	cl_command_queue *commandQueues,
+	cl_uint numEventsInWaitList,
+	const cl_event *eventWaitList,
+	cl_event *events)
+	{
+		return clblasSspr( order, uplo, N, alpha, X, offx, incx, AP, offa,  // parameters forwarded unchanged; callee's status is the result
+								numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+	}
+
+clblasStatus
+clMath::clblas::spr(  // double overload: thin pass-through to clblasDspr
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasDspr( order, uplo, N, alpha, X, offx, incx, AP, offa,  // parameters forwarded unchanged; callee's status is the result
+                              numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::hpr(  // overload selected by a float alpha: thin pass-through to clblasChpr
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,  // plain float here even though the callee is the "C" routine
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasChpr( order, uplo, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, AP, offa,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::hpr(  // overload selected by a double alpha: thin pass-through to clblasZhpr
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,  // plain double here even though the callee is the "Z" routine
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZhpr( order, uplo, N, alpha,  // parameters forwarded unchanged; callee's status is the result
+                                X, offx, incx, AP, offa,
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+
+clblasStatus
+clMath::clblas::spr2(  // float overload: thin pass-through to clblasSspr2
+	clblasOrder order,
+	clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+	cl_uint numCommandQueues,
+	cl_command_queue *commandQueues,
+	cl_uint numEventsInWaitList,
+	const cl_event *eventWaitList,
+	cl_event *events)
+	{
+		return clblasSspr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa,  // parameters forwarded unchanged; callee's status is the result
+								numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+	}
+
+clblasStatus
+clMath::clblas::spr2(  // double overload: thin pass-through to clblasDspr2
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+	const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasDspr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa,  // parameters forwarded unchanged; callee's status is the result
+                              numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::hpr2(  // FloatComplex overload: thin pass-through to clblasChpr2
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    FloatComplex alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasChpr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa,  // parameters forwarded unchanged; callee's status is the result
+                                numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+clblasStatus
+clMath::clblas::hpr2(  // DoubleComplex overload: thin pass-through to clblasZhpr2
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    DoubleComplex alpha,
+    const cl_mem X,
+    size_t offx,
+    int incx,
+    const cl_mem Y,
+    size_t offy,
+    int incy,
+    cl_mem AP,
+    size_t offa,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+    {
+        return clblasZhpr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa,  // parameters forwarded unchanged; callee's status is the result
+                              numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+    }
+
+// Overload for cl_float `alpha`/`beta`: forwards every argument, unchanged and
+// in order, to clblasSgbmv() and returns the status that call reports.
+clblasStatus
+clMath::clblas::gbmv(
+    clblasOrder order, clblasTranspose trans,
+    size_t M, size_t N, size_t KL, size_t KU,
+    cl_float alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    cl_float beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasSgbmv(
+        order, trans,
+        M, N, KL, KU,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Overload for cl_double `alpha`/`beta`: forwards every argument, unchanged
+// and in order, to clblasDgbmv() and returns the status that call reports.
+clblasStatus
+clMath::clblas::gbmv(
+    clblasOrder order, clblasTranspose trans,
+    size_t M, size_t N, size_t KL, size_t KU,
+    cl_double alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    cl_double beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasDgbmv(
+        order, trans,
+        M, N, KL, KU,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Overload for cl_float2 `alpha`/`beta`: forwards every argument, unchanged
+// and in order, to clblasCgbmv() and returns the status that call reports.
+clblasStatus
+clMath::clblas::gbmv(
+    clblasOrder order, clblasTranspose trans,
+    size_t M, size_t N, size_t KL, size_t KU,
+    cl_float2 alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    cl_float2 beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasCgbmv(
+        order, trans,
+        M, N, KL, KU,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Overload for cl_double2 `alpha`/`beta`: forwards every argument, unchanged
+// and in order, to clblasZgbmv() and returns the status that call reports.
+clblasStatus
+clMath::clblas::gbmv(
+    clblasOrder order, clblasTranspose trans,
+    size_t M, size_t N, size_t KL, size_t KU,
+    cl_double2 alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    cl_double2 beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasZgbmv(
+        order, trans,
+        M, N, KL, KU,
+        alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Type-dispatched wrapper: picks clblasStbmv / clblasDtbmv / clblasCtbmv /
+// clblasZtbmv according to `type` and forwards the remaining arguments to it
+// unchanged. Any other `type` yields clblasInvalidValue without a library call.
+clblasStatus
+clMath::clblas::tbmv(
+    DataType type,
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    const cl_mem A, size_t offa, size_t lda,
+    cl_mem X, size_t offx, int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasStbmv(
+            order, uplo, trans, diag,
+            N, K, A, offa, lda,
+            X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDtbmv(
+            order, uplo, trans, diag,
+            N, K, A, offa, lda,
+            X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasCtbmv(
+            order, uplo, trans, diag,
+            N, K, A, offa, lda,
+            X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasZtbmv(
+            order, uplo, trans, diag,
+            N, K, A, offa, lda,
+            X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    return clblasInvalidValue;
+}
+
+//SBMV
+
+// Overload for cl_float `alpha`/`beta`: forwards every argument, unchanged and
+// in order, to clblasSsbmv() and returns the status that call reports.
+clblasStatus
+clMath::clblas::sbmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N, size_t K,
+    cl_float alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    cl_float beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasSsbmv(
+        order, uplo, N, K, alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Overload for cl_double `alpha`/`beta`: forwards every argument, unchanged
+// and in order, to clblasDsbmv() and returns the status that call reports.
+clblasStatus
+clMath::clblas::sbmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N, size_t K,  // size argument is called N, as in the cl_float overload
+    cl_double alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    cl_double beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasDsbmv(
+        order, uplo, N, K, alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+
+//HBMV
+
+// Overload for cl_float2 `alpha`/`beta`: forwards every argument, unchanged
+// and in order, to clblasChbmv() and returns the status that call reports.
+clblasStatus
+clMath::clblas::hbmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N, size_t K,
+    cl_float2 alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    cl_float2 beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasChbmv(
+        order, uplo, N, K, alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Overload for cl_double2 `alpha`/`beta`: forwards every argument, unchanged
+// and in order, to clblasZhbmv() and returns the status that call reports.
+clblasStatus
+clMath::clblas::hbmv(
+    clblasOrder order, clblasUplo uplo,
+    size_t N, size_t K,
+    cl_double2 alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem X, size_t offx, int incx,
+    cl_double2 beta,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasZhbmv(
+        order, uplo, N, K, alpha,
+        A, offa, lda,
+        X, offx, incx,
+        beta,
+        Y, offy, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+//TBSV
+
+// Type-dispatched wrapper: picks clblasStbsv / clblasDtbsv / clblasCtbsv /
+// clblasZtbsv according to `type` and forwards the remaining arguments to it
+// unchanged. This wrapper takes no scratch buffer. Any other `type` yields
+// clblasInvalidValue without a library call.
+clblasStatus
+clMath::clblas::tbsv(
+    DataType type,
+    clblasOrder order, clblasUplo uplo,
+    clblasTranspose trans, clblasDiag diag,
+    size_t N, size_t K,
+    const cl_mem A, size_t offa, size_t lda,
+    cl_mem X, size_t offx, int incx,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasStbsv(
+            order, uplo, trans, diag,
+            N, K, A, offa, lda,
+            X, offx, incx,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDtbsv(
+            order, uplo, trans, diag,
+            N, K, A, offa, lda,
+            X, offx, incx,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasCtbsv(
+            order, uplo, trans, diag,
+            N, K, A, offa, lda,
+            X, offx, incx,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasZtbsv(
+            order, uplo, trans, diag,
+            N, K, A, offa, lda,
+            X, offx, incx,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    return clblasInvalidValue;
+}
+
+
+// Overload with cl_float2 `alpha` and cl_float `beta`: forwards every argument,
+// unchanged and in order, to clblasCher2k() and returns its status directly.
+clblasStatus
+clMath::clblas::her2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N, size_t K,
+    cl_float2 alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem B, size_t offb, size_t ldb,
+    cl_float beta,
+    cl_mem C, size_t offc, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasCher2k(
+        order, uplo, transA,
+        N, K,
+        alpha,
+        A, offa, lda,
+        B, offb, ldb,
+        beta,
+        C, offc, ldc,
+        numCommandQueues,
+        commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+// Overload with cl_double2 `alpha` and cl_double `beta`: forwards every
+// argument, unchanged and in order, to clblasZher2k() and returns its status.
+clblasStatus
+clMath::clblas::her2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N, size_t K,
+    cl_double2 alpha,
+    const cl_mem A, size_t offa, size_t lda,
+    const cl_mem B, size_t offb, size_t ldb,
+    cl_double beta,
+    cl_mem C, size_t offc, size_t ldc,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    return clblasZher2k(
+        order, uplo, transA,
+        N, K,
+        alpha,
+        A, offa, lda,
+        B, offb, ldb,
+        beta,
+        C, offc, ldc,
+        numCommandQueues,
+        commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+}
+
+// Type-dispatched wrapper: picks clblasSswap / clblasDswap / clblasCswap /
+// clblasZswap according to `type` and forwards the remaining arguments to it
+// unchanged, returning the status of that call.
+clblasStatus
+clMath::clblas::swap(
+    DataType type,
+    size_t N,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasSswap(
+            N, X, offx, incx, Y, offy, incy,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDswap(
+            N, X, offx, incx, Y, offy, incy,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasCswap(
+            N, X, offx, incx, Y, offy, incy,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasZswap(
+            N, X, offx, incx, Y, offy, incy,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    // No library entry point is selected for any other `type` value.
+    return clblasInvalidValue;
+}
+
+// Type-dispatched wrapper: picks clblasScopy / clblasDcopy / clblasCcopy /
+// clblasZcopy according to `type` and forwards the remaining arguments to it
+// unchanged, returning the status of that call. Any other `type` yields
+// clblasInvalidValue.
+clblasStatus
+clMath::clblas::copy(
+    DataType type,
+    size_t N,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasScopy(
+            N, X, offx, incx,
+            Y, offy, incy,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDcopy(
+            N, X, offx, incx,
+            Y, offy, incy,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasCcopy(
+            N, X, offx, incx,
+            Y, offy, incy,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasZcopy(
+            N, X, offx, incx,
+            Y, offy, incy,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    // No library entry point is selected for any other `type` value.
+    return clblasInvalidValue;
+}
+
+
+// scal, csscal & zdscal wrappers
+// cl_float overload of the scal wrapper: forwards N, alpha, X, offx, incx and
+// the queue/event arguments to clblasSscal() and returns its status.
+// `is_css_zds` is not consulted by this overload.
+clblasStatus
+clMath::clblas::scal(
+        bool is_css_zds,
+        size_t N, cl_float alpha,
+        cl_mem X, size_t offx, int incx,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events)
+{
+    (void)is_css_zds;   // unused here; the cast avoids an unused-parameter warning
+    return clblasSscal(N, alpha, X, offx, incx, numCommandQueues,
+                        commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+// cl_double overload of the scal wrapper: forwards N, alpha, X, offx, incx and
+// the queue/event arguments to clblasDscal() and returns its status.
+// `is_css_zds` is not consulted by this overload.
+clblasStatus
+clMath::clblas::scal(
+        bool is_css_zds,
+        size_t N, cl_double alpha,
+        cl_mem X, size_t offx, int incx,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events)
+{
+    (void)is_css_zds;   // unused here; the cast avoids an unused-parameter warning
+    return clblasDscal(N, alpha, X, offx, incx, numCommandQueues,
+                        commandQueues, numEventsInWaitList, eventWaitList, events);
+}
+
+// FloatComplex overload of the scal wrapper. `is_css_zds` selects the entry
+// point: clblasCsscal() when true, clblasCscal() when false.
+clblasStatus
+clMath::clblas::scal(
+    bool is_css_zds,
+    size_t N, FloatComplex alpha,
+    cl_mem X, size_t offx, int incx,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (!is_css_zds) {
+        return clblasCscal(N, alpha, X, offx, incx,
+                           numCommandQueues, commandQueues,
+                           numEventsInWaitList, eventWaitList, events);
+    }
+
+    // The csscal path receives CREAL(alpha) rather than alpha itself.
+    return clblasCsscal(N, CREAL(alpha), X, offx, incx,
+                        numCommandQueues, commandQueues,
+                        numEventsInWaitList, eventWaitList, events);
+}
+
+// DoubleComplex overload of the scal wrapper. `is_css_zds` true: clblasZdscal()
+// is called with CREAL(alpha); false: clblasZscal() is called with alpha.
+clblasStatus
+clMath::clblas::scal(
+    bool is_css_zds,
+    size_t N, DoubleComplex alpha,
+    cl_mem X, size_t offx, int incx,
+    cl_uint numCommandQueues, cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (!is_css_zds) {
+        return clblasZscal(N, alpha, X, offx, incx,
+                           numCommandQueues, commandQueues,
+                           numEventsInWaitList, eventWaitList, events);
+    }
+
+    return clblasZdscal(N, CREAL(alpha), X, offx, incx,
+                        numCommandQueues, commandQueues,
+                        numEventsInWaitList, eventWaitList, events);
+}
+
+// DOT
+// Type-dispatched wrapper: TYPE_FLOAT -> clblasSdot, TYPE_DOUBLE -> clblasDdot,
+// TYPE_COMPLEX_FLOAT -> clblasCdotu, TYPE_COMPLEX_DOUBLE -> clblasZdotu. The
+// remaining arguments are forwarded unchanged; other types give clblasInvalidValue.
+clblasStatus
+clMath::clblas::dot(
+    DataType type,
+    size_t N,
+    cl_mem dotProduct, size_t offDP,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasSdot(
+            N, dotProduct, offDP,
+            X, offx, incx, Y, offy, incy, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDdot(
+            N, dotProduct, offDP,
+            X, offx, incx, Y, offy, incy, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasCdotu(
+            N, dotProduct, offDP,
+            X, offx, incx, Y, offy, incy, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasZdotu(
+            N, dotProduct, offDP,
+            X, offx, incx, Y, offy, incy, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    return clblasInvalidValue;
+}
+
+
+//ASUM
+
+// Type-dispatched wrapper: calls clblasSasum / clblasDasum / clblasScasum /
+// clblasDzasum according to `type`; other types give clblasInvalidValue.
+clblasStatus
+clMath::clblas::asum(
+    DataType type, size_t N,
+    cl_mem asum, size_t offAsum,
+    cl_mem X, size_t offx, int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasSasum(
+            N, asum, offAsum, X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDasum(
+            N, asum, offAsum, X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasScasum(
+            N, asum, offAsum, X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasDzasum(
+            N, asum, offAsum, X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    return clblasInvalidValue;
+}
+
+//DOTC
+// Type-dispatched wrapper with two supported cases: TYPE_COMPLEX_FLOAT calls
+// clblasCdotc and TYPE_COMPLEX_DOUBLE calls clblasZdotc, each receiving the
+// remaining arguments unchanged.
+clblasStatus
+clMath::clblas::dotc(
+    DataType type,
+    size_t N,
+    cl_mem dotProduct,
+    size_t offDP,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasCdotc(
+            N, dotProduct, offDP,
+            X, offx, incx,
+            Y, offy, incy, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasZdotc(
+            N, dotProduct, offDP,
+            X, offx, incx,
+            Y, offy, incy, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    // Every other `type` value (including the real types) is rejected.
+    return clblasInvalidValue;
+}
+
+
+
+//axpy calls
+// Overload taking a cl_float `alpha`: forwards every argument, unchanged and
+// in order, to clblasSaxpy() and returns the status that call reports.
+clblasStatus
+clMath::clblas::axpy(
+    size_t N, cl_float alpha,
+    cl_mem X, size_t offBX, int incx,
+    cl_mem Y, size_t offCY, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasSaxpy(
+        N, alpha, X, offBX, incx, Y, offCY, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Overload taking a cl_double `alpha`: forwards every argument, unchanged and
+// in order, to clblasDaxpy() and returns the status that call reports.
+clblasStatus
+clMath::clblas::axpy(
+    size_t N, cl_double alpha,
+    cl_mem X, size_t offBX, int incx,
+    cl_mem Y, size_t offCY, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasDaxpy(
+        N, alpha, X, offBX, incx, Y, offCY, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+
+    return status;
+}
+
+// Overload taking a FloatComplex `alpha`: forwards every argument, unchanged
+// and in order, to clblasCaxpy() and returns the status that call reports.
+clblasStatus
+clMath::clblas::axpy(
+    size_t N, FloatComplex alpha,
+    cl_mem X, size_t offBX, int incx,
+    cl_mem Y, size_t offCY, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasCaxpy(
+        N, alpha, X, offBX, incx, Y, offCY, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+
+    return status;
+}
+
+// Overload taking a DoubleComplex `alpha`: forwards every argument, unchanged
+// and in order, to clblasZaxpy() and returns the status that call reports.
+clblasStatus
+clMath::clblas::axpy(
+    size_t N, DoubleComplex alpha,
+    cl_mem X, size_t offBX, int incx,
+    cl_mem Y, size_t offCY, int incy,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasZaxpy(
+        N, alpha, X, offBX, incx, Y, offCY, incy,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+
+    return status;
+}
+
+// Type-dispatched wrapper: picks clblasSrotg / clblasDrotg / clblasCrotg /
+// clblasZrotg according to `type`. The four buffer/offset pairs and the
+// queue/event arguments are forwarded unchanged, and the status of the
+// selected call is returned.
+clblasStatus
+clMath::clblas::rotg(
+    DataType type,
+    cl_mem SA, size_t offSA,
+    cl_mem SB, size_t offSB,
+    cl_mem C, size_t offC,
+    cl_mem S, size_t offS,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasSrotg(
+            SA, offSA, SB, offSB, C, offC, S, offS,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDrotg(
+            SA, offSA, SB, offSB, C, offC, S, offS,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasCrotg(
+            SA, offSA, SB, offSB, C, offC, S, offS,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasZrotg(
+            SA, offSA, SB, offSB, C, offC, S, offS,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    // No library entry point is selected for any other `type` value.
+    return clblasInvalidValue;
+}
+
+// Type-dispatched wrapper with two supported cases: TYPE_FLOAT calls
+// clblasSrotm and TYPE_DOUBLE calls clblasDrotm, each receiving the remaining
+// arguments unchanged.
+clblasStatus
+clMath::clblas::rotm(
+    DataType type,
+    size_t N,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    cl_mem PARAM, size_t offParam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasSrotm(
+            N, X, offx, incx,
+            Y, offy, incy,
+            PARAM, offParam,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDrotm(
+            N, X, offx, incx,
+            Y, offy, incy,
+            PARAM, offParam,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    // Every other `type` value (including the complex types) is rejected.
+    return clblasInvalidValue;
+}
+
+
+// Type-dispatched wrapper with two supported cases: TYPE_FLOAT calls
+// clblasSrotmg and TYPE_DOUBLE calls clblasDrotmg, each receiving the
+// remaining arguments unchanged.
+clblasStatus
+clMath::clblas::rotmg(
+    DataType type,
+    cl_mem D1, size_t offD1,
+    cl_mem D2, size_t offD2,
+    cl_mem X1, size_t offX1,
+    cl_mem Y1, size_t offY1,
+    cl_mem PARAM, size_t offParam,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasSrotmg(
+            D1, offD1, D2, offD2,
+            X1, offX1, Y1, offY1,
+            PARAM, offParam,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDrotmg(
+            D1, offD1, D2, offD2,
+            X1, offX1, Y1, offY1,
+            PARAM, offParam,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    // Every other `type` value (including the complex types) is rejected.
+    return clblasInvalidValue;
+}
+
+//ROT
+// Overload taking float `C` and `S`: forwards every argument, unchanged and in
+// order, to clblasSrot() and returns the status that call reports.
+clblasStatus
+clMath::clblas::rot(
+    size_t N,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    float C, float S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasSrot(
+        N, X, offx, incx,
+        Y, offy, incy,
+        C, S,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+
+
+// Overload taking double `C` and `S`: forwards every argument, unchanged and
+// in order, to clblasDrot() and returns the status that call reports.
+clblasStatus
+clMath::clblas::rot(
+    size_t N,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    double C, double S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasDrot(
+        N, X, offx, incx,
+        Y, offy, incy,
+        C, S,
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Overload taking FloatComplex `C` and `S`: calls clblasCsrot(), passing
+// CREAL(C) and CREAL(S) in place of C and S; all other arguments are unchanged.
+clblasStatus
+clMath::clblas::rot(
+    size_t N,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    FloatComplex C, FloatComplex S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasCsrot(
+        N, X, offx, incx,
+        Y, offy, incy,
+        CREAL(C), CREAL(S),
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+// Overload taking DoubleComplex `C` and `S`: calls clblasZdrot(), passing
+// CREAL(C) and CREAL(S) in place of C and S; all other arguments are unchanged.
+clblasStatus
+clMath::clblas::rot(
+    size_t N,
+    cl_mem X, size_t offx, int incx,
+    cl_mem Y, size_t offy, int incy,
+    DoubleComplex C, DoubleComplex S,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    const clblasStatus status = clblasZdrot(
+        N, X, offx, incx,
+        Y, offy, incy,
+        CREAL(C), CREAL(S),
+        numCommandQueues, commandQueues,
+        numEventsInWaitList, eventWaitList, events);
+    return status;
+}
+
+
+// iAMAX
+// Type-dispatched wrapper: calls clblasiSamax / clblasiDamax / clblasiCamax /
+// clblasiZamax according to `type`; other types give clblasInvalidValue.
+clblasStatus
+clMath::clblas::iamax(
+    DataType type, size_t N,
+    cl_mem iMax, size_t offiMax,
+    cl_mem X, size_t offx, int incx, cl_mem scratchBuf,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasiSamax(
+            N, iMax, offiMax, X, offx, incx, scratchBuf,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasiDamax(
+            N, iMax, offiMax, X, offx, incx, scratchBuf,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasiCamax(
+            N, iMax, offiMax, X, offx, incx, scratchBuf,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasiZamax(
+            N, iMax, offiMax, X, offx, incx, scratchBuf,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    return clblasInvalidValue;
+}
+
+
+// Type-dispatched wrapper: calls clblasSnrm2 / clblasDnrm2 / clblasScnrm2 /
+// clblasDznrm2 according to `type`; other types give clblasInvalidValue.
+clblasStatus
+clMath::clblas::nrm2(
+    DataType type, size_t N,
+    cl_mem NRM2, size_t offNRM2,
+    cl_mem X, size_t offx, int incx,
+    cl_mem scratchBuff,
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *events)
+{
+    if (type == TYPE_FLOAT) {
+        return clblasSnrm2(
+            N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_DOUBLE) {
+        return clblasDnrm2(
+            N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_FLOAT) {
+        return clblasScnrm2(
+            N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    if (type == TYPE_COMPLEX_DOUBLE) {
+        return clblasDznrm2(
+            N, NRM2, offNRM2, X, offx, incx, scratchBuff,
+            numCommandQueues, commandQueues,
+            numEventsInWaitList, eventWaitList, events);
+    }
+
+    return clblasInvalidValue;
+}
diff --git a/src/tests/cmdline.c b/src/tests/cmdline.c
new file mode 100644
index 0000000..259a9f2
--- /dev/null
+++ b/src/tests/cmdline.c
@@ -0,0 +1,248 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <string.h>         /* strcmp */
+#include <stdlib.h>         /* atoi, strtol */
+#include <stdio.h>          /* printf */
+
+#include <cmdline.h>
+
+/*
+ * Usage text printed by printUsage() right after the application name.
+ * NOTE(review): the synopsis advertises "--use-images", but the option table
+ * in this file has no "use-images" entry, so the parser would skip the option
+ * name and treat its value as a positional size -- confirm whether the
+ * option is meant to be handled elsewhere.
+ */
+static const char *testUsage =
+    "<M N K> [--seed s] [--alpha a] [--beta b] "
+    "[--alpha-real a] [--beta-real b] [--alpha-imag a] [--beta-imag b] "
+    "[--use-images f] [--device dev] [--queues n]\n"
+    "\n"
+    "seed - seed for the random number generator"
+    "\n"
+    "alpha - alpha multiplier"
+    "\n"
+    "beta - beta multiplier"
+    "\n"
+    "alpha-real - alpha multiplier real part"
+    "\n"
+    "beta-real - beta multiplier real part"
+    "\n"
+    "alpha-imag - alpha-multiplier imaginary part"
+    "\n"
+    "beta-imag - beta-multiplier imaginary part"
+    "\n"
+    "use-images - allow the library to use images for computing"
+    "\n"
+    "device - device to run the test on, 'cpu' or 'gpu'(default)"
+    "\n"
+    "queues - number of command queues to use"
+    "\n"
+    "Parameters defined through the command line are kept over the whole "
+    "set of custom test cases. The use-images parameter value is ignored if "
+    "the target device is CPU\n\n";
+
+/* Argument bundle handed to an option setter. */
+typedef struct SetterArg {
+    TestParams *params;     /* test parameters the setter writes into */
+    const char *arg;        /* raw command-line value following the option */
+    long extra;             /* copy of the matching CmdLineOpt.setterExtra */
+} SetterArg;
+
+/* Description of one "--name value" command-line option. */
+typedef struct CmdLineOpt {
+    const char *name;           /* compared with the text after the leading "--" */
+    unsigned int flagToSet;     /* OR-ed into params->optFlags once the setter has run */
+    int (*setter)(SetterArg*);  /* stores the value; its result is returned by the parser */
+    long setterExtra;           /* passed to the setter through SetterArg.extra */
+} CmdLineOpt;
+
+/*
+ * Bit flags handed to setMult() through CmdLineOpt.setterExtra: which
+ * multiplier to modify and which of its components receives the value.
+ */
+enum {
+    MULT_ALPHA = 0x01,
+    MULT_BETA = 0x02,
+    MULT_REAL_ONLY = 0x04,
+    MULT_IMAG_ONLY = 0x08
+};
+
+/*
+ * Walk argv[1] .. argv[argc - 1] and fill 'params'.
+ *
+ *  - An argument not starting with '-' is a positional size. The first three
+ *    of a consecutive run are stored (via atoi) as M, N and K and the matching
+ *    SET_M / SET_N / SET_K flag is raised; further ones in the run are ignored.
+ *  - An argument starting with a single '-' is skipped (it may belong to the
+ *    test framework in use). Any '-' argument restarts the positional run.
+ *  - An argument starting with "--" is looked up by name in 'opts'. On a match
+ *    the next argument is consumed as its value, the setter is called and the
+ *    option's flag is OR-ed into params->optFlags. Unknown names are skipped.
+ *
+ * Returns 0 on success, -1 if a known option is the last argument and so has
+ * no value, or the non-zero result of a failing setter. Parsing stops at the
+ * first error.
+ */
+static int
+doParseCmdLine(
+    int argc,
+    char *argv[],
+    const CmdLineOpt *opts,
+    unsigned int nrOpts,
+    TestParams *params)
+{
+    int i = 1;
+    int sizeIdx = 0;
+    int ret = 0;
+    SetterArg sarg = {params, NULL, 0};
+
+    /*
+     * A pre-checked loop is required: when the program is started without
+     * arguments argv[1] is NULL and must not be dereferenced.
+     */
+    while ((i < argc) && !ret) {
+        const char *currArg = (const char*)argv[i];
+        const CmdLineOpt *currOpt;
+
+        i++;
+
+        if (currArg[0] != '-') {
+            /* one of the size arguments */
+            switch (sizeIdx) {
+            case 0:
+                params->M = atoi(currArg);
+                params->optFlags |= SET_M;
+                break;
+            case 1:
+                params->N = atoi(currArg);
+                params->optFlags |= SET_N;
+                break;
+            case 2:
+                params->K = atoi(currArg);
+                params->optFlags |= SET_K;
+                break;
+            }
+            sizeIdx++;
+            continue;
+        }
+
+        /* any dash argument ends the current run of sizes */
+        sizeIdx = 0;
+
+        if (currArg[1] != '-') {
+            /* it can be some parameter of a used test framework, skip it */
+            continue;
+        }
+
+        for (currOpt = opts; currOpt < opts + nrOpts; currOpt++) {
+            if (strcmp(currOpt->name, &currArg[2]) != 0) {
+                continue;
+            }
+
+            if (i == argc) {
+                printf("Error: parameter '%s' is not specified!\n",
+                       currOpt->name);
+                ret = -1;
+            }
+            else {
+                sarg.arg = argv[i++];
+                sarg.extra = currOpt->setterExtra;
+                ret = currOpt->setter(&sarg);
+                params->optFlags |= currOpt->flagToSet;
+            }
+            break;
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Setter for the "seed" option: stores atoi() of the option value in
+ * params->seed. Always returns 0.
+ */
+static int
+setSeed(SetterArg *sarg)
+{
+    TestParams *target = sarg->params;
+
+    target->seed = atoi(sarg->arg);
+    return 0;
+}
+
+/*
+ * Setter shared by all alpha / beta options.
+ *
+ * sarg->extra carries MULT_* flags: MULT_BETA selects params->beta (otherwise
+ * params->alpha is used); MULT_REAL_ONLY stores the value in the real part
+ * only, MULT_IMAG_ONLY in the imaginary part only, neither flag in both.
+ *
+ * The first option seen for a multiplier resets it to (0, 0) before storing.
+ * A later option for the same multiplier keeps the component it does not
+ * write, so "--alpha-real 2 --alpha-imag 3" yields (2, 3).
+ *
+ * Returns 0 on success, -1 if the value does not start with a decimal integer.
+ */
+static int
+setMult(SetterArg *sarg)
+{
+    ComplexLong *mult;
+    unsigned int seenFlag;
+    long val;
+    char *end;
+    long flags = sarg->extra;
+
+    if (flags & MULT_BETA) {
+        mult = &sarg->params->beta;
+        seenFlag = SET_BETA;
+    }
+    else {
+        mult = &sarg->params->alpha;
+        seenFlag = SET_ALPHA;
+    }
+
+    val = strtol(sarg->arg, &end, 10);
+    if (end == sarg->arg) {
+        /* no digits were consumed at all */
+        printf("Error: '%s' is not a valid integer value!\n", sarg->arg);
+        return -1;
+    }
+
+    /*
+     * The parser raises SET_ALPHA / SET_BETA only after a setter has run, so
+     * a clear flag means no earlier option touched this multiplier yet.
+     */
+    if (!(sarg->params->optFlags & seenFlag)) {
+        mult->re = 0;
+        mult->imag = 0;
+    }
+
+    if (!(flags & MULT_IMAG_ONLY)) {
+        mult->re = val;
+    }
+    if (!(flags & MULT_REAL_ONLY)) {
+        mult->imag = val;
+    }
+
+    return 0;
+}
+
+/*
+ * Setter for the "device" option.
+ * "cpu" / "gpu" select CL_DEVICE_TYPE_CPU / CL_DEVICE_TYPE_GPU and clear
+ * params->devName; any other value is stored as params->devName and the
+ * device type is left untouched. Always returns 0.
+ */
+static int
+setDevice(SetterArg *sarg)
+{
+    TestParams *target = sarg->params;
+
+    if (strcmp(sarg->arg, "cpu") == 0) {
+        target->devType = CL_DEVICE_TYPE_CPU;
+        target->devName = NULL;
+    }
+    else if (strcmp(sarg->arg, "gpu") == 0) {
+        target->devType = CL_DEVICE_TYPE_GPU;
+        target->devName = NULL;
+    }
+    else {
+        target->devName = sarg->arg;
+    }
+
+    return 0;
+}
+
+/*
+ * Setter for the "queues" option: stores atoi() of the option value in
+ * params->numCommandQueues. Always returns 0.
+ */
+static int
+setNumCommandQueues(SetterArg *sarg)
+{
+    TestParams *target = sarg->params;
+
+    target->numCommandQueues = atoi(sarg->arg);
+    return 0;
+}
+
+/*
+ * Table of the "--name value" options understood by parseBlasCmdLineArgs().
+ * Columns: option name, flag OR-ed into params->optFlags, setter, and the
+ * extra value handed to the setter (MULT_* flags for setMult, 0 otherwise).
+ * "alpha" / "beta" use the same flags as "alpha-real" / "beta-real".
+ */
+static const CmdLineOpt opts[] = {
+    {"seed", SET_SEED, setSeed, 0},
+    {"alpha", SET_ALPHA, setMult, MULT_ALPHA | MULT_REAL_ONLY},
+    {"beta", SET_BETA, setMult, MULT_BETA | MULT_REAL_ONLY},
+    {"alpha-real", SET_ALPHA, setMult, MULT_ALPHA | MULT_REAL_ONLY},
+    {"alpha-imag", SET_ALPHA, setMult, MULT_ALPHA | MULT_IMAG_ONLY},
+    {"beta-real", SET_BETA, setMult, MULT_BETA | MULT_REAL_ONLY},
+    {"beta-imag", SET_BETA, setMult, MULT_BETA | MULT_IMAG_ONLY},
+    {"device", SET_DEVICE_TYPE, setDevice, 0},
+    {"queues", SET_NUM_COMMAND_QUEUES, setNumCommandQueues, 0},
+};
+/* Number of entries in opts[]. */
+static const unsigned int nrOpts = sizeof(opts) / sizeof(CmdLineOpt);
+
+/*
+ * Public entry point of the command-line parser: runs doParseCmdLine() with
+ * this file's option table and returns its result unchanged.
+ */
+int
+parseBlasCmdLineArgs(int argc, char *argv[], TestParams *params)
+{
+    const int status = doParseCmdLine(argc, argv, opts, nrOpts, params);
+
+    return status;
+}
+
+/* Prints "<appName> <usage text>" followed by a newline to stdout. */
+void
+printUsage(const char *appName)
+{
+    fprintf(stdout, "%s %s\n", appName, testUsage);
+}
+
+/*
+ * Initialises params->optFlags from the environment.
+ *
+ * optFlags is reset to NO_FLAGS; SET_USE_IMAGES is then added if at least one
+ * of the AMD_CLBLAS_{GEMM,TRMM,TRSM}_IMPLEMENTATION variables is set to
+ * exactly "1".
+ */
+void
+parseEnv(TestParams *params)
+{
+    static const char *const implVars[] = {
+        "AMD_CLBLAS_GEMM_IMPLEMENTATION",
+        "AMD_CLBLAS_TRMM_IMPLEMENTATION",
+        "AMD_CLBLAS_TRSM_IMPLEMENTATION"
+    };
+    size_t k;
+    int createImages = 0;
+
+    for (k = 0; k < sizeof(implVars) / sizeof(implVars[0]); k++) {
+        const char *value = getenv(implVars[k]);
+
+        if ((value != NULL) && (strcmp(value, "1") == 0)) {
+            createImages = 1;
+        }
+    }
+
+    params->optFlags = NO_FLAGS;
+    if (createImages) {
+        params->optFlags |= SET_USE_IMAGES;
+    }
+}
diff --git a/src/tests/common.cpp b/src/tests/common.cpp
new file mode 100644
index 0000000..759a588
--- /dev/null
+++ b/src/tests/common.cpp
@@ -0,0 +1,1011 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <iostream>
+#include <string.h>
+#include <clBLAS.h>
+
+#include <common.h>
+
+// Returns the context reported by clGetCommandQueueInfo(CL_QUEUE_CONTEXT) for
+// 'commandQueue'. The query's status is stored in *error when 'error' is not
+// NULL. The returned handle stays NULL unless the query writes it.
+cl_context
+getQueueContext(cl_command_queue commandQueue, cl_int *error)
+{
+    cl_context queueCtx = NULL;
+    const cl_int status = clGetCommandQueueInfo(
+        commandQueue, CL_QUEUE_CONTEXT, sizeof(queueCtx), &queueCtx, NULL);
+
+    if (error != NULL) {
+        *error = status;
+    }
+
+    return queueCtx;
+}
+
+// Waits for all queues to finish and releases the per-queue events.
+//
+// For every i < numCommandQueues, clFinish(commandQueues[i]) is called. When
+// 'events' is not NULL and events[i] is not NULL, the event's execution status
+// is queried (only if clFinish succeeded) and the event is then released.
+// The events[i] slots themselves are not cleared.
+//
+// Returns CL_SUCCESS, or the first error seen across the queues: a failing
+// clFinish / clGetEventInfo result, or the negative execution status of an
+// event whose command terminated abnormally.
+cl_int
+waitForSuccessfulFinish(
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_event *events)
+{
+    cl_int err = CL_SUCCESS;
+    cl_uint i;
+
+    for (i = 0; i < numCommandQueues; i++) {
+        cl_int e = clFinish(commandQueues[i]);
+
+        if ((events != NULL) && (events[i] != NULL)) {
+            if (e == CL_SUCCESS) {
+                cl_int status = CL_COMPLETE;
+
+                e = clGetEventInfo(events[i], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                   sizeof(status), &status, NULL);
+                // A negative execution status already is the OpenCL error
+                // code; report it as-is rather than negated, so callers
+                // receive a real CL_* error value.
+                if ((e == CL_SUCCESS) && (status < 0)) {
+                    e = status;
+                }
+            }
+            clReleaseEvent(events[i]);
+        }
+
+        // keep only the first failure
+        if (err == CL_SUCCESS) {
+            err = e;
+        }
+    }
+
+    return err;
+}
+
+// Calls clFlush() on each of the first 'numCommandQueues' queues in order.
+// Stops at, and returns, the first status that is not CL_SUCCESS; returns
+// CL_SUCCESS when every flush succeeded (or when there are no queues).
+cl_int
+flushAll(
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues)
+{
+    for (cl_uint q = 0; q < numCommandQueues; ++q) {
+        const cl_int status = clFlush(commandQueues[q]);
+
+        if (status != CL_SUCCESS) {
+            return status;
+        }
+    }
+
+    return CL_SUCCESS;
+}
+
+// Writes the parameters of a test case with three matrices (A, B, C) to
+// stderr: order and both transpose modes, the M/N/K sizes, the offsets and
+// the leading dimensions. alpha / beta are printed only when useAlpha /
+// useBeta is set.
+void
+printTestParams(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    size_t offB,
+    size_t ldb,
+    bool useBeta,
+    ComplexLong beta,
+    size_t offC,
+    size_t ldc)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << transStr(transA) << ", "
+        << transStr(transB) << ::std::endl;
+    out << "M = " << M << ", N = " << N << ", K = " << K << ::std::endl;
+    out << "offA = " << offA << ", offB = " << offB
+        << ", offC = " << offC << ::std::endl;
+    out << "lda = " << lda << ", ldb = " << ldb
+        << ", ldc = " << ldc << ::std::endl;
+
+    if (useAlpha) {
+        out << "alpha = (" << alpha.re << "," << alpha.imag << ")"
+            << ::std::endl;
+    }
+    if (useBeta) {
+        out << "beta = (" << beta.re << "," << beta.imag << ")"
+            << ::std::endl;
+    }
+}
+
+// Writes the parameters of a test case taking side/uplo/trans/diag selectors
+// and two matrices (A, B) to stderr: the selectors, the M/N sizes, the
+// offsets and the leading dimensions. alpha is printed only when useAlpha is
+// set.
+void
+printTestParams(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    size_t offB,
+    size_t ldb)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << sideStr(side) << ", "
+        << uploStr(uplo) << ", " << transStr(transA) << ", "
+        << diagStr(diag) << ::std::endl;
+    out << "M = " << M << ", N = " << N << ::std::endl;
+    out << "offA = " << offA << ", offB = " << offB << ::std::endl;
+    out << "lda = " << lda << ", ldb = " << ldb << ::std::endl;
+
+    if (useAlpha) {
+        out << "alpha = (" << alpha.re << "," << alpha.imag << ")"
+            << ::std::endl;
+    }
+}
+
+//SYR
+// Writes the test parameters to stderr: order and uplo, N with the x
+// offset/increment, the A offset (followed by lda on the same line when lda
+// is non-zero) and the real alpha.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    size_t offx,
+    int incx,
+    size_t offa,
+    size_t lda)
+{
+    ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
+    ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl;
+    // End the line only after the optional lda, so that ", lda = ..." does
+    // not start a new line with a dangling comma.
+    ::std::cerr << "offa = " << offa;
+    if (lda) {
+        ::std::cerr << ", lda = " << lda;
+    }
+    ::std::cerr << ::std::endl;
+    ::std::cerr << "alpha = " << alpha << ::std::endl;
+}
+
+//SPR
+// Writes the test parameters to stderr: order and uplo, N with the x
+// offset/increment, the A offset and the real alpha.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    size_t offx,
+    int incx,
+    size_t offa)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
+    out << "N = " << N << ", offx = " << offx
+        << ", incx = " << incx << ::std::endl;
+    out << "offa = " << offa << ::std::endl;
+    out << "alpha = " << alpha << ::std::endl;
+}
+
+//SYR2
+// Writes the test parameters to stderr: order and uplo, N with the x and y
+// offsets/increments, the A offset (followed by lda on the same line when
+// lda is non-zero) and the real alpha.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo  uplo,
+    size_t N,
+    double alpha,
+    size_t offx,
+    int incx,
+    size_t offy,
+    int incy,
+    size_t offa,
+    size_t lda)
+{
+    ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
+    ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl;
+    ::std::cerr << "offy = " << offy << ", incy = " << incy << ::std::endl;
+    // End the line only after the optional lda, so that ", lda = ..." does
+    // not start a new line with a dangling comma.
+    ::std::cerr << "offa = " << offa;
+    if (lda) {
+        ::std::cerr << ", lda = " << lda;
+    }
+    ::std::cerr << ::std::endl;
+    ::std::cerr << "alpha = " << alpha << ::std::endl;
+}
+
+//copy, dot, swap, dotc
+// Writes N and the offset/increment pairs of the x and y vectors to stderr.
+void
+printTestParams(
+    size_t N,
+    size_t offx,
+    int incx,
+    size_t offy,
+    int incy)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << "N = " << N << ", offx = " << offx
+        << ", incx = " << incx << ::std::endl;
+    out << "offy = " << offy << ", incy = " << incy << ::std::endl;
+}
+
+//HER2
+// Writes the test parameters to stderr: order and uplo, N with the x and y
+// offsets/increments and the A offset (followed by lda on the same line when
+// lda is non-zero). The complex alpha is printed only when useAlpha is set,
+// independently of lda.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo  uplo,
+    size_t N,
+    bool useAlpha,
+    cl_float2 alpha,
+    size_t offx,
+    int incx,
+    size_t offy,
+    int incy,
+    size_t offa,
+    size_t lda)
+{
+    ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
+    ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl;
+    ::std::cerr << "offy = " << offy << ", incy = " << incy << ::std::endl;
+    // End the line only after the optional lda, so that ", lda = ..." does
+    // not start a new line with a dangling comma.
+    ::std::cerr << "offa = " << offa;
+    if (lda) {
+        ::std::cerr << ", lda = " << lda;
+    }
+    ::std::cerr << ::std::endl;
+    if (useAlpha) {
+        ::std::cerr << "alpha = (" << CREAL(alpha) << ", " << CIMAG(alpha) << ")" << ::std::endl;
+    }
+}
+
+//HEMV
+// Writes the test parameters to stderr: order and uplo, N with the x and y
+// offsets/increments, the A offset (plus lda on the same line when lda is
+// non-zero), then alpha and beta.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo  uplo,
+    size_t N,
+    ComplexLong alpha,
+    size_t offa,
+    size_t lda,
+    size_t offx,
+    int incx,
+    ComplexLong beta,
+    size_t offy,
+    int incy)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
+    out << "N = " << N << ", offx = " << offx
+        << ", incx = " << incx << ::std::endl;
+    out << "offy = " << offy << ", incy = " << incy << ::std::endl;
+
+    out << "offa = " << offa;
+    if (lda) {
+        out << ", lda = " << lda;
+    }
+    out << ::std::endl;
+
+    out << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl;
+    out << "beta = (" << beta.re << "," << beta.imag << ")" << ::std::endl;
+}
+//SYMM , HEMM
+// Writes the test parameters to stderr: order, side and uplo, the M/N sizes,
+// the three leading dimensions, alpha / beta (each only when its use* flag
+// is set) and finally the three offsets.
+void
+printTestParams(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    bool useBeta,
+    ComplexLong beta,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    size_t offa,
+    size_t offb,
+    size_t offc )
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << sideStr(side) << ", "
+        << uploStr(uplo) << ::std::endl;
+    out << "M = " << M << ", N = " << N << ::std::endl;
+    out << "lda = " << lda << ", ldb = " << ldb
+        << ", ldc = " << ldc << ::std::endl;
+
+    if (useAlpha) {
+        out << "alpha = (" << alpha.re << "," << alpha.imag << ")"
+            << ::std::endl;
+    }
+    if (useBeta) {
+        out << "beta = (" << beta.re << "," << beta.imag << ")"
+            << ::std::endl;
+    }
+
+    out << "offa = " << offa << ", offb = " << offb
+        << ", offc = " << offc << ::std::endl;
+}
+
+//xHEMM
+// Same layout as the ComplexLong overload, but alpha and beta are cl_float2
+// values read through CREAL / CIMAG. Writes to stderr: order, side and uplo,
+// the M/N sizes, the leading dimensions, alpha / beta (each only when its
+// use* flag is set) and the offsets, which are always printed.
+void
+printTestParams(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    cl_float2 alpha,
+    bool useBeta,
+    cl_float2 beta,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    size_t offa,
+    size_t offb,
+    size_t offc )
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << sideStr(side) << ", "
+        << uploStr(uplo) << ::std::endl;
+    out << "M = " << M << ", N = " << N << ::std::endl;
+    out << "lda = " << lda << ", ldb = " << ldb
+        << ", ldc = " << ldc << ::std::endl;
+
+    if (useAlpha) {
+        out << "alpha = (" << CREAL(alpha) << "," << CIMAG(alpha) << ")"
+            << ::std::endl;
+    }
+    if (useBeta) {
+        out << "beta = (" << CREAL(beta) << "," << CIMAG(beta) << ")"
+            << ::std::endl;
+    }
+
+    out << "offa = " << offa << ", offb = " << offb
+        << ", offc = " << offc << ::std::endl;
+}
+
+
+
+// Writes the parameters of a test case with one matrix (A) and two vectors
+// (x, y) to stderr: order, the M/N sizes, lda with both increments, alpha
+// (only when useAlpha is set) and the three offsets, which are always
+// printed.
+void
+printTestParams(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t lda,
+    int incx,
+    int incy,
+    size_t offa,
+    size_t offx,
+    size_t offy )
+{
+    // The order is the only item on its line: no trailing separator.
+    ::std::cerr << orderStr(order) << ::std::endl;
+    ::std::cerr << "M = " << M << ", N = " << N << ::std::endl;
+    ::std::cerr << "lda = " << lda << ", incx = " << incx << ", incy = " << incy<< ::std::endl;
+    if (useAlpha) {
+        ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl;
+    }
+    ::std::cerr << "offa = " << offa << ", offx = " << offx << ", offy = " << offy << ::std::endl;
+}
+
+// xGBMV
+// Writes the test parameters to stderr: order and transpose mode, the M/N
+// sizes with KL and KU, lda with both increments, the three offsets, then
+// alpha and beta.
+void
+printTestParams(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    ComplexLong alpha,
+    size_t offa,
+    size_t lda,
+    size_t offx,
+    int incx,
+    ComplexLong beta,
+    size_t offy,
+    int incy)
+{
+    // No separator after the last item of the first line.
+    ::std::cerr << orderStr(order) << ", " << transStr(transA) << ::std::endl;
+    ::std::cerr << "M = " << M << ", N = " << N << ", KL = " << KL << ", KU = " << KU << ::std::endl;
+    ::std::cerr << "lda = " << lda << ", incx = " << incx << ", incy = " << incy<< ::std::endl;
+    ::std::cerr << "offa = " << offa << ", offx = " << offx << ", offy = " << offy << ::std::endl;
+    ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl;
+    ::std::cerr << "beta = (" << beta.re << "," << beta.imag << ")" << ::std::endl;
+}
+
+//HBMV
+//SBMV
+// Writes the test parameters to stderr: order and uplo, the N/K sizes, lda
+// with both increments, the three offsets, then alpha and beta.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    ComplexLong alpha,
+    size_t offa,
+    size_t lda,
+    size_t offx,
+    int incx,
+    ComplexLong beta,
+    size_t offy,
+    int incy)
+{
+    // Separators only between items: none at the end of the first line or
+    // at the start of the second one.
+    ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
+    ::std::cerr << "N = " << N << ", K = " << K << ::std::endl;
+    ::std::cerr << "lda = " << lda << ", incx = " << incx << ", incy = " << incy<< ::std::endl;
+    ::std::cerr << "offa = " << offa << ", offx = " << offx << ", offy = " << offy << ::std::endl;
+    ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl;
+    ::std::cerr << "beta = (" << beta.re << "," << beta.imag << ")" << ::std::endl;
+}
+
+
+//xTBMV
+// Writes the test parameters to stderr: order, uplo, transpose mode and diag,
+// N with the band width (printed as "KL or KU"), lda with both increments,
+// and the three offsets.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    size_t KLU,
+    size_t offA,
+    size_t lda,
+    size_t offx,
+    int incx,
+    size_t offy,
+    int incy)
+{
+    ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ", " << transStr(transA) << ", " << diagStr(diag) << ::std::endl;
+    // The size line starts with "N", not with a leftover separator.
+    ::std::cerr << "N = " << N << ", KL or KU = " << KLU << ::std::endl;
+    ::std::cerr << "lda = " << lda << ", incx = " << incx << ", incy = " << incy<< ::std::endl;
+    ::std::cerr << "offa = " << offA << ", offx = " << offx << ", offy = " << offy << ::std::endl;
+}
+
+//HER
+// Writes the test parameters to stderr: order and uplo, N, lda with incx,
+// alpha (only when useAlpha is set) and the A / x offsets, which are always
+// printed.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t lda,
+    int incx,
+    size_t offa,
+    size_t offx)
+{
+    // Order and uplo share one line, like in the other overloads, instead of
+    // two lines that each end in a dangling separator.
+    ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
+    ::std::cerr << " N = " << N << ::std::endl;
+    ::std::cerr << "lda = " << lda << ", incx = " << incx << ::std::endl;
+    if (useAlpha) {
+        ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl;
+    }
+    ::std::cerr << "offa = " << offa << ", offx = " << offx << ::std::endl;
+}
+
+
+// Writes the parameters of a test case with one matrix (A) and one vector
+// (x) to stderr: order, uplo, transpose mode and diag, N, lda with incx,
+// and the A / x offsets.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    size_t lda,
+    int incx,
+    size_t offa,
+    size_t offx)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << uploStr(uplo) << ", "
+        << transStr(transA) << ", " << diagStr(diag) << ::std::endl;
+    out << " N = " << N << ::std::endl;
+    out << "lda = " << lda << ", incx = " << incx << ::std::endl;
+    out << "offa = " << offa << ", offx = " << offx << ::std::endl;
+}
+
+//xTPMV
+// Writes the test parameters to stderr: order, uplo, transpose mode and
+// diag, N, incx, and the A / x offsets.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    int incx,
+    size_t offa,
+    size_t offx)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << uploStr(uplo) << ", "
+        << transStr(transA) << ", " << diagStr(diag) << ::std::endl;
+    out << " N = " << N << ::std::endl;
+    out << " incx = " << incx << ::std::endl;
+    out << "offa = " << offa << ", offx = " << offx << ::std::endl;
+}
+
+
+// Writes the parameters of a test case with a transposable matrix A and two
+// vector increments to stderr: order and transpose mode, the M/N sizes, the
+// A offset, lda, alpha / beta (each only when its use* flag is set) and
+// finally incx / incy.
+void
+printTestParams(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    int incx,
+    bool useBeta,
+    ComplexLong beta,
+    int incy)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << transStr(transA) << ::std::endl;
+    out << "M = " << M << ", N = " << N << ::std::endl;
+    out << "offA = " << offA << ::std::endl;
+    out << "lda = " << lda << ::std::endl;
+
+    if (useAlpha) {
+        out << "alpha = (" << alpha.re << "," << alpha.imag << ")"
+            << ::std::endl;
+    }
+    if (useBeta) {
+        out << "beta = (" << beta.re << "," << beta.imag << ")"
+            << ::std::endl;
+    }
+
+    out << "incx = " << incx << ", incy = " << incy << ::std::endl;
+}
+
+// Writes the parameters of a test case with an uplo-selected matrix A and
+// two vector increments to stderr: order and uplo, N, the A offset, lda
+// (only when non-zero), alpha / beta (each only when its use* flag is set)
+// and finally incx / incy.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    int incx,
+    bool useBeta,
+    ComplexLong beta,
+    int incy)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
+    out << "N = " << N << ::std::endl;
+    out << "offA = " << offA << ::std::endl;
+    if (lda) {
+        out << "lda = " << lda << ::std::endl;
+    }
+
+    if (useAlpha) {
+        out << "alpha = (" << alpha.re << "," << alpha.imag << ")"
+            << ::std::endl;
+    }
+    if (useBeta) {
+        out << "beta = (" << beta.re << "," << beta.imag << ")"
+            << ::std::endl;
+    }
+
+    out << "incx = " << incx << ", incy = " << incy << ::std::endl;
+}
+
+// Writes the parameters of a test case with uplo/trans selectors and three
+// matrices (A, B, C) to stderr: the selectors, the N/K sizes, the offsets
+// and the leading dimensions. alpha / beta are printed only when useAlpha /
+// useBeta is set.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    size_t offB,
+    size_t ldb,
+    bool useBeta,
+    ComplexLong beta,
+    size_t offC,
+    size_t ldc)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << uploStr(uplo) << ", "
+        << transStr(transA) << ::std::endl;
+    out << "N = " << N << ", K = " << K << ::std::endl;
+    out << "offA = " << offA << ", offB = " << offB
+        << ", offC = " << offC << ::std::endl;
+    out << "lda = " << lda << ", ldb = " << ldb
+        << ", ldc = " << ldc << ::std::endl;
+
+    if (useAlpha) {
+        out << "alpha = (" << alpha.re << "," << alpha.imag << ")"
+            << ::std::endl;
+    }
+    if (useBeta) {
+        out << "beta = (" << beta.re << "," << beta.imag << ")"
+            << ::std::endl;
+    }
+}
+
+// Writes the parameters of a test case with uplo/trans selectors and two
+// matrices (A, C) to stderr: the selectors, the N/K sizes, the offsets and
+// the leading dimensions. alpha / beta are printed only when useAlpha /
+// useBeta is set.
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    bool useBeta,
+    ComplexLong beta,
+    size_t offC,
+    size_t ldc)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << orderStr(order) << ", " << uploStr(uplo) << ", "
+        << transStr(transA) << ::std::endl;
+    out << "N = " << N << ", K = " << K << ::std::endl;
+    out << "offA = " << offA << ", offC = " << offC << ::std::endl;
+    out << "lda = " << lda << ", ldc = " << ldc << ::std::endl;
+
+    if (useAlpha) {
+        out << "alpha = (" << alpha.re << "," << alpha.imag << ")"
+            << ::std::endl;
+    }
+    if (useBeta) {
+        out << "beta = (" << beta.re << "," << beta.imag << ")"
+            << ::std::endl;
+    }
+}
+
+//For scal
+// Writes N with alpha, then the x offset/increment, to stderr.
+void
+printTestParams(
+    size_t N,
+    ComplexLong alpha,
+    size_t offx,
+    int incx)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << "N = " << N << ", alpha = (" << alpha.re << "," << alpha.imag
+        << ")" << ::std::endl;
+    out << "offx = " << offx << ", incx = " << incx << ::std::endl;
+}
+
+//For axpy
+// Writes N with alpha, then the x and y offset/increment pairs, to stderr.
+void
+printTestParams(
+    size_t N,
+    ComplexLong alpha,
+    size_t offx,
+    int incx,
+    size_t offy,
+    int incy)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << "N = " << N << ", alpha = (" << alpha.re << "," << alpha.imag
+        << ")" << ::std::endl;
+    out << "offx = " << offx << ", incx = " << incx << ::std::endl;
+    out << "offy = " << offy << ", incy = " << incy << ::std::endl;
+}
+
+
+//xROT
+// Writes N, the C and S values, and the x / y offset/increment pairs to
+// stderr.
+void
+printTestParams(
+    size_t N,
+    size_t offx,
+    int incx,
+    size_t offy,
+    int incy,
+    ComplexLong C,
+    ComplexLong S)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << "N = " << N << ::std::endl;
+    out << "C = (" << C.re << "," << C.imag
+        << "),S = (" << S.re << "," << S.imag << ")" << ::std::endl;
+    out << "offx = " << offx << ", incx = " << incx
+        << ", offy = " << offy << ", incy = " << incy << ::std::endl;
+}
+
+// xROTG
+// Writes the four buffer offsets on a single line to stderr.
+void
+printTestParams(size_t offSA, size_t offSB, size_t offC, size_t offS)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << "offSA = " << offSA << ", offSB = " << offSB
+        << ", offC = " << offC << ",offS = " << offS << ::std::endl;
+}
+
+//xROTM
+// Writes N, the x / y offset/increment pairs, the PARAM offset and the real
+// part of sflagParam (printed as "PARAM[0]") on a single line to stderr.
+void
+printTestParams(size_t N, size_t offx, int incx, size_t offy, int incy, size_t offParam, ComplexLong sflagParam)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << "N = " << N << ", offx = " << offx << ", incx = " << incx;
+    out << ", offy = " << offy << ", incy = " << incy;
+    out << ", offParam = " << offParam
+        << ", PARAM[0] = " << sflagParam.re << ::std::endl;
+}
+
+//xROTMG
+// Writes the five buffer offsets and the real part of sflagParam (printed as
+// "PARAM[0]") on a single line to stderr.
+void
+printTestParams(int offX, int offY, int offD1, int offD2, int offParam, ComplexLong sflagParam)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << "offX = " << offX << ", offY = " << offY;
+    out << ", offD1 = " << offD1 << ", offD2 = " << offD2;
+    out << ", offParam = " << offParam
+        << ", PARAM[0] = " << sflagParam.re << ::std::endl;
+}
+
+
+// xNRM2, xASUM, iXAMAX
+// Writes N and the x offset/increment on a single line to stderr.
+void
+printTestParams(
+    size_t N,
+    size_t offx,
+    int incx)
+{
+    ::std::ostream &out = ::std::cerr;
+
+    out << "N = " << N << ", offx = " << offx
+        << ", incx = " << incx << ::std::endl;
+}
+
+// Returns the enumerator name of 'order' as a string literal, or NULL for a
+// value that is neither clblasColumnMajor nor clblasRowMajor.
+// NOTE(review): the printTestParams() overloads in this file stream the
+// result directly, so they rely on 'order' being valid.
+const char*
+orderStr(clblasOrder order)
+{
+    const char *name = NULL;
+
+    if (order == clblasColumnMajor) {
+        name = "clblasColumnMajor";
+    }
+    else if (order == clblasRowMajor) {
+        name = "clblasRowMajor";
+    }
+
+    return name;
+}
+
+// Returns the enumerator name of 'side' as a string literal, or NULL for a
+// value that is neither clblasLeft nor clblasRight.
+const char*
+sideStr(clblasSide side)
+{
+    const char *name = NULL;
+
+    if (side == clblasLeft) {
+        name = "clblasLeft";
+    }
+    else if (side == clblasRight) {
+        name = "clblasRight";
+    }
+
+    return name;
+}
+
+// Returns the enumerator name of 'uplo' as a string literal, or NULL for a
+// value that is neither clblasUpper nor clblasLower.
+const char*
+uploStr(clblasUplo uplo)
+{
+    const char *name = NULL;
+
+    if (uplo == clblasUpper) {
+        name = "clblasUpper";
+    }
+    else if (uplo == clblasLower) {
+        name = "clblasLower";
+    }
+
+    return name;
+}
+
+// Returns the enumerator name of 'trans' as a string literal, or NULL for a
+// value that is none of clblasNoTrans, clblasTrans and clblasConjTrans.
+const char*
+transStr(clblasTranspose trans)
+{
+    const char *name = NULL;
+
+    if (trans == clblasNoTrans) {
+        name = "clblasNoTrans";
+    }
+    else if (trans == clblasTrans) {
+        name = "clblasTrans";
+    }
+    else if (trans == clblasConjTrans) {
+        name = "clblasConjTrans";
+    }
+
+    return name;
+}
+
+// Returns the enumerator name of 'diag' as a string literal, or NULL for a
+// value that is neither clblasNonUnit nor clblasUnit.
+const char*
+diagStr(clblasDiag diag)
+{
+    const char *name = NULL;
+
+    if (diag == clblasNonUnit) {
+        name = "clblasNonUnit";
+    }
+    else if (diag == clblasUnit) {
+        name = "clblasUnit";
+    }
+
+    return name;
+}
+
+// Maps a transpose mode to a one-letter code: 'N' (no transpose),
+// 'T' (transpose), 'C' (conjugate transpose); '\0' for any other value.
+char
+encodeTranspose(clblasTranspose value)
+{
+    if (value == clblasNoTrans) {
+        return 'N';
+    }
+    if (value == clblasTrans) {
+        return 'T';
+    }
+    if (value == clblasConjTrans) {
+        return 'C';
+    }
+    return '\0';
+}
+
+// Maps a triangle selector to a one-letter code: 'U' (upper), 'L' (lower);
+// '\0' for any other value.
+char
+encodeUplo(clblasUplo value)
+{
+    if (value == clblasUpper) {
+        return 'U';
+    }
+    if (value == clblasLower) {
+        return 'L';
+    }
+    return '\0';
+}
+
+// Maps a diagonal kind to a one-letter code: 'U' (unit), 'N' (non-unit);
+// '\0' for any other value.
+char
+encodeDiag(clblasDiag value)
+{
+    if (value == clblasUnit) {
+        return 'U';
+    }
+    if (value == clblasNonUnit) {
+        return 'N';
+    }
+    return '\0';
+}
+
+// Maps a side selector to a one-letter code: 'L' (left), 'R' (right);
+// '\0' for any other value.
+char
+encodeSide(clblasSide value)
+{
+    if (value == clblasLeft) {
+        return 'L';
+    }
+    if (value == clblasRight) {
+        return 'R';
+    }
+    return '\0';
+}
+
+// Maps a BlasFunctionID to its BLAS level (1, 2 or 3).
+// Returns 0 for any identifier that is not listed below.
+int
+functionBlasLevel(BlasFunctionID funct)
+{
+    int level = 0;
+
+    switch (funct) {
+    // ---------------------------- level 1 ----------------------------
+    case FN_SSCAL:  case FN_DSCAL:  case FN_CSCAL:  case FN_ZSCAL:
+    case FN_CSSCAL: case FN_ZDSCAL:
+    case FN_SSWAP:  case FN_DSWAP:  case FN_CSWAP:  case FN_ZSWAP:
+    case FN_SAXPY:  case FN_DAXPY:  case FN_CAXPY:  case FN_ZAXPY:
+    case FN_SDOT:   case FN_DDOT:
+    case FN_CDOTU:  case FN_ZDOTU:  case FN_CDOTC:  case FN_ZDOTC:
+    case FN_SCOPY:  case FN_DCOPY:  case FN_CCOPY:  case FN_ZCOPY:
+    case FN_SROTG:  case FN_DROTG:  case FN_CROTG:  case FN_ZROTG:
+    case FN_SROT:   case FN_DROT:   case FN_CSROT:  case FN_ZDROT:
+    case FN_SASUM:  case FN_DASUM:  case FN_SCASUM: case FN_DZASUM:
+    case FN_SROTM:  case FN_DROTM:
+    case FN_SROTMG: case FN_DROTMG:
+    case FN_SNRM2:  case FN_DNRM2:  case FN_SCNRM2: case FN_DZNRM2:
+    case FN_iSAMAX: case FN_iDAMAX: case FN_iCAMAX: case FN_iZAMAX:
+        level = 1;
+        break;
+
+    // ---------------------------- level 2 ----------------------------
+    case FN_SGEMV:  case FN_DGEMV:  case FN_CGEMV:  case FN_ZGEMV:
+    case FN_SSYMV:  case FN_DSYMV:  case FN_SSPMV:  case FN_DSPMV:
+    case FN_STRMV:  case FN_DTRMV:  case FN_CTRMV:  case FN_ZTRMV:
+    case FN_STPMV:  case FN_DTPMV:  case FN_CTPMV:  case FN_ZTPMV:
+    case FN_STRSV:  case FN_DTRSV:  case FN_CTRSV:  case FN_ZTRSV:
+    case FN_STPSV:  case FN_DTPSV:  case FN_CTPSV:  case FN_ZTPSV:
+    case FN_SGER:   case FN_DGER:
+    case FN_CGERU:  case FN_ZGERU:  case FN_CGERC:  case FN_ZGERC:
+    case FN_CHER:   case FN_ZHER:   case FN_CHER2:  case FN_ZHER2:
+    case FN_CHPR:   case FN_ZHPR:   case FN_CHPR2:  case FN_ZHPR2:
+    case FN_SSYR:   case FN_DSYR:   case FN_SSPR:   case FN_DSPR:
+    case FN_SSYR2:  case FN_DSYR2:  case FN_SSPR2:  case FN_DSPR2:
+    case FN_CHEMV:  case FN_ZHEMV:  case FN_CHPMV:  case FN_ZHPMV:
+    case FN_SGBMV:  case FN_DGBMV:  case FN_CGBMV:  case FN_ZGBMV:
+    case FN_STBMV:  case FN_DTBMV:  case FN_CTBMV:  case FN_ZTBMV:
+    case FN_SSBMV:  case FN_DSBMV:
+    case FN_CHBMV:  case FN_ZHBMV:
+    case FN_STBSV:  case FN_DTBSV:  case FN_CTBSV:  case FN_ZTBSV:
+        level = 2;
+        break;
+
+    // ---------------------------- level 3 ----------------------------
+    case FN_CHEMM:   case FN_ZHEMM:
+    case FN_SSYMM:   case FN_DSYMM:   case FN_CSYMM:   case FN_ZSYMM:
+    case FN_SGEMM:   case FN_DGEMM:   case FN_CGEMM:   case FN_ZGEMM:
+    case FN_SGEMM_2: case FN_DGEMM_2: case FN_CGEMM_2: case FN_ZGEMM_2:
+    case FN_STRMM:   case FN_DTRMM:   case FN_CTRMM:   case FN_ZTRMM:
+    case FN_STRSM:   case FN_DTRSM:   case FN_CTRSM:   case FN_ZTRSM:
+    case FN_SSYR2K:  case FN_DSYR2K:  case FN_CSYR2K:  case FN_ZSYR2K:
+    case FN_SSYRK:   case FN_DSYRK:   case FN_CSYRK:   case FN_ZSYRK:
+    case FN_CHERK:   case FN_ZHERK:   case FN_CHER2K:  case FN_ZHER2K:
+        level = 3;
+        break;
+
+    default:
+        // not a known function identifier: keep 0
+        break;
+    }
+
+    return level;
+}
diff --git a/src/tests/correctness/BlasBase-corr.cpp b/src/tests/correctness/BlasBase-corr.cpp
new file mode 100644
index 0000000..2bc1494
--- /dev/null
+++ b/src/tests/correctness/BlasBase-corr.cpp
@@ -0,0 +1,41 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <BlasBase.h>
+
+namespace clMath {
+
+clblasStatus
+BlasBase::addScratchImages(void)   // stub: the scratch-image allocation below is commented out
+{
+    //clblasStatus status;
+
+    //// Height must be less than 1024
+    //imageA_ = clblasAddScratchImage(context_, 2048, 512, &status);
+    //if (imageA_) {
+    //    imageB_ = clblasAddScratchImage(context_, 2048, 512, &status);
+    //}
+
+    //return status;
+	return clblasNotImplemented;   // unconditional result while the code above stays disabled
+}
+
+}   // namespace
diff --git a/src/tests/correctness/blas-lapack.c b/src/tests/correctness/blas-lapack.c
new file mode 100644
index 0000000..a010b7b
--- /dev/null
+++ b/src/tests/correctness/blas-lapack.c
@@ -0,0 +1,870 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * cblas to lapack's blas interface adapter
+ */
+
+#include <blas-cblas.h>
+
+#if !defined CORR_TEST_WITH_ACML
+
+#include "blas-lapack.h"
+
+void   /* ?gemv adapters: scalars received by value are forwarded by address to the underscore-suffixed routine */
+sgemv(char transa, int m, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy)
+{
+    sgemv_(&transa, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+}
+
+void   /* double-precision variant, same forwarding */
+dgemv(char transa, int m, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy)
+{
+    dgemv_(&transa, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+}
+
+void   /* complex variant: alpha and beta are already pointers and are passed through unchanged */
+cgemv(char transa, int m, int n, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy)
+{
+    cgemv_(&transa, &m, &n, alpha, a, &lda, x, &incx, beta, y, &incy);
+}
+
+void   /* doublecomplex variant, alpha/beta passed through as pointers */
+zgemv(char transa, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy)
+{
+    zgemv_(&transa, &m, &n, alpha, a, &lda, x, &incx, beta, y, &incy);
+}
+
+void   /* ?symv adapters: by-value scalars (uplo, n, alpha, lda, incx, beta, incy) are forwarded by address */
+ssymv(char uplo, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy)
+{
+    ssymv_(&uplo, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+}
+
+void   /* double-precision variant */
+dsymv(char uplo, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy)
+{
+    dsymv_(&uplo, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+}
+
+void   /* ?gemm adapters: flags, dimensions and leading dimensions are forwarded by address to ?gemm_ */
+sgemm(char transa, char transb, int m, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc)
+{
+    sgemm_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void   /* double-precision variant */
+dgemm(char transa, char transb, int m, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc)
+{
+    dgemm_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void   /* complex variant: alpha/beta are pointer arguments, passed through unchanged */
+cgemm(char transa, char transb, int m, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc)
+{
+    cgemm_(&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc);
+}
+
+void   /* doublecomplex variant */
+zgemm(char transa, char transb, int m, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc)
+{
+    zgemm_(&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc);
+}
+
+void   /* ?trmm adapters: the four char flags and all int arguments are forwarded by address to ?trmm_ */
+strmm(char side, char uplo, char transa, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb)
+{
+    strmm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb);
+}
+
+void   /* double-precision variant */
+dtrmm(char side, char uplo, char transa, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb)
+{
+    dtrmm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb);
+}
+
+void   /* complex variant: alpha is already a pointer */
+ctrmm(char side, char uplo, char transa, char diag, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb)
+{
+    ctrmm_(&side, &uplo, &transa, &diag, &m, &n, alpha, a, &lda, b, &ldb);
+}
+
+void   /* doublecomplex variant */
+ztrmm(char side, char uplo, char transa, char diag, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb)
+{
+    ztrmm_(&side, &uplo, &transa, &diag, &m, &n, alpha, a, &lda, b, &ldb);
+}
+
+void   /* ?trsm adapters: same argument layout as the ?trmm adapters, forwarded to ?trsm_ */
+strsm(char side, char uplo, char transa, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb)
+{
+    strsm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb);
+}
+
+void   /* double-precision variant */
+dtrsm(char side, char uplo, char transa, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb)
+{
+    dtrsm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb);
+}
+
+void   /* complex variant: alpha is already a pointer */
+ctrsm(char side, char uplo, char transa, char diag, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb)
+{
+    ctrsm_(&side, &uplo, &transa, &diag, &m, &n, alpha, a, &lda, b, &ldb);
+}
+
+void   /* doublecomplex variant */
+ztrsm(char side, char uplo, char transa, char diag, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb)
+{
+    ztrsm_(&side, &uplo, &transa, &diag, &m, &n, alpha, a, &lda, b, &ldb);
+}
+
+void   /* ?syr2k adapters: by-value scalars are forwarded by address to ?syr2k_ */
+ssyr2k(char uplo, char transa, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc)
+{
+    ssyr2k_(&uplo, &transa, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void   /* double-precision variant */
+dsyr2k(char uplo, char transa, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc)
+{
+    dsyr2k_(&uplo, &transa, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void   /* complex variant: alpha/beta are pointers, passed through unchanged */
+csyr2k(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc)
+{
+    csyr2k_(&uplo, &transa, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc);
+}
+
+void   /* doublecomplex variant */
+zsyr2k(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc)
+{
+    zsyr2k_(&uplo, &transa, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc);
+}
+
+void   /* ?syrk adapters: by-value scalars are forwarded by address to ?syrk_ */
+ssyrk(char uplo, char transa, int n, int k, float alpha, float *a, int lda, float beta, float *c, int ldc)
+{
+    ssyrk_(&uplo, &transa, &n, &k, &alpha, a, &lda, &beta, c, &ldc);
+}
+
+void   /* double-precision variant */
+dsyrk(char uplo, char transa, int n, int k, double alpha, double *a, int lda, double beta, double *c, int ldc)
+{
+    dsyrk_(&uplo, &transa, &n, &k, &alpha, a, &lda, &beta, c, &ldc);
+}
+
+void   /* complex variant: alpha/beta are pointers, passed through unchanged */
+csyrk(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *beta, complex *c, int ldc)
+{
+    csyrk_(&uplo, &transa, &n, &k, alpha, a, &lda, beta, c, &ldc);
+}
+
+void   /* doublecomplex variant */
+zsyrk(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *beta, doublecomplex *c, int ldc)
+{
+    zsyrk_(&uplo, &transa, &n, &k, alpha, a, &lda, beta, c, &ldc);
+}
+
+void   /* ?trmv adapters: flags, n, lda and incx are forwarded by address; a and x are passed as given */
+strmv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx)
+{
+   strmv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
+}
+
+void   /* double-precision variant */
+dtrmv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx)
+{
+   dtrmv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
+}
+
+void   /* complex variant */
+ctrmv(char uplo, char transa, char diag, int n, complex *a, int lda, complex *x, int incx)
+{
+   ctrmv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
+}
+
+void   /* doublecomplex variant */
+ztrmv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx)
+{
+   ztrmv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
+}
+
+void   /* ?trsv adapters: same argument layout as the ?trmv adapters, forwarded to ?trsv_ */
+strsv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx)
+{
+   strsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
+}
+
+void   /* double-precision variant */
+dtrsv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx)
+{
+   dtrsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
+}
+
+void   /* complex variant */
+ctrsv(char uplo, char transa, char diag, int n, complex *a, int lda, complex *x, int incx)
+{
+   ctrsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
+}
+
+void   /* doublecomplex variant */
+ztrsv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx)
+{
+   ztrsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx);
+}
+
+void   /* ?symm adapters: by-value scalars are forwarded by address to ?symm_ */
+ssymm(char side, char uplo, int m, int n, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc)
+{
+   ssymm_( &side, &uplo, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void   /* double-precision variant */
+dsymm(char side, char uplo, int m, int n, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc)
+{
+   dsymm_( &side, &uplo, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void   /* complex variant: alpha/beta are pointers, passed through unchanged */
+csymm(char side, char uplo, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc)
+{
+   csymm_( &side, &uplo, &m, &n, alpha, a, &lda, b, &ldb, beta, c, &ldc);
+}
+
+void   /* doublecomplex variant */
+zsymm(char side, char uplo, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc)
+{
+   zsymm_( &side, &uplo, &m, &n, alpha, a, &lda, b, &ldb, beta, c, &ldc);
+}
+
+void   /* ?ger / ?geru / ?gerc adapters: m, n, increments and lda are forwarded by address */
+sger(int m, int n, float alpha, float *x, int incx, float *y, int incy, float *a, int lda)
+{
+   sger_( &m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* double-precision variant */
+dger(int m, int n, double alpha, double *x, int incx, double *y, int incy, double *a, int lda)
+{
+   dger_( &m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* complex variant forwarding to cgeru_; alpha is already a pointer */
+cgeru(int m, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda)
+{
+   cgeru_( &m, &n, alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* doublecomplex variant forwarding to zgeru_ */
+zgeru(int m, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda)
+{
+   zgeru_( &m, &n, alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* complex variant forwarding to cgerc_ */
+cgerc(int m, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda)
+{
+   cgerc_( &m, &n, alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* doublecomplex variant forwarding to zgerc_ */
+zgerc(int m, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda)
+{
+   zgerc_( &m, &n, alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* ?syr / ?syr2 adapters: uplo, n, alpha, increments and lda are forwarded by address */
+ssyr(char uplo, int n, float alpha, float *x, int incx, float *a, int lda)
+{
+   ssyr_( &uplo, &n, &alpha, x, &incx, a, &lda);
+}
+
+void   /* double-precision variant */
+dsyr(char uplo, int n, double alpha, double *x, int incx, double *a, int lda)
+{
+   dsyr_( &uplo, &n, &alpha, x, &incx, a, &lda);
+}
+
+void   /* two-vector form (x and y), forwarded to ssyr2_ */
+ssyr2(char uplo, int n, float alpha, float *x, int incx, float *y, int incy, float *a, int lda)
+{
+   ssyr2_( &uplo, &n, &alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* double-precision two-vector form */
+dsyr2(char uplo, int n, double alpha, double *x, int incx, double *y, int incy, double *a, int lda)
+{
+   dsyr2_( &uplo, &n, &alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* cher: alpha is a real float taken by value and forwarded by address */
+cher(char uplo, int n, float alpha, complex *x, int incx, complex *a, int lda)
+{
+   cher_( &uplo, &n, &alpha, x, &incx, a, &lda);
+}
+
+void   /* zher: alpha is a real double taken by value */
+zher(char uplo, int n, double alpha, doublecomplex *x, int incx, doublecomplex *a, int lda)
+{
+   zher_( &uplo, &n, &alpha, x, &incx, a, &lda);
+}
+
+void   /* cher2: here alpha is a complex pointer and is passed through unchanged */
+cher2(char uplo, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda)
+{
+   cher2_( &uplo, &n, alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* zher2: doublecomplex pointer alpha, passed through unchanged */
+zher2(char uplo, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda)
+{
+   zher2_( &uplo, &n, alpha, x, &incx, y, &incy, a, &lda);
+}
+
+void   /* ?hemv adapters: uplo, n, lda and increments forwarded by address; alpha/beta are pointers already */
+chemv(char uplo, int n, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy)
+{
+   chemv_( &uplo, &n, alpha, a, &lda, x, &incx, beta, y, &incy );
+}
+
+void   /* doublecomplex variant */
+zhemv(char uplo, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy)
+{
+   zhemv_( &uplo, &n, alpha, a, &lda, x, &incx, beta, y, &incy );
+}
+
+void   /* ?tpmv adapters: takes `ap` with no leading-dimension argument; flags, n and incx forwarded by address */
+stpmv(char uplo, char transa, char diag, int n, float *ap, float *x, int incx)
+{
+   stpmv_( &uplo, &transa, &diag, &n, ap, x, &incx);
+}
+
+void   /* double-precision variant */
+dtpmv(char uplo, char transa, char diag, int n, double *ap, double *x, int incx)
+{
+   dtpmv_( &uplo, &transa, &diag, &n, ap, x, &incx);
+}
+
+void   /* complex variant */
+ctpmv(char uplo, char transa, char diag, int n, complex *ap, complex *x, int incx)
+{
+   ctpmv_( &uplo, &transa, &diag, &n, ap, x, &incx);
+}
+
+void   /* doublecomplex variant */
+ztpmv(char uplo, char transa, char diag, int n, doublecomplex *ap, doublecomplex *x, int incx)
+{
+   ztpmv_( &uplo, &transa, &diag, &n, ap, x, &incx);
+}
+
+void   /* ?tpsv adapters: same argument layout as the ?tpmv adapters, forwarded to ?tpsv_ */
+stpsv(char uplo, char transa, char diag, int n, float *ap, float *x, int incx)
+{
+   stpsv_( &uplo, &transa, &diag, &n, ap, x, &incx);
+}
+
+void   /* double-precision variant */
+dtpsv(char uplo, char transa, char diag, int n, double *ap, double *x, int incx)
+{
+   dtpsv_( &uplo, &transa, &diag, &n, ap, x, &incx);
+}
+
+void   /* complex variant */
+ctpsv(char uplo, char transa, char diag, int n, complex *ap, complex *x, int incx)
+{
+   ctpsv_( &uplo, &transa, &diag, &n, ap, x, &incx);
+}
+
+void   /* doublecomplex variant */
+ztpsv(char uplo, char transa, char diag, int n, doublecomplex *ap, doublecomplex *x, int incx)
+{
+   ztpsv_( &uplo, &transa, &diag, &n, ap, x, &incx);
+}
+
+void   /* ?spr / ?spmv / ?hpmv / ?hpr adapters: take `ap` and no lda; by-value scalars forwarded by address */
+sspr(char uplo, int n, float alpha, float *x, int incx, float *ap )
+{
+   sspr_( &uplo, &n, &alpha, x, &incx, ap );
+}
+
+void   /* double-precision variant of sspr */
+dspr(char uplo, int n, double alpha, double *x, int incx, double *ap )
+{
+   dspr_( &uplo, &n, &alpha, x, &incx, ap );
+}
+
+void   /* sspmv: alpha and beta are by-value floats forwarded by address */
+sspmv(char uplo, int n, float alpha, float *ap, float *x, int incx, float beta, float *y, int incy)
+{
+   sspmv_( &uplo, &n, &alpha, ap, x, &incx, &beta, y, &incy );
+}
+
+void   /* double-precision variant of sspmv */
+dspmv(char uplo, int n, double alpha, double *ap, double *x, int incx, double beta, double *y, int incy)
+{
+   dspmv_( &uplo, &n, &alpha, ap, x, &incx, &beta, y, &incy );
+}
+
+void   /* chpmv: alpha/beta are complex pointers, passed through unchanged */
+chpmv(char uplo, int n, complex *alpha, complex *ap, complex *x, int incx, complex *beta, complex *y, int incy)
+{
+   chpmv_( &uplo, &n, alpha, ap, x, &incx, beta, y, &incy );
+}
+
+void   /* doublecomplex variant of chpmv */
+zhpmv(char uplo, int n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy)
+{
+   zhpmv_( &uplo, &n, alpha, ap, x, &incx, beta, y, &incy );
+}
+
+void   /* chpr: alpha is a real float taken by value */
+chpr(char uplo, int n, float alpha, complex *x, int incx, complex *ap )
+{
+   chpr_( &uplo, &n, &alpha, x, &incx, ap );
+}
+
+void   /* zhpr: alpha is a real double taken by value */
+zhpr(char uplo, int n, double alpha, doublecomplex *x, int incx, doublecomplex *ap )
+{
+   zhpr_( &uplo, &n, &alpha, x, &incx, ap );
+}
+
+void   /* ?spr2 / ?hpr2 adapters: two vectors x and y plus matrix pointer `a` with no lda argument */
+sspr2(char uplo, int n, float alpha, float *x, int incx, float *y, int incy, float *a )
+{
+   sspr2_( &uplo, &n, &alpha, x, &incx, y, &incy, a );
+}
+void   /* double-precision variant */
+dspr2(char uplo, int n, double alpha, double *x, int incx, double *y, int incy, double *a )
+{
+   dspr2_( &uplo, &n, &alpha, x, &incx, y, &incy, a );
+}
+void   /* complex variant: alpha is already a pointer */
+chpr2(char uplo, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a )
+{
+   chpr2_( &uplo, &n, alpha, x, &incx, y, &incy, a );
+}
+void   /* doublecomplex variant */
+zhpr2(char uplo, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a )
+{
+   zhpr2_( &uplo, &n, alpha, x, &incx, y, &incy, a );
+}
+
+void   /* ?gbmv adapters. NOTE(review): `inca` sits in the slot reference BLAS names LDA - confirm the name is intentional */
+sgbmv(char trans, int m, int n, int kl, int ku, float alpha, float *a, int inca, float *x, int incx, float beta, float *y, int incy )
+{
+   sgbmv_( &trans, &m, &n, &kl, &ku, &alpha, a, &inca, x, &incx, &beta, y, &incy );
+}
+void   /* double-precision variant */
+dgbmv(char trans, int m, int n, int kl, int ku, double alpha, double *a, int inca, double *x, int incx, double beta, double *y, int incy )
+{
+   dgbmv_( &trans, &m, &n, &kl, &ku, &alpha, a, &inca, x, &incx, &beta, y, &incy );
+}
+void   /* complex variant: alpha/beta are pointers, passed through unchanged */
+cgbmv(char trans, int m, int n, int kl, int ku, complex *alpha, complex *a, int inca, complex *x, int incx, complex *beta, complex *y, int incy )
+{
+   cgbmv_( &trans, &m, &n, &kl, &ku, alpha, a, &inca, x, &incx, beta, y, &incy );
+}
+void   /* doublecomplex variant */
+zgbmv(char trans, int m, int n, int kl, int ku, doublecomplex *alpha, doublecomplex *a, int inca, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy )
+{
+   zgbmv_( &trans, &m, &n, &kl, &ku, alpha, a, &inca, x, &incx, beta, y, &incy );
+}
+
+void   /* ?tbmv adapters: like ?trmv but with an extra int k, also forwarded by address */
+stbmv(char uplo, char trans, char diag, int n, int k, float *a, int lda, float *x, int incx )
+{
+   stbmv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx );
+}
+
+void   /* double-precision variant */
+dtbmv(char uplo, char trans, char diag, int n, int k, double *a, int lda, double *x, int incx )
+{
+   dtbmv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx );
+}
+
+void   /* complex variant */
+ctbmv(char uplo, char trans, char diag, int n, int k, complex *a, int lda, complex *x, int incx )
+{
+   ctbmv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx );
+}
+
+void   /* doublecomplex variant */
+ztbmv(char uplo, char trans, char diag, int n, int k, doublecomplex *a, int lda, doublecomplex *x, int incx )
+{
+   ztbmv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx );
+}
+
+void   /* ?sbmv / ?hbmv adapters: uplo, n, k, lda and increments are forwarded by address */
+ssbmv(char uplo, int n, int k, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy )
+{
+   ssbmv_( &uplo, &n, &k, &alpha, a, &lda, x, &incx, &beta, y, &incy );
+}
+
+void   /* double-precision variant */
+dsbmv(char uplo, int n, int k, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy )
+{
+   dsbmv_( &uplo, &n, &k, &alpha, a, &lda, x, &incx, &beta, y, &incy );
+}
+
+void   /* chbmv: alpha/beta are complex pointers, passed through unchanged */
+chbmv(char uplo, int n, int k, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy )
+{
+   chbmv_( &uplo, &n, &k, alpha, a, &lda, x, &incx, beta, y, &incy );
+}
+
+void   /* doublecomplex variant of chbmv */
+zhbmv(char uplo, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy )
+{
+   zhbmv_( &uplo, &n, &k, alpha, a, &lda, x, &incx, beta, y, &incy );
+}
+
+void   /* ?tbsv adapters: same argument layout as the ?tbmv adapters, forwarded to ?tbsv_ */
+stbsv(char uplo, char trans, char diag, int n, int k, float *a, int lda, float *x, int incx )
+{
+   stbsv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx );
+}
+
+void   /* double-precision variant */
+dtbsv(char uplo, char trans, char diag, int n, int k, double *a, int lda, double *x, int incx )
+{
+   dtbsv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx );
+}
+
+void   /* complex variant */
+ctbsv(char uplo, char trans, char diag, int n, int k, complex *a, int lda, complex *x, int incx )
+{
+   ctbsv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx );
+}
+
+void   /* doublecomplex variant */
+ztbsv(char uplo, char trans, char diag, int n, int k, doublecomplex *a, int lda, doublecomplex *x, int incx )
+{
+   ztbsv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx );
+}
+
+void   /* chemm: same argument layout as csymm, forwarded to chemm_; alpha/beta are pointers already */
+chemm(char side, char uplo, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc)
+{
+   chemm_( &side, &uplo, &m, &n, alpha, a, &lda, b, &ldb, beta, c, &ldc);
+}
+
+void   /* doublecomplex variant of chemm */
+zhemm(char side, char uplo, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc)
+{
+   zhemm_( &side, &uplo, &m, &n, alpha, a, &lda, b, &ldb, beta, c, &ldc);
+}
+
+void   /* cherk: both alpha and beta are real floats taken by value and forwarded by address */
+cherk(char uplo, char transa, int n, int k, float alpha, complex *a, int lda, float beta, complex *c, int ldc)
+{
+   cherk_( &uplo, &transa, &n, &k, &alpha, a, &lda, &beta, c, &ldc);
+}
+
+void   /* zherk: real double alpha and beta */
+zherk(char uplo, char transa, int n, int k, double alpha, doublecomplex *a, int lda, double beta, doublecomplex *c, int ldc)
+{
+   zherk_( &uplo, &transa, &n, &k, &alpha, a, &lda, &beta, c, &ldc);
+}
+
+void   /* cher2k: alpha is a complex pointer (passed through) while beta is a real float (forwarded by address) */
+cher2k(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, float beta, complex *c, int ldc)
+{
+   cher2k_( &uplo, &transa, &n, &k, alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void   /* zher2k: doublecomplex pointer alpha, real double beta */
+zher2k(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, double beta, doublecomplex *c, int ldc)
+{
+   zher2k_( &uplo, &transa, &n, &k, alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+}
+
+void sscal( int n, float alpha, float *x, int incx)   /* ?scal adapters: n, alpha and incx forwarded by address */
+{
+    sscal_(&n, &alpha, x, &incx);
+}
+
+void dscal( int n, double alpha, double *x, int incx)   /* double-precision variant */
+{
+    dscal_(&n, &alpha, x, &incx);
+}
+
+void cscal( int n, complex* alpha, complex *x, int incx)   /* complex alpha is a pointer, passed through */
+{
+    cscal_(&n, alpha, x, &incx);
+}
+
+void zscal( int n, doublecomplex* alpha, doublecomplex *x, int incx)   /* doublecomplex variant */
+{
+    zscal_(&n, alpha, x, &incx);
+}
+
+void csscal( int n, float alpha, complex *x, int incx)   /* complex vector with a real float alpha */
+{
+    csscal_(&n, &alpha, x, &incx);
+}
+
+void zdscal( int n, double alpha, doublecomplex *x, int incx)   /* doublecomplex vector with a real double alpha */
+{
+    zdscal_(&n, &alpha, x, &incx);
+}
+
+float sdot( int n, float *x, int incx,  float *y, int incy)   /* returns the value of sdot_ unchanged */
+{
+    return sdot_(&n, x, &incx, y, &incy);
+}
+
+double ddot( int n, double *x, int incx,  double *y, int incy)   /* returns the value of ddot_ unchanged */
+{
+    return ddot_(&n, x, &incx, y, &incy);
+}
+
+complex cdotu( int n, complex *x, int incx, complex *y, int incy)   /* returns the complex result by value */
+{
+    complex ans;
+
+    #if defined( _WIN32 ) || defined( _WIN64 )   /* Windows builds: take the value returned by cdotu_ */
+        ans = cdotu_(&n, x, &incx, y, &incy);
+    #else   /* elsewhere: cdotusub_ writes the result through the trailing pointer */
+        cdotusub_(&n, x, &incx, y, &incy, &ans);
+    #endif
+
+    return ans;
+}
+
+doublecomplex zdotu( int n, doublecomplex *x, int incx,  doublecomplex *y, int incy)   /* doublecomplex variant of cdotu */
+{
+    doublecomplex ans;
+
+    #if defined( _WIN32 ) || defined( _WIN64 )   /* Windows builds: value returned by zdotu_ */
+        ans = zdotu_(&n, x, &incx, y, &incy);
+    #else   /* elsewhere: zdotusub_ fills `ans` through the pointer */
+        zdotusub_(&n, x, &incx, y, &incy, &ans);
+    #endif
+
+    return ans;
+}
+
+complex cdotc( int n, complex *x, int incx, complex *y, int incy)   /* same platform split as cdotu, using cdotc_ / cdotcsub_ */
+{
+    complex ans;
+
+    #if defined( _WIN32 ) || defined( _WIN64 )   /* Windows builds: value returned by cdotc_ */
+        ans = cdotc_(&n, x, &incx, y, &incy);
+    #else   /* elsewhere: cdotcsub_ fills `ans` through the pointer */
+        cdotcsub_(&n, x, &incx, y, &incy, &ans);
+    #endif
+
+    return ans;
+}
+
+doublecomplex zdotc( int n, doublecomplex *x, int incx,  doublecomplex *y, int incy)   /* doublecomplex variant of cdotc */
+{
+    doublecomplex ans;
+
+    #if defined( _WIN32 ) || defined( _WIN64 )   /* Windows builds: value returned by zdotc_ */
+        ans = zdotc_(&n, x, &incx, y, &incy);
+    #else   /* elsewhere: zdotcsub_ fills `ans` through the pointer */
+        zdotcsub_(&n, x, &incx, y, &incy, &ans);
+    #endif
+
+    return ans;
+}
+
+void scopy( int n, float *x, int incx,  float *y, int incy)   /* ?copy adapters: n and both increments forwarded by address */
+{
+    scopy_(&n, x, &incx, y, &incy);
+}
+
+void dcopy( int n, double *x, int incx,  double *y, int incy)   /* double-precision variant */
+{
+    dcopy_(&n, x, &incx, y, &incy);
+}
+
+void ccopy( int n, complex *x, int incx,  complex *y, int incy)   /* complex variant */
+{
+    ccopy_(&n, x, &incx, y, &incy);
+}
+
+void zcopy( int n, doublecomplex *x, int incx,  doublecomplex *y, int incy)   /* doublecomplex variant */
+{
+    zcopy_(&n, x, &incx, y, &incy);
+}
+
+void sswap( int n, float *x, int incx,  float *y, int incy)   /* ?swap adapters: n and both increments forwarded by address */
+{
+    sswap_(&n, x, &incx, y, &incy);
+}
+
+void dswap( int n, double *x, int incx,  double *y, int incy)   /* double-precision variant */
+{
+    dswap_(&n, x, &incx, y, &incy);
+}
+
+void cswap( int n, complex *x, int incx,  complex *y, int incy)   /* complex variant */
+{
+    cswap_(&n, x, &incx, y, &incy);
+}
+
+void zswap( int n, doublecomplex *x, int incx,  doublecomplex *y, int incy)   /* doublecomplex variant */
+{
+    zswap_(&n, x, &incx, y, &incy);
+}
+
+void saxpy( int n, float alpha, float *x, int incx,  float *y, int incy)   /* ?axpy adapters: n, alpha and increments forwarded by address */
+{
+    saxpy_(&n, &alpha, x, &incx, y, &incy);
+}
+
+void daxpy( int n, double alpha, double *x, int incx,  double *y, int incy)   /* double-precision variant */
+{
+    daxpy_(&n, &alpha, x, &incx, y, &incy);
+}
+
+void caxpy( int n, complex *alpha, complex *x, int incx,  complex *y, int incy)   /* complex alpha is a pointer, passed through */
+{
+    caxpy_(&n, alpha, x, &incx, y, &incy);
+}
+
+void zaxpy( int n, doublecomplex *alpha, doublecomplex *x, int incx,  doublecomplex *y, int incy)   /* doublecomplex variant */
+{
+    zaxpy_(&n, alpha, x, &incx, y, &incy);
+}
+
+void srotg(float *A, float *B, float *C, float *S)   /* ?rotg adapters: all four pointers are handed to ?rotg_ as-is */
+{
+    srotg_(A, B, C, S);
+}
+
+void drotg(double *A, double *B, double *C, double *S)   /* double-precision variant */
+{
+    drotg_(A, B, C, S);
+}
+
+void crotg(complex *A, complex *B, float *C, complex *S)   /* complex variant: C is a float pointer, the rest complex */
+{
+    crotg_(A, B, C, S);
+}
+
+void zrotg(doublecomplex *A, doublecomplex *B, double *C, doublecomplex *S)   /* doublecomplex variant: C is a double pointer */
+{
+    zrotg_(A, B, C, S);
+}
+
+void srotmg(float *D1, float *D2, float *X1, const float *Y1, float *PARAM)   /* forwards all pointers to srotmg_ */
+{
+    srotmg_(D1, D2, X1, (float*)Y1, PARAM);   /* cast drops const from Y1; presumably srotmg_ is declared without const - confirm */
+}
+
+void drotmg(double *D1, double *D2, double *X1, const double *Y1, double *PARAM)   /* double-precision variant */
+{
+    drotmg_(D1, D2, X1, (double*)Y1, PARAM);   /* same const-dropping cast on Y1 */
+}
+
+void srot(int N, float *x, int incx, float *y, int incy, float c, float s)   /* ?rot adapters: N, increments, c and s forwarded by address */
+{
+    srot_(&N, x, &incx, y, &incy, &c, &s);
+}
+
+void drot(int N, double *x, int incx, double *y, int incy, double c, double s)   /* double-precision variant */
+{
+    drot_(&N, x, &incx, y, &incy, &c, &s);
+}
+
+void csrot(int N, complex *x, int incx, complex *y, int incy, float c, float s)   /* complex vectors with real float c and s */
+{
+    csrot_(&N, x, &incx, y, &incy, &c, &s);
+}
+
+void zdrot(int N, doublecomplex *cx, int incx, doublecomplex *cy, int incy, double c, double s)   /* doublecomplex vectors with real double c and s */
+{
+    zdrot_(&N, cx, &incx, cy, &incy, &c, &s);
+}
+
+void srotm(int N, float *X, int incx, float *Y, int incy, float* PARAM)   /* PARAM pointer is passed through unchanged */
+{
+    srotm_(&N, X, &incx, Y, &incy, PARAM);
+}
+
+void drotm(int N, double *X, int incx, double *Y, int incy, double* PARAM)   /* double-precision variant of srotm */
+{
+    drotm_(&N, X, &incx, Y, &incy, PARAM);
+}
+
+int isamax( int n, float *x, int incx)   /* i?amax adapters: return the underscore routine's int result with no adjustment */
+{
+    return isamax_(&n, x, &incx);
+}
+
+int idamax( int n, double *x, int incx)   /* double-precision variant */
+{
+    return idamax_(&n, x, &incx);
+}
+
+int icamax( int n, complex *x, int incx)   /* complex variant */
+{
+    return icamax_(&n, x, &incx);
+}
+
+int izamax( int n, doublecomplex *x, int incx)   /* doublecomplex variant */
+{
+    return izamax_(&n, x, &incx);
+}
+
+float snrm2( int n, float *x, int incx)   /* ?nrm2 adapters: return the underscore routine's result unchanged */
+{
+    return snrm2_(&n, x, &incx);
+}
+
+double dnrm2( int n, double *x, int incx)   /* double-precision variant */
+{
+    return dnrm2_(&n, x, &incx);
+}
+
+float scnrm2( int n, complex *x, int incx)   /* complex input, float result */
+{
+    return scnrm2_(&n, x, &incx);
+}
+
+double dznrm2( int n, doublecomplex *x, int incx)   /* doublecomplex input, double result */
+{
+    return dznrm2_(&n, x, &incx);
+}
+
+float sasum( int n, float *x, int incx)   /* ?asum adapters: return the underscore routine's result unchanged */
+{
+    return sasum_(&n, x, &incx);
+}
+
+double dasum( int n, double *x, int incx)   /* double-precision variant */
+{
+    return dasum_(&n, x, &incx);
+}
+
+float scasum( int n, complex *x, int incx)   /* complex input, float result */
+{
+    return scasum_(&n, x, &incx);
+}
+
+double dzasum( int n, doublecomplex *x, int incx)   /* doublecomplex input, double result */
+{
+    return dzasum_(&n, x, &incx);
+}
+
+#endif
diff --git a/src/tests/correctness/blas-lapack.h b/src/tests/correctness/blas-lapack.h
new file mode 100644
index 0000000..6dc55ee
--- /dev/null
+++ b/src/tests/correctness/blas-lapack.h
@@ -0,0 +1,1225 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef BLAS_LAPACK_H_
+#define BLAS_LAPACK_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* BLAS-2 functions */
+
+void sgemv_(   /* external routine called by the sgemv adapter; every argument is a pointer */
+    const char *transA,
+    const int *M,
+    const int *N,
+    const float *alpha,
+    const float *A,
+    const int *lda,
+    const float *X,
+    const int *incx,
+    const float *beta,
+    float *Y,   /* the only non-const pointer in this prototype */
+    const int *incy);
+
+void dgemv_(   /* double-precision counterpart of sgemv_ */
+    const char *transA,
+    const int *M,
+    const int *N,
+    const double *alpha,
+    const double *A,
+    const int *lda,
+    const double *X,
+    const int *incx,
+    const double *beta,
+    double *Y,
+    const int *incy);
+
+void cgemv_(   /* complex counterpart of sgemv_ */
+    const char *transA,
+    const int *M,
+    const int *N,
+    const complex *alpha,
+    const complex *A,
+    const int *lda,
+    const complex *X,
+    const int *incx,
+    const complex *beta,
+    complex *Y,
+    const int *incy);
+
+void zgemv_(   /* doublecomplex counterpart of sgemv_ */
+    const char *transA,
+    const int *M,
+    const int *N,
+    const doublecomplex *alpha,
+    const doublecomplex *A,
+    const int *lda,
+    const doublecomplex *X,
+    const int *incx,
+    const doublecomplex *beta,
+    doublecomplex *Y,
+    const int *incy);
+
+void ssymv_(   /* external routine called by the ssymv adapter; every argument is a pointer */
+    const char *uplo,
+    const int *N,
+    const float *alpha,
+    const float *A,
+    const int *lda,
+    const float *X,
+    const int *incx,
+    const float *beta,
+    float *Y,   /* the only non-const pointer in this prototype */
+    const int *incy);   /* const-qualified to match dsymv_ and the other increment parameters */
+
+void dsymv_(   /* external routine called by the dsymv adapter; every argument is a pointer */
+    const char *uplo,
+    const int *N,
+    const double *alpha,
+    const double *A,
+    const int *lda,
+    const double *X,
+    const int *incx,
+    const double *beta,
+    double *Y,   /* the only non-const pointer in this prototype */
+    const int *incy);
+
+/* BLAS-3 functions */
+
+void sgemm_(   /* external routine called by the sgemm adapter; every argument is a pointer */
+    const char *transA,
+    const char *transB,
+    const int *M,
+    const int *N,
+    const int *K,
+    const float *alpha,
+    const float *A,
+    const int *lda,
+    const float *B,
+    const int *ldb,
+    const float *beta,
+    float *C,   /* the only non-const pointer in this prototype */
+    const int *ldc);
+
+void dgemm_(   /* double-precision counterpart of sgemm_ */
+    const char *transA,
+    const char *transB,
+    const int *M,
+    const int *N,
+    const int *K,
+    const double *alpha,
+    const double *A,
+    const int *lda,
+    const double *B,
+    const int *ldb,
+    const double *beta,
+    double *C,
+    const int *ldc);
+
+void cgemm_(   /* complex counterpart of sgemm_ */
+    const char *transA,
+    const char *transB,
+    const int *M,
+    const int *N,
+    const int *K,
+    const complex *alpha,
+    const complex *A,
+    const int *lda,
+    const complex *B,
+    const int *ldb,
+    const complex *beta,
+    complex *C,
+    const int *ldc);
+
+void zgemm_(   /* doublecomplex counterpart of sgemm_ */
+    const char *transA,
+    const char *transB,
+    const int *M,
+    const int *N,
+    const int *K,
+    const doublecomplex *alpha,
+    const doublecomplex *A,
+    const int *lda,
+    const doublecomplex *B,
+    const int *ldb,
+    const doublecomplex *beta,
+    doublecomplex *C,
+    const int *ldc);
+
+void strmm_(   /* external routine called by the strmm adapter; every argument is a pointer */
+    const char *side,
+    const char *uplo,
+    const char *transA,
+    const char *diag,
+    const int *M,
+    const int *N,
+    const float *alpha,
+    const float *A,
+    const int *lda,
+    float *B,   /* the only non-const pointer in this prototype */
+    const int *ldb);
+
+void dtrmm_(   /* double-precision counterpart of strmm_ */
+    const char *side,
+    const char *uplo,
+    const char *transA,
+    const char *diag,
+    const int *M,
+    const int *N,
+    const double *alpha,
+    const double *A,
+    const int *lda,
+    double *B,
+    const int *ldb);
+
+void ctrmm_(   /* complex counterpart of strmm_ */
+    const char *side,
+    const char *uplo,
+    const char *transA,
+    const char *diag,
+    const int *M,
+    const int *N,
+    const complex *alpha,
+    const complex *A,
+    const int *lda,
+    complex *B,
+    const int *ldb);
+
+void ztrmm_(   /* doublecomplex counterpart of strmm_ */
+    const char *side,
+    const char *uplo,
+    const char *transA,
+    const char *diag,
+    const int *M,
+    const int *N,
+    const doublecomplex *alpha,
+    const doublecomplex *A,
+    const int *lda,
+    doublecomplex *B,
+    const int *ldb);
+
+void strsm_(   /* external routine called by the strsm adapter; every argument is a pointer */
+    const char *side,
+    const char *uplo,
+    const char *transA,
+    const char *diag,
+    const int *M,
+    const int *N,
+    const float *alpha,   /* spelling corrected from "aplha" to match dtrsm_/ctrsm_/ztrsm_ */
+    const float *A,
+    const int *lda,
+    float *B,   /* the only non-const pointer in this prototype */
+    const int *ldb);
+
+void dtrsm_(
+    const char *side,
+    const char *uplo,
+    const char *transA,
+    const char *diag,
+    const int *M,
+    const int *N,
+    const double *alpha,
+    const double *A,
+    const int *lda,
+    double *B,
+    const int *ldb);
+
+void ctrsm_(
+    const char *side,
+    const char *uplo,
+    const char *transA,
+    const char *diag,
+    const int *M,
+    const int *N,
+    const complex *alpha,
+    const complex *A,
+    const int *lda,
+    complex *B,
+    const int *ldb);
+
+void ztrsm_(
+    const char *side,
+    const char *uplo,
+    const char *transA,
+    const char *diag,
+    const int *M,
+    const int *N,
+    const doublecomplex *alpha,
+    const doublecomplex *A,
+    const int *lda,
+    doublecomplex *B,
+    const int *ldb);
+
+void ssyr2k_(
+    const char *uplo,
+    const char *transA,
+    const int *N,
+    const int *K,
+    const float *alpha,
+    const float *A,
+    const int *lda,
+    const float *B,
+    const int *ldb,
+    const float *beta,
+    float *C,
+    const int *ldc);
+
+void dsyr2k_(
+    const char *uplo,
+    const char *transA,
+    const int *N,
+    const int *K,
+    const double *alpha,
+    const double *A,
+    const int *lda,
+    const double *B,
+    const int *ldb,
+    const double *beta,
+    double *C,
+    const int *ldc);
+
+void csyr2k_(
+    const char *uplo,
+    const char *transA,
+    const int *N,
+    const int *K,
+    const complex *alpha,
+    const complex *A,
+    const int *lda,
+    const complex *B,
+    const int *ldb,
+    const complex *beta,
+    complex *C,
+    const int *ldc);
+
+void zsyr2k_(
+    const char *uplo,
+    const char *transA,
+    const int *N,
+    const int *K,
+    const doublecomplex *alpha,
+    const doublecomplex *A,
+    const int *lda,
+    const doublecomplex *B,
+    const int *ldb,
+    const doublecomplex *beta,
+    doublecomplex *C,
+    const int *ldc);
+
+void ssyrk_(
+    const char *uplo,
+    const char *transA,
+    const int *N,
+    const int *K,
+    const float *alpha,
+    const float *A,
+    const int *lda,
+    const float *beta,
+    float *C,
+    const int *ldc);
+
+void dsyrk_(
+    const char *uplo,
+    const char *transA,
+    const int *N,
+    const int *K,
+    const double *alpha,
+    const double *A,
+    const int *lda,
+    const double *beta,
+    double *C,
+    const int *ldc);
+
+void csyrk_(
+    const char *uplo,
+    const char *transA,
+    const int *N,
+    const int *K,
+    const complex *alpha,
+    const complex *A,
+    const int *lda,
+    const complex *beta,
+    complex *C,
+    const int *ldc);
+
+void zsyrk_(
+    const char *uplo,
+    const char *transA,
+    const int *N,
+    const int *K,
+    const doublecomplex *alpha,
+    const doublecomplex *A,
+    const int *lda,
+    const doublecomplex *beta,
+    doublecomplex *C,
+    const int *ldc);
+
+void strmv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const float *a,
+    const int *lda,
+    float *x,
+    const int *incx);
+
+void dtrmv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const double *a,
+    const int *lda,
+    double *x,
+    const int *incx);
+
+void ctrmv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const complex *a,
+    const int *lda,
+    complex *x,
+    const int *incx);
+
+void ztrmv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const doublecomplex *a,
+    const int *lda,
+    doublecomplex *x,
+    const int *incx);
+
+void strsv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const float *a,
+    const int *lda,
+    float *x,
+    const int *incx);
+
+void dtrsv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const double *a,
+    const int *lda,
+    double *x,
+    const int *incx);
+
+void ctrsv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const complex *a,
+    const int *lda,
+    complex *x,
+    const int *incx);
+
+void ztrsv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const doublecomplex *a,
+    const int *lda,
+    doublecomplex *x,
+    const int *incx);
+
+void ssymm_(
+    const char *side,
+    const char *uplo,
+    const int *m,
+    const int *n,
+    const float *alpha,
+    const float *a,
+    const int *lda,
+    const float *b,
+    const int *ldb,
+    const float *beta,
+    float *c,
+    const int *ldc);
+
+void dsymm_(
+    const char *side,
+    const char *uplo,
+    const int *m,
+    const int *n,
+    const double *alpha,
+    const double *a,
+    const int *lda,
+    const double *b,
+    const int *ldb,
+    const double *beta,
+    double *c,
+    const int *ldc);
+
+void csymm_(
+    const char *side,
+    const char *uplo,
+    const int *m,
+    const int *n,
+    const complex *alpha,
+    const complex *a,
+    const int *lda,
+    const complex *b,
+    const int *ldb,
+    const complex *beta,
+    complex *c,
+    const int *ldc);
+
+void zsymm_(
+    const char *side,
+    const char *uplo,
+    const int *m,
+    const int *n,
+    const doublecomplex *alpha,
+    const doublecomplex *a,
+    const int *lda,
+    const doublecomplex *b,
+    const int *ldb,
+    const doublecomplex *beta,
+    doublecomplex *c,
+    const int *ldc);
+
+void sger_(
+    const int *m,
+    const int *n,
+    const float *alpha,
+    const float *x,
+    const int *incx,
+    const float *y,
+    const int *incy,
+    float *a,
+    const int *lda);
+
+void dger_(
+    const int *m,
+    const int *n,
+    const double *alpha,
+    const double *x,
+    const int *incx,
+    const double *y,
+    const int *incy,
+    double *a,
+    const int *lda);
+
+void cgeru_(
+    const int *m,
+    const int *n,
+    const complex *alpha,
+    const complex *x,
+    const int *incx,
+    const complex *y,
+    const int *incy,
+    complex *a,
+    const int *lda);
+
+void zgeru_(
+    const int *m,
+    const int *n,
+    const doublecomplex *alpha,
+    const doublecomplex *x,
+    const int *incx,
+    const doublecomplex *y,
+    const int *incy,
+    doublecomplex *a,
+    const int *lda);
+
+void cgerc_(
+    const int *m,
+    const int *n,
+    const complex *alpha,
+    const complex *x,
+    const int *incx,
+    const complex *y,
+    const int *incy,
+    complex *a,
+    const int *lda);
+
+void zgerc_(
+    const int *m,
+    const int *n,
+    const doublecomplex *alpha,
+    const doublecomplex *x,
+    const int *incx,
+    const doublecomplex *y,
+    const int *incy,
+    doublecomplex *a,
+    const int *lda);
+
+void ssyr_(
+    const char *uplo,
+    const int *n,
+    const float *alpha,
+    const float *x,
+    const int *incx,
+    float *a,
+    const int *lda);
+
+void dsyr_(
+    const char *uplo,
+    const int *n,
+    const double *alpha,
+    const double *x,
+    const int *incx,
+    double *a,
+    const int *lda);
+
+void ssyr2_(
+    const char *uplo,
+    const int *n,
+    const float *alpha,
+    const float *x,
+    const int *incx,
+    const float *y,
+    const int *incy,
+    float *a,
+    const int *lda);
+
+void dsyr2_(
+    const char *uplo,
+    const int *n,
+    const double *alpha,
+    const double *x,
+    const int *incx,
+    const double *y,
+    const int *incy,
+    double *a,
+    const int *lda);
+
+void cher_(
+    const char *uplo,
+    const int *n,
+    const float *alpha,
+    const complex *x,
+    const int *incx,
+    complex *a,
+    const int *lda);
+
+void zher_(
+    const char *uplo,
+    const int *n,
+    const double *alpha,
+    const doublecomplex *x,
+    const int *incx,
+    doublecomplex *a,
+    const int *lda);
+
+void cher2_(
+    const char *uplo,
+    const int *n,
+    const complex *alpha,
+    const complex *x,
+    const int *incx,
+    const complex *y,
+    const int *incy,
+    complex *a,
+    const int *lda);
+
+void zher2_(
+    const char *uplo,
+    const int *n,
+    const doublecomplex *alpha,
+    const doublecomplex *x,
+    const int *incx,
+    const doublecomplex *y,
+    const int *incy,
+    doublecomplex *a,
+    const int *lda);
+
+void chemv_(
+    const char *uplo,
+    const int *n,
+    const complex *alpha,
+    const complex *a,
+    const int *lda,
+    const complex *x,
+    const int *incx,
+    const complex *beta,
+    complex *y,
+    const int *incy);
+
+void zhemv_(
+    const char *uplo,
+    const int *n,
+    const doublecomplex *alpha,
+    const doublecomplex *a,
+    const int *lda,
+    const doublecomplex *x,
+    const int *incx,
+    const doublecomplex *beta,
+    doublecomplex *y,
+    const int *incy);
+
+void stpmv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const float *ap,
+    float *x,
+    const int *incx);
+
+void dtpmv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const double *ap,
+    double *x,
+    const int *incx);
+
+void ctpmv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const complex *ap,
+    complex *x,
+    const int *incx);
+
+void ztpmv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const doublecomplex *ap,
+    doublecomplex *x,
+    const int *incx);
+
+void stpsv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const float *ap,
+    float *x,
+    const int *incx);
+
+void dtpsv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const double *ap,
+    double *x,
+    const int *incx);
+
+void ctpsv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const complex *ap,
+    complex *x,
+    const int *incx);
+
+void ztpsv_(
+    const char *uplo,
+    const char *transa,
+    const char *diag,
+    const int *n,
+    const doublecomplex *ap,
+    doublecomplex *x,
+    const int *incx);
+
+void sspr_(
+    const char *uplo,
+    const int *n,
+    const float *alpha,
+    const float *x,
+    const int *incx,
+    float *ap);
+
+void dspr_(
+    const char *uplo,
+    const int *n,
+    const double *alpha,
+    const double *x,
+    const int *incx,
+    double *ap);
+
+void
+sspmv_(
+    const char *uplo,
+    const int *n,
+    const float *alpha,
+    const float *ap,
+    const float *x,
+    const int *incx,
+    const float *beta,
+    float *y,
+    const int *incy);
+
+void
+dspmv_(
+    const char *uplo,
+    const int *n,
+    const double *alpha,
+    const double *ap,
+    const double *x,
+    const int *incx,
+    const double *beta,
+    double *y,
+    const int *incy);
+
+void
+chpmv_(
+    const char *uplo,
+    const int *n,
+    const complex *alpha,
+    const complex *ap,
+    const complex *x,
+    const int *incx,
+    const complex *beta,
+    complex *y,
+    const int *incy);
+
+void
+zhpmv_(
+    const char *uplo,
+    const int *n,
+    const doublecomplex *alpha,
+    const doublecomplex *ap,
+    const doublecomplex *x,
+    const int *incx,
+    const doublecomplex *beta,
+    doublecomplex *y,
+    const int *incy);
+
+void chpr_(
+    const char *uplo,
+    const int *n,
+    const float *alpha,
+    const complex *x,
+    const int *incx,
+    complex *ap);
+
+void zhpr_(
+    const char *uplo,
+    const int *n,
+    const double *alpha,
+    const doublecomplex *x,
+    const int *incx,
+    doublecomplex *ap);
+
+void sspr2_(            /* SSPR2 prototype; every argument is passed by pointer */
+    const char *uplo,
+    const int *n,
+    const float *alpha,
+    const float *x,
+    const int *incx,
+    const float *y,
+    const int *incy,
+    float *ap );         /* packed matrix (AP in reference BLAS), named as in sspr_ */
+
+void dspr2_(            /* DSPR2 prototype; every argument is passed by pointer */
+    const char *uplo,
+    const int *n,
+    const double *alpha,
+    const double *x,
+    const int *incx,
+    const double *y,
+    const int *incy,
+    double *ap );        /* packed matrix (AP in reference BLAS), named as in dspr_ */
+
+void chpr2_(            /* CHPR2 prototype; every argument is passed by pointer */
+    const char *uplo,
+    const int *n,
+    const complex *alpha,
+    const complex *x,
+    const int *incx,
+    const complex *y,
+    const int *incy,
+    complex *ap );       /* packed matrix (AP in reference BLAS), named as in chpr_ */
+
+void zhpr2_(            /* ZHPR2 prototype; every argument is passed by pointer */
+    const char *uplo,
+    const int *n,
+    const doublecomplex *alpha,
+    const doublecomplex *x,
+    const int *incx,
+    const doublecomplex *y,
+    const int *incy,
+    doublecomplex *ap ); /* packed matrix (AP in reference BLAS), named as in zhpr_ */
+
+void sgbmv_(            /* SGBMV prototype; every argument is passed by pointer */
+    const char *trans,
+    const int *m,
+    const int *n,
+    const int *kl,
+    const int *ku,
+    const float *alpha,
+    const float *a,
+    const int *lda,      /* leading dimension of a (LDA in reference BLAS), not an increment */
+    const float *x,
+    const int *incx,
+    const float *beta,
+    float *y,
+    const int *incy );
+
+void dgbmv_(            /* DGBMV prototype; every argument is passed by pointer */
+    const char *trans,
+    const int *m,
+    const int *n,
+    const int *kl,
+    const int *ku,
+    const double *alpha,
+    const double *a,
+    const int *lda,      /* leading dimension of a (LDA in reference BLAS), not an increment */
+    const double *x,
+    const int *incx,
+    const double *beta,
+    double *y,
+    const int *incy );
+
+void cgbmv_(            /* CGBMV prototype; every argument is passed by pointer */
+    const char *trans,
+    const int *m,
+    const int *n,
+    const int *kl,
+    const int *ku,
+    const complex *alpha,
+    const complex *a,
+    const int *lda,      /* leading dimension of a (LDA in reference BLAS), not an increment */
+    const complex *x,
+    const int *incx,
+    const complex *beta,
+    complex *y,
+    const int *incy );
+
+void zgbmv_(            /* ZGBMV prototype; every argument is passed by pointer */
+    const char *trans,
+    const int *m,
+    const int *n,
+    const int *kl,
+    const int *ku,
+    const doublecomplex *alpha,
+    const doublecomplex *a,
+    const int *lda,      /* leading dimension of a (LDA in reference BLAS), not an increment */
+    const doublecomplex *x,
+    const int *incx,
+    const doublecomplex *beta,
+    doublecomplex *y,
+    const int *incy );
+
+void stbmv_(
+    const char *uplo,
+    const char *trans,
+    const char *diag,
+    const int *n,
+    const int *k,
+    const float *a,
+    const int *lda,
+    float *x,
+    const int *incx );
+
+void dtbmv_(
+    const char *uplo,
+    const char *trans,
+    const char *diag,
+    const int *n,
+    const int *k,
+    const double *a,
+    const int *lda,
+    double *x,
+    const int *incx );
+
+void ctbmv_(
+    const char *uplo,
+    const char *trans,
+    const char *diag,
+    const int *n,
+    const int *k,
+    const complex *a,
+    const int *lda,
+    complex *x,
+    const int *incx );
+
+void ztbmv_(
+    const char *uplo,
+    const char *trans,
+    const char *diag,
+    const int *n,
+    const int *k,
+    const doublecomplex *a,
+    const int *lda,
+    doublecomplex *x,
+    const int *incx );
+
+void ssbmv_(
+    const char *uplo,
+    const int *n,
+    const int *k,
+    const float *alpha,
+    const float *a,
+    const int *lda,
+    const float *x,
+    const int *incx,
+    const float *beta,
+    float *y,
+    const int *incy );
+
+void dsbmv_(
+    const char *uplo,
+    const int *n,
+    const int *k,
+    const double *alpha,
+    const double *a,
+    const int *lda,
+    const double *x,
+    const int *incx,
+    const double *beta,
+    double *y,
+    const int *incy );
+
+void chbmv_(
+    const char *uplo,
+    const int *n,
+    const int *k,
+    const complex *alpha,
+    const complex *a,
+    const int *lda,
+    const complex *x,
+    const int *incx,
+    const complex *beta,
+    complex *y,
+    const int *incy );
+
+void zhbmv_(
+    const char *uplo,
+    const int *n,
+    const int *k,
+    const doublecomplex *alpha,
+    const doublecomplex *a,
+    const int *lda,
+    const doublecomplex *x,
+    const int *incx,
+    const doublecomplex *beta,
+    doublecomplex *y,
+    const int *incy );
+
+void stbsv_(
+    const char *uplo,
+    const char *trans,
+    const char *diag,
+    const int *n,
+    const int *k,
+    const float *a,
+    const int *lda,
+    float *x,
+    const int *incx );
+
+void dtbsv_(
+    const char *uplo,
+    const char *trans,
+    const char *diag,
+    const int *n,
+    const int *k,
+    const double *a,
+    const int *lda,
+    double *x,
+    const int *incx );
+
+void ctbsv_(
+    const char *uplo,
+    const char *trans,
+    const char *diag,
+    const int *n,
+    const int *k,
+    const complex *a,
+    const int *lda,
+    complex *x,
+    const int *incx );
+
+void ztbsv_(
+    const char *uplo,
+    const char *trans,
+    const char *diag,
+    const int *n,
+    const int *k,
+    const doublecomplex *a,
+    const int *lda,
+    doublecomplex *x,
+    const int *incx );
+
+void chemm_(
+    const char *side,
+    const char *uplo,
+    const int *m,
+    const int *n,
+    const complex *alpha,
+    const complex *a,
+    const int *lda,
+    const complex *b,
+    const int *ldb,
+    const complex *beta,
+    complex *c,
+    const int *ldc);
+
+void zhemm_(
+    const char *side,
+    const char *uplo,
+    const int *m,
+    const int *n,
+    const doublecomplex *alpha,
+    const doublecomplex *a,
+    const int *lda,
+    const doublecomplex *b,
+    const int *ldb,
+    const doublecomplex *beta,
+    doublecomplex *c,
+    const int *ldc);
+
+void cherk_(
+    const char *uplo,
+    const char *transa,
+    const int *n,
+    const int *k,
+    const float *alpha,
+    const complex *a,
+    const int *lda,
+    const float *beta,
+    complex *c,
+    const int *ldc);
+
+void zherk_(
+    const char *uplo,
+    const char *transa,
+    const int *n,
+    const int *k,
+    const double *alpha,
+    const doublecomplex *a,
+    const int *lda,
+    const double *beta,
+    doublecomplex *c,
+    const int *ldc);
+
+void cher2k_(
+    const char *uplo,
+    const char *transa,
+    const int *n,
+    const int *k,
+    const complex *alpha,
+    const complex *a,
+    const int *lda,
+    const complex *b,
+    const int *ldb,
+    const float *beta,
+    complex *c,
+    const int *ldc);
+
+void zher2k_(
+    const char *uplo,
+    const char *transa,
+    const int *n,
+    const int *k,
+    const doublecomplex *alpha,
+    const doublecomplex *a,
+    const int *lda,
+    const doublecomplex *b,
+    const int *ldb,
+    const double *beta,
+    doublecomplex *c,
+    const int *ldc);
+
+void sscal_(int *n, float *alpha, float *x, int *incx);
+void dscal_(int *n, double *alpha, double *x, int *incx);
+void cscal_(int *n, complex *alpha, complex *x, int *incx);
+void zscal_(int *n, doublecomplex *alpha, doublecomplex *x, int *incx);
+
+void csscal_(int *n, float *alpha, complex *x, int *incx);
+void zdscal_(int *n, double *alpha, doublecomplex *x, int *incx);
+
+void scopy_(int *n, float *x, int *incx, float* y, int *incy);
+void dcopy_(int *n, double *x, int *incx, double* y, int *incy);
+void ccopy_(int *n, complex *x, int *incx, complex *y, int *incy);
+void zcopy_(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy);
+
+float sdot_(int *n, float *x, int *incx, float* y, int *incy);
+double ddot_(int *n, double *x, int *incx, double* y, int *incy);
+
+#if defined( _WIN32 ) || defined( _WIN64 )  /* Windows: complex dot products are declared as returning the result by value */
+    complex cdotu_(int *n, complex *x, int *incx, complex* y, int *incy);
+    doublecomplex zdotu_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy);
+    complex cdotc_(int *n, complex *x, int *incx, complex* y, int *incy);
+    doublecomplex zdotc_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy);
+#else  /* elsewhere: the *sub_ variants return void and take a trailing `ans` output pointer */
+    void cdotusub_(int *n, complex *x, int *incx, complex* y, int *incy, complex *ans);
+    void zdotusub_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy, doublecomplex *ans);
+    void cdotcsub_(int *n, complex *x, int *incx, complex* y, int *incy, complex *ans);
+    void zdotcsub_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy, doublecomplex *ans);
+#endif
+
+void sswap_(int *n, float *x, int *incx, float* y, int *incy);
+void dswap_(int *n, double *x, int *incx, double* y, int *incy);
+void cswap_(int *n, complex *x, int *incx, complex *y, int *incy);
+void zswap_(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy);
+
+void saxpy_(int *n, float *alpha, float *x, int *incx, float* y, int *incy);
+void daxpy_(int *n, double *alpha, double *x, int *incx, double* y, int *incy);
+void caxpy_(int *n, complex *alpha, complex *x, int *incx, complex *y, int *incy);
+void zaxpy_(int *n, doublecomplex *alpha, doublecomplex *x, int *incx, doublecomplex *y, int *incy);
+
+
+void srotg_(float *A, float *B, float *C, float *S);
+void drotg_(double *A, double *B, double *C, double *S);
+void crotg_(complex *A, complex *B, float *C, complex *S);
+void zrotg_(doublecomplex *A, doublecomplex *B, double *C, doublecomplex *S);
+
+void srotmg_(float *D1, float *D2, float *X1, float *Y1, float *PARAM);
+void drotmg_(double *D1, double *D2, double *X1, double *Y1, double *PARAM);
+
+void srot_(int *n, float *x, int *incx, float *y, int *incy, float *c, float *s);
+void drot_(int *n, double *x, int *incx, double *y, int *incy, double *c, double *s);
+void csrot_(int *n, complex *x, int *incx, complex *y, int *incy, float *c, float *s);
+void zdrot_(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy, double *c, double *s);
+
+void srotm_(int* N, float *X, int* incx, float *Y, int* incy, float* PARAM);
+void drotm_(int* N, double *X, int* incx, double *Y, int* incy, double* PARAM);
+
+float sasum_(int *n, float *x, int *incx);
+double dasum_(int *n, double *x, int *incx);
+float scasum_(int *n, complex *x, int *incx);
+double dzasum_(int *n, doublecomplex *x, int *incx);
+
+int isamax_(int *n, float *x, int *incx);
+int idamax_(int *n, double *x, int *incx);
+int icamax_(int *n, complex *x, int *incx);
+int izamax_(int *n, doublecomplex *x, int *incx);
+
+float snrm2_(int *n, float *x, int *incx);
+double dnrm2_(int *n, double *x, int *incx);
+float scnrm2_(int *n, complex *x, int *incx);
+double dznrm2_(int *n, doublecomplex *x, int *incx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* BLAS_LAPACK_H */
diff --git a/src/tests/correctness/corr-asum.cpp b/src/tests/correctness/corr-asum.cpp
new file mode 100644
index 0000000..81da8e0
--- /dev/null
+++ b/src/tests/correctness/corr-asum.cpp
@@ -0,0 +1,212 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memset()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <asum.h>
+
+// Releases the three OpenCL buffers handed in (X, asum result, scratch), in
+// that order. Handles that are NULL are skipped.
+static void
+releaseMemObjects(cl_mem objX, cl_mem objAsum, cl_mem objScratch)
+{
+    cl_mem buffers[] = { objX, objAsum, objScratch };
+
+    for (size_t i = 0; i < sizeof(buffers) / sizeof(buffers[0]); i++)
+    {
+        if (buffers[i] == NULL)
+        {
+            continue;
+        }
+        clReleaseMemObject(buffers[i]);
+    }
+}
+
+// Frees up to three host-side arrays; NULL arguments are ignored.
+//
+// Every pointer the callers in this file pass in comes from new[] (including
+// the one-element blasAsum buffer), so all three are released with delete[].
+// Releasing a new[] array with scalar delete is undefined behaviour.
+//
+// Note that callers pass (blasAsum, clblasAsum) positionally as the first two
+// arguments, so the second parameter can receive a multi-element array.
+template <typename T> static void
+deleteBuffers(T *blasX, T *blasAsum=NULL, T *clblasAsum=NULL)
+{
+    // delete[] on a null pointer is a no-op, so no explicit NULL guards are needed.
+    delete[] blasX;
+    delete[] clblasAsum;
+    delete[] blasAsum;
+}
+
+// Correctness test for xASUM. Fills X with random data, computes the
+// reference result with ::clMath::blas::asum and the device result with
+// ::clMath::clblas::asum, then compares the two. T1 is the element type of
+// X, T2 the type of the sum.
+template <typename T1, typename T2>
+void
+asumCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T1 *blasX;
+    T2 *clblasAsum, *blasAsum;
+    cl_mem bufX, bufAsum, scratchBuff;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T1) == typeid(cl_double) ||
+         typeid(T1) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Number of host elements spanned by N entries taken with stride |incx|.
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+
+    blasX = new T1[lengthX + params->offBX ];
+    blasAsum = new T2[1];
+    clblasAsum = new T2[1 + params->offa];
+
+    if((blasX == NULL) || (clblasAsum == NULL) || (blasAsum == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T1>(blasX);
+        deleteBuffers<T2>(blasAsum,  clblasAsum);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+    randomVectors<T1>(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE);
+    bufAsum = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE);
+    scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T1)), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xASUM routine... ";
+    *blasAsum = ::clMath::blas::asum( params->N, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufX == NULL) || (bufAsum == NULL) || (scratchBuff == NULL)) {
+        releaseMemObjects(bufX, bufAsum, scratchBuff);
+        deleteBuffers<T1>(blasX);
+        deleteBuffers<T2>(blasAsum, clblasAsum);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xASUM routine... ";
+    DataType type;
+    type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE;
+
+    // Should use bufXTemp as well
+    err = (cl_int)::clMath::clblas::asum( type, params->N,  bufAsum, params->offa, bufX,
+                        params->offBX, params->incx, scratchBuff, params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufAsum, scratchBuff);
+        deleteBuffers<T1>(blasX );
+        deleteBuffers<T2>(blasAsum, clblasAsum);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ASUM() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufAsum, scratchBuff);
+        deleteBuffers<T1>(blasX );
+        deleteBuffers<T2>(blasAsum, clblasAsum);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufAsum, CL_TRUE, 0,
+        (1 + params->offa) * sizeof(*clblasAsum), clblasAsum, 0,
+        NULL, NULL);
+    releaseMemObjects(bufX, bufAsum, scratchBuff);
+    // clblasAsum is never initialised on the host, so it only holds a result if
+    // the read-back succeeded: report a failed read and skip the comparison.
+    EXPECT_EQ(CL_SUCCESS, err) << "ASUM: Reading results failed....";
+    if (err == CL_SUCCESS) {
+        compareMatrices<T2>(clblasColumnMajor, 1 , 1, (blasAsum), (clblasAsum+params->offa), 1);
+    }
+    deleteBuffers<T1>(blasX);
+    deleteBuffers<T2>(blasAsum, clblasAsum);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(ASUM, sasum) {
+    TestParams params;
+    // Fill params via getParams(), then run the cl_float input / cl_float result case.
+    getParams(&params);
+    asumCorrectnessTest<cl_float, cl_float>(&params);
+}
+
+TEST_P(ASUM, dasum) {
+    TestParams params;
+    // Fill params via getParams(), then run the cl_double input / cl_double result case.
+    getParams(&params);
+    asumCorrectnessTest<cl_double, cl_double>(&params);
+}
+
+TEST_P(ASUM, scasum) {
+    TestParams params;
+    // Fill params via getParams(), then run the FloatComplex input / cl_float result case.
+    getParams(&params);
+    asumCorrectnessTest<FloatComplex, cl_float>(&params);
+}
+
+TEST_P(ASUM, dzasum) {
+    TestParams params;
+    // Fill params via getParams(), then run the DoubleComplex input / cl_double result case.
+    getParams(&params);
+    asumCorrectnessTest<DoubleComplex, cl_double>(&params);
+}
diff --git a/src/tests/correctness/corr-axpy.cpp b/src/tests/correctness/corr-axpy.cpp
new file mode 100644
index 0000000..c5816bc
--- /dev/null
+++ b/src/tests/correctness/corr-axpy.cpp
@@ -0,0 +1,217 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <axpy.h>
+
+// Release the X and Y device buffers of the AXPY test.
+// A NULL handle is skipped, so this may be called when buffer
+// creation failed for one or both objects. X is released before Y.
+static void
+releaseMemObjects(cl_mem objX,  cl_mem objY)
+{
+    cl_mem handles[2] = { objX, objY };
+    for (size_t i = 0; i < 2; i++) {
+        if (handles[i] != NULL)
+            clReleaseMemObject(handles[i]);
+    }
+}
+
+// Free the four host-side arrays used by the AXPY correctness test.
+//
+//   X, Y         - arrays used for the OpenCL implementation
+//   blasX, blasY - arrays used for the reference implementation
+//
+// Each pointer must be NULL or have been obtained from new[], which
+// is how the test allocates them. Applying delete[] to a null pointer
+// is a no-op, so the arrays are released unconditionally, in the
+// order X, blasX, Y, blasY.
+template <typename T> static void
+deleteBuffers(T *X, T *Y,  T *blasX, T *blasY)
+{
+    // Both copies of the X vector.
+    delete[] X;
+    delete[] blasX;
+
+    // Both copies of the Y vector.
+    delete[] Y;
+    delete[] blasY;
+}
+
+// Correctness test for xAXPY with element type T: the reference routine and
+// the clblas routine run on identical random data and the Y vectors are compared.
+template <typename T>
+void
+axpyCorrectnessTest(TestParams *params)
+{
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+
+    // Double precision element types are skipped on devices without support.
+    const bool needsDouble = (typeid(T) == typeid(cl_double)) ||
+                             (typeid(T) == typeid(DoubleComplex));
+    if (needsDouble && !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zero-initialized.
+    cl_event *events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Strided extent of each vector, and total element count with offset.
+    const size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    const size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));
+    const size_t totalX = lengthX + params->offBX;
+    const size_t totalY = lengthY + params->offCY;
+
+    T *X = new T[totalX];          // For OpenCL implementation
+    T *Y = new T[totalY];
+    T *blasX = new T[totalX];      // For reference implementation
+    T *blasY = new T[totalY];
+
+    if (!X || !blasX || !Y || !blasY)
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(X, Y, blasX, blasY);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // X and Y are populated past their offsets, then mirrored into the
+    // reference copies so both implementations start from the same data.
+    randomVectors(params->N, (X + params->offBX), params->incx,
+                  (Y + params->offCY), params->incy);
+    memcpy(blasX, X, totalX * sizeof(T));
+    memcpy(blasY, Y, totalY * sizeof(T));
+    T alpha = convertMultiplier<T>(params->alpha);
+
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Device buffers are created from the whole host arrays, offsets included.
+    cl_mem bufX = base->createEnqueueBuffer(X, totalX * sizeof(T), 0, CL_MEM_READ_ONLY);
+    cl_mem bufY = base->createEnqueueBuffer(Y, totalY * sizeof(T), 0, CL_MEM_READ_WRITE);
+
+    if (!bufX || !bufY) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(X, Y, blasX, blasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling reference xAXPY routine... ";
+
+    ::clMath::blas::axpy((size_t)params->N, alpha, blasX, (size_t)params->offBX, params->incx,
+                         blasY, (size_t)params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+
+    ::std::cerr << "Calling clblas xAXPY routine... ";
+
+    cl_int err = (cl_int)::clMath::clblas::axpy(params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy,
+                                          params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        // Free every resource first: the failing ASSERT_EQ leaves the function.
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(X, Y, blasX, blasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::AXPY() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(X, Y, blasX, blasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the full Y buffer back into the host copy.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        totalY * sizeof(T), Y, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "AXPY: Reading results failed...." << std::endl;
+    }
+
+    releaseMemObjects(bufX, bufY);
+
+    compareMatrices<T>(clblasRowMajor, lengthY , 1, (blasY + params->offCY), (Y + params->offCY), 1);
+    deleteBuffers<T>(X, Y, blasX, blasY);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(AXPY, saxpy) {
+    // saxpy: AXPY correctness test with T = cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    axpyCorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(AXPY, daxpy) {
+    // daxpy: AXPY correctness test with T = cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    axpyCorrectnessTest<cl_double>(&testArgs);
+}
+
+TEST_P(AXPY, caxpy) {
+    // caxpy: AXPY correctness test with T = FloatComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    axpyCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+TEST_P(AXPY, zaxpy) {
+    // zaxpy: AXPY correctness test with T = DoubleComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    axpyCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-copy.cpp b/src/tests/correctness/corr-copy.cpp
new file mode 100644
index 0000000..2ee46c0
--- /dev/null
+++ b/src/tests/correctness/corr-copy.cpp
@@ -0,0 +1,211 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <copy.h>
+
+// Release the two device buffers of the COPY test.
+// A NULL handle is left alone, so the function may be called after a
+// failed buffer creation. X is released before Y.
+static void
+releaseMemObjects(cl_mem objX, cl_mem objY)
+{
+    if (objX)
+        clReleaseMemObject(objX);
+
+    if (objY)
+        clReleaseMemObject(objY);
+}
+
+// Free the host-side arrays used by the COPY correctness test.
+//
+//   blasX   - X vector
+//   blasY   - Y vector passed to the reference routine
+//   clblasY - Y vector read back from the device after the clblas call
+//
+// Each pointer must be NULL or come from new[]; delete[] on a null
+// pointer is a no-op, so the arrays are released unconditionally.
+template <typename T> static void
+deleteBuffers(T *blasX, T *blasY, T *clblasY)
+{
+    T *arrays[] = { blasX, blasY, clblasY };
+
+    for (size_t i = 0; i < sizeof(arrays) / sizeof(arrays[0]); ++i)
+        delete[] arrays[i];
+}
+
+// Correctness test for xCOPY with element type T: the reference routine and
+// the clblas routine run on identical random data and the Y vectors are compared.
+template <typename T>
+void
+copyCorrectnessTest(TestParams *params)
+{
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+
+    // Double precision element types are skipped on devices without support.
+    const bool needsDouble = (typeid(T) == typeid(cl_double)) ||
+                             (typeid(T) == typeid(DoubleComplex));
+    if (needsDouble && !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zero-initialized.
+    cl_event *events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Strided extent of each vector, and total element count with offset.
+    const size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    const size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));
+    const size_t totalX = lengthX + params->offBX;
+    const size_t totalY = lengthY + params->offCY;
+
+    T *blasX = new T[totalX];
+    T *blasY = new T[totalY];
+    T *clblasY = new T[totalY];
+
+    if (!blasX || !blasY || !clblasY)
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(blasX, blasY, clblasY);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Populate blasX and blasY past their offsets; clblasY starts as a copy of blasY.
+    randomVectors( params->N, (blasX+params->offBX), params->incx, (blasY+params->offCY), params->incy );
+    memcpy(clblasY, blasY, totalY * sizeof(*blasY));
+
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Device buffers are created from blasX/blasY before the reference call touches blasY.
+    cl_mem bufX = base->createEnqueueBuffer(blasX, totalX * sizeof(*blasX), 0, CL_MEM_READ_WRITE);
+    cl_mem bufY = base->createEnqueueBuffer(blasY, totalY * sizeof(*blasY), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xCOPY routine... ";
+
+    ::clMath::blas::copy( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if (!bufX || !bufY) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(blasX, blasY, clblasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xCOPY routine... ";
+
+    // Map T onto the DataType tag passed to the clblas wrapper; any other T is complex double.
+    DataType type = TYPE_COMPLEX_DOUBLE;
+    if (typeid(T) == typeid(cl_float)) type = TYPE_FLOAT;
+    else if (typeid(T) == typeid(cl_double)) type = TYPE_DOUBLE;
+    else if (typeid(T) == typeid(cl_float2)) type = TYPE_COMPLEX_FLOAT;
+
+    cl_int err = (cl_int)::clMath::clblas::copy(type, params->N, bufX,
+                        params->offBX, params->incx, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(blasX, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::COPY() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(blasX, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the full device Y buffer into clblasY.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (totalY * sizeof(*blasY)), clblasY, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "COPY: Reading results failed...." << std::endl;
+    }
+
+    releaseMemObjects(bufX, bufY);
+
+    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY, NULL);
+    deleteBuffers<T>(blasX, blasY, clblasY);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(COPY, scopy) {
+    // scopy: COPY correctness test with T = cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    copyCorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(COPY, dcopy) {
+    // dcopy: COPY correctness test with T = cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    copyCorrectnessTest<cl_double>(&testArgs);
+}
+
+TEST_P(COPY, ccopy) {
+    // ccopy: COPY correctness test with T = FloatComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    copyCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+TEST_P(COPY, zcopy) {
+    // zcopy: COPY correctness test with T = DoubleComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    copyCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-dot.cpp b/src/tests/correctness/corr-dot.cpp
new file mode 100644
index 0000000..c496925
--- /dev/null
+++ b/src/tests/correctness/corr-dot.cpp
@@ -0,0 +1,217 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <dot.h>
+
+// Release the device buffers used by the DOT test.
+//
+//   objX, objY - input vectors
+//   objDP      - dot product result
+//   objScratch - scratch buffer handed to the clblas routine
+//
+// A NULL handle is skipped, so the function can be called after a
+// partially failed buffer creation. Release order: X, Y, DP, scratch.
+static void
+releaseMemObjects(cl_mem objX, cl_mem objY, cl_mem objDP, cl_mem objScratch)
+{
+    cl_mem handles[] = { objX, objY, objDP, objScratch };
+    const size_t count = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (handles[i] != NULL) {
+            clReleaseMemObject(handles[i]);
+        }
+    }
+}
+
+// Free the host-side arrays used by the DOT correctness test.
+//
+//   blasX, blasY - input vectors
+//   blasDP       - result of the reference routine
+//   clblasDP     - result read back from the clblas routine
+//
+// Every pointer must be NULL or come from new[]. The test allocates
+// blasDP with new T[1], so it is released with delete[] like the other
+// arrays; a plain delete on a new[] pointer is undefined behavior.
+// delete[] on a null pointer is a no-op, so no NULL guards are needed.
+template <typename T> static void
+deleteBuffers(T *blasX, T *blasY, T *blasDP, T *clblasDP)
+{
+    delete[] blasX;
+    delete[] blasY;
+
+    // Result holders: the clblas one first, then the reference one.
+    delete[] clblasDP;
+    delete[] blasDP;
+}
+
+// Correctness test for xDOT with element type T: the value returned by the
+// reference routine is compared with the value produced by the clblas routine.
+template <typename T>
+void
+dotCorrectnessTest(TestParams *params)
+{
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+
+    // Double precision element types are skipped on devices without support.
+    const bool needsDouble = (typeid(T) == typeid(cl_double)) ||
+                             (typeid(T) == typeid(DoubleComplex));
+    if (needsDouble && !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zero-initialized.
+    cl_event *events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Strided extent of each vector, and total element count with offset.
+    const size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    const size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));
+    const size_t totalX = lengthX + params->offBX;
+    const size_t totalY = lengthY + params->offCY;
+
+    T *blasX = new T[totalX];
+    T *blasY = new T[totalY];
+    T *blasDP = new T[1];                      // reference result
+    T *clblasDP = new T[1 + params->offa];     // clblas result, compared at index offa
+
+    if (!blasX || !blasY || !clblasDP || !blasDP)
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(blasX, blasY, blasDP,  clblasDP);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+
+    randomVectors(params->N, (blasX + params->offBX), params->incx, (blasY + params->offCY), params->incy, true);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Device buffers: both inputs, the result slot and a scratch area of lengthX elements.
+    cl_mem bufX = base->createEnqueueBuffer(blasX, totalX * sizeof(*blasX), 0, CL_MEM_READ_WRITE);
+    cl_mem bufY = base->createEnqueueBuffer(blasY, totalY * sizeof(*blasY), 0, CL_MEM_READ_WRITE);
+    cl_mem bufDP = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    cl_mem scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xDOT routine... ";
+
+    *blasDP  = ::clMath::blas::dot( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if (!bufX || !bufY || !bufDP || !scratchBuff) {
+        releaseMemObjects(bufX, bufY, bufDP, scratchBuff);
+        deleteBuffers<T>(blasX, blasY, blasDP, clblasDP);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xDOT routine... ";
+
+    // Map T onto the DataType tag passed to the clblas wrapper; any other T is complex double.
+    DataType type = TYPE_COMPLEX_DOUBLE;
+    if (typeid(T) == typeid(cl_float)) type = TYPE_FLOAT;
+    else if (typeid(T) == typeid(cl_double)) type = TYPE_DOUBLE;
+    else if (typeid(T) == typeid(cl_float2)) type = TYPE_COMPLEX_FLOAT;
+
+    cl_int err = (cl_int)::clMath::clblas::dot( type, params->N,  bufDP, params->offa, bufX,
+                        params->offBX, params->incx, bufY, params->offCY, params->incy, scratchBuff, params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY, bufDP, scratchBuff);
+        deleteBuffers<T>(blasX, blasY, blasDP, clblasDP);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::DOT() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY, bufDP, scratchBuff);
+        deleteBuffers<T>(blasX, blasY, blasDP, clblasDP);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the whole result buffer, offset elements included.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufDP, CL_TRUE, 0,
+        (1 + params->offa) * sizeof(*clblasDP), clblasDP, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "DOT: Reading results failed...." << std::endl;
+    }
+    releaseMemObjects(bufX, bufY, bufDP, scratchBuff);
+
+    compareMatrices<T>(clblasColumnMajor, 1 , 1, (blasDP), (clblasDP+params->offa), 1);
+    deleteBuffers<T>(blasX, blasY, blasDP, clblasDP);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(DOT, sdot) {
+    // sdot: DOT correctness test with T = cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    dotCorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(DOT, ddot) {
+    // ddot: DOT correctness test with T = cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    dotCorrectnessTest<cl_double>(&testArgs);
+}
+
+TEST_P(DOT, cdotu) {
+    // cdotu: DOT correctness test with T = FloatComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    dotCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+TEST_P(DOT, zdotu) {
+    // zdotu: DOT correctness test with T = DoubleComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    dotCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-dotc.cpp b/src/tests/correctness/corr-dotc.cpp
new file mode 100644
index 0000000..d4a68b1
--- /dev/null
+++ b/src/tests/correctness/corr-dotc.cpp
@@ -0,0 +1,204 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <dotc.h>
+
+// Release the device buffers used by the DOTC test.
+//
+//   objX, objY - input vectors
+//   objDP      - dot product result
+//   objScratch - scratch buffer handed to the clblas routine
+//
+// NULL handles are skipped, so the function can be called after a
+// partially failed buffer creation. Release order: X, Y, DP, scratch.
+static void
+releaseMemObjects(cl_mem objX, cl_mem objY, cl_mem objDP, cl_mem objScratch)
+{
+    if (objX)
+        clReleaseMemObject(objX);
+    if (objY)
+        clReleaseMemObject(objY);
+    if (objDP)
+        clReleaseMemObject(objDP);
+    if (objScratch)
+        clReleaseMemObject(objScratch);
+}
+
+// Free the host-side arrays used by the DOTC correctness test.
+//
+//   blasX, blasY - input vectors
+//   blasDP       - result of the reference routine
+//   clblasDP     - result read back from the clblas routine
+//
+// Every pointer must be NULL or come from new[]. The test allocates
+// blasDP with new T[1], so it is released with delete[] like the other
+// arrays; a plain delete on a new[] pointer is undefined behavior.
+// delete[] on a null pointer is a no-op, so no NULL guards are needed.
+template <typename T> static void
+deleteBuffers(T *blasX, T *blasY, T *blasDP, T *clblasDP)
+{
+    delete[] blasX;
+    delete[] blasY;
+
+    // Result holders: the clblas one first, then the reference one.
+    delete[] clblasDP;
+    delete[] blasDP;
+}
+
+// Correctness test for xDOTC with element type T: the value returned by the
+// reference routine is compared with the value produced by the clblas routine.
+template <typename T>
+void
+dotcCorrectnessTest(TestParams *params)
+{
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+
+    // Double precision element types are skipped on devices without support.
+    const bool needsDouble = (typeid(T) == typeid(cl_double)) ||
+                             (typeid(T) == typeid(DoubleComplex));
+    if (needsDouble && !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zero-initialized.
+    cl_event *events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Strided extent of each vector, and total element count with offset.
+    const size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    const size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));
+    const size_t totalX = lengthX + params->offBX;
+    const size_t totalY = lengthY + params->offCY;
+
+    T *blasX = new T[totalX];
+    T *blasY = new T[totalY];
+    T *blasDP = new T[1];                      // reference result
+    T *clblasDP = new T[1 + params->offa];     // clblas result, compared at index offa
+
+    if (!blasX || !blasY || !clblasDP || !blasDP)
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(blasX, blasY, blasDP,  clblasDP);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+
+    randomVectors(params->N, (blasX + params->offBX), params->incx, (blasY + params->offCY), params->incy, true);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Device buffers: both inputs, the result slot and a scratch area of lengthX elements.
+    cl_mem bufX = base->createEnqueueBuffer(blasX, totalX * sizeof(*blasX), 0, CL_MEM_READ_WRITE);
+    cl_mem bufY = base->createEnqueueBuffer(blasY, totalY * sizeof(*blasY), 0, CL_MEM_READ_WRITE);
+    cl_mem bufDP = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    cl_mem scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xDOTC routine... ";
+
+    *blasDP  = ::clMath::blas::dotc( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if (!bufX || !bufY || !bufDP || !scratchBuff) {
+        releaseMemObjects(bufX, bufY, bufDP, scratchBuff);
+        deleteBuffers<T>(blasX, blasY, blasDP, clblasDP);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xDOTC routine... ";
+
+    // Map T onto the DataType tag passed to the clblas wrapper; any other T is complex double.
+    DataType type = TYPE_COMPLEX_DOUBLE;
+    if (typeid(T) == typeid(cl_float)) type = TYPE_FLOAT;
+    else if (typeid(T) == typeid(cl_double)) type = TYPE_DOUBLE;
+    else if (typeid(T) == typeid(cl_float2)) type = TYPE_COMPLEX_FLOAT;
+
+    cl_int err = (cl_int)::clMath::clblas::dotc( type, params->N,  bufDP, params->offa, bufX,
+                        params->offBX, params->incx, bufY, params->offCY, params->incy, scratchBuff, params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY, bufDP, scratchBuff);
+        deleteBuffers<T>(blasX, blasY, blasDP, clblasDP);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::DOTC() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY, bufDP, scratchBuff);
+        deleteBuffers<T>(blasX, blasY, blasDP, clblasDP);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the whole result buffer, offset elements included.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufDP, CL_TRUE, 0,
+        (1 + params->offa) * sizeof(*clblasDP), clblasDP, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "DOTC: Reading results failed...." << std::endl;
+    }
+    releaseMemObjects(bufX, bufY, bufDP, scratchBuff);
+
+    compareMatrices<T>(clblasColumnMajor, 1 , 1, (blasDP), (clblasDP+params->offa), 1);
+    deleteBuffers<T>(blasX, blasY, blasDP, clblasDP);
+    delete[] events;
+}
+
+// Instantiate the test
+
+
+TEST_P(DOTC, cdotc) {
+    // cdotc: DOTC correctness test with T = FloatComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    dotcCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+TEST_P(DOTC, zdotc) {
+    // zdotc: DOTC correctness test with T = DoubleComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    dotcCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-gbmv.cpp b/src/tests/correctness/corr-gbmv.cpp
new file mode 100644
index 0000000..efa8b4b
--- /dev/null
+++ b/src/tests/correctness/corr-gbmv.cpp
@@ -0,0 +1,248 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <gbmv.h>
+
+// Release the device buffers of the GBMV test.
+//
+//   objA - matrix A
+//   objX - vector X
+//   objY - vector Y
+//
+// NULL handles are skipped. Release order: A, X, Y.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    cl_mem handles[3] = { objA, objX, objY };
+    for (int i = 0; i < 3; ++i) {
+        if (handles[i] != NULL)
+            clReleaseMemObject(handles[i]);
+    }
+}
+
+// Free the host-side arrays used by the GBMV correctness test.
+//
+//   A       - matrix
+//   X       - X vector
+//   blasY   - Y vector passed to the reference routine
+//   clblasY - Y vector holding the clblas GBMV call results
+//
+// Each pointer must be NULL or come from new[]; delete[] on a null
+// pointer is a no-op, so no explicit NULL guards are required.
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *blasY, T *clblasY)
+{
+    // Inputs.
+    delete[] A;
+    delete[] X;
+
+    // Results.
+    delete[] blasY;
+    delete[] clblasY;
+}
+
+// Correctness test for xGBMV with element type T: the reference routine and
+// the clblas routine run on identical random data and the Y vectors are compared.
+// Resources are freed on every exit path; a failing ASSERT_EQ leaves the function.
+template <typename T>
+void
+gbmvCorrectnessTest(TestParams *params)
+{
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+
+    // Double precision element types are skipped on devices without support.
+    const bool needsDouble = (typeid(T) == typeid(DoubleComplex)) ||
+                             (typeid(T) == typeid(cl_double));
+    if (needsDouble && !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zero-initialized.
+    cl_event *events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // A holds lda elements for each of N (column-major) or M (row-major) lines.
+    const size_t lengthA = ((params->order == clblasColumnMajor)? params->N : params->M) * params->lda;
+
+    // Without transposition X spans N strided elements and Y spans M; otherwise swapped.
+    const bool noTrans = (params->transA == clblasNoTrans);
+    const size_t lengthX = ((noTrans ? params->N : params->M) - 1)*abs(params->incx) + 1;
+    const size_t lengthY = ((noTrans ? params->M : params->N) - 1)*abs(params->incy) + 1;
+
+    // Host arrays are sized with room for their leading offsets.
+    T *A = new T[lengthA + params->offA ];
+    T *X = new T[lengthX + params->offBX ];
+    T *blasY = new T[lengthY + params->offCY ];     // reference result
+    T *clblasY = new T[lengthY + params->offCY ];   // clblas result
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    if (!A || !X || !blasY || !clblasY)
+    {
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    T alpha = convertMultiplier<T>(params->alpha);
+    T beta = convertMultiplier<T>(params->beta);
+
+    randomGbmvMatrices(params->order, params->transA, params->M, params->N, &alpha, &beta,
+                        (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy );
+    // Both implementations must start from the same Y.
+    memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Device buffers are created from the whole host arrays, offsets included.
+    cl_mem bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY);
+    cl_mem bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+    cl_mem bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xGBMV routine... ";
+
+    // The reference routine is always called column-major. A row-major case is
+    // passed as the transposed problem: the transpose flag is flipped and
+    // M/N and KL/KU are swapped.
+    clblasOrder fOrder = params->order;
+    clblasTranspose fTrans = params->transA;
+    size_t fM = params->M, fN = params->N, fKL = params->KL, fKU = params->KU;
+
+    if (fOrder != clblasColumnMajor)
+    {
+        fOrder = clblasColumnMajor;
+        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+        fM = params->N;
+        fN = params->M;
+        fKL = params->KU;
+        fKU = params->KL;
+
+        // The matrix data starts at offA, the same offset used for the
+        // allocation, the data generation and both gbmv calls, so the
+        // conjugation must be applied from there as well.
+        if( params->transA == clblasConjTrans )
+            doConjugate( (A+params->offA), 1, lengthA, params->lda );
+    }
+    clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A, params->offA, params->lda,
+                            X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if (!bufA || !bufX || !bufY) {
+        // Skip the test, the most probable reason is
+        //     matrix too big for a device.
+
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xGBMV routine... ";
+
+    cl_int err = (cl_int)clMath::clblas::gbmv(params->order, params->transA, params->M, params->N, params->KL, params->KU,
+                                        alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx,
+                                        beta, bufY, params->offCY, params->incy,
+                                        params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        // Free every resource first: the failing ASSERT_EQ leaves the function.
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GBMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the full device Y buffer into clblasY.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "GBMV: Reading results failed...." << std::endl;
+    }
+
+    releaseMemObjects(bufA, bufX, bufY);
+    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY),
+                       lengthY);
+    deleteBuffers<T>(A, X, blasY, clblasY);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(GBMV, sgbmv) {
+    // sgbmv: GBMV correctness test with T = cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    gbmvCorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(GBMV, dgbmv) {
+    // dgbmv: GBMV correctness test with T = cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    gbmvCorrectnessTest<cl_double>(&testArgs);
+}
+
+TEST_P(GBMV, cgbmv) {
+    // cgbmv: GBMV correctness test with T = FloatComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    gbmvCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+TEST_P(GBMV, zgbmv) {
+    // zgbmv: GBMV correctness test with T = DoubleComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    gbmvCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-gemm.cpp b/src/tests/correctness/corr-gemm.cpp
new file mode 100644
index 0000000..5837bed
--- /dev/null
+++ b/src/tests/correctness/corr-gemm.cpp
@@ -0,0 +1,233 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <gemm.h>
+
+#include "tcase-filter.h"
+
+/* Release the three GEMM device buffers; NULL handles (buffers never created) are skipped. */
+static void releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC)
+{
+    if (objA != NULL) { clReleaseMemObject(objA); }
+    if (objB != NULL) { clReleaseMemObject(objB); }
+    if (objC != NULL) { clReleaseMemObject(objC); }
+}
+
+/* Free the four host arrays of the GEMM test; each must come from new[] or be NULL. */
+template <typename T> static void deleteBuffers(T *A, T *B, T *blasC, T *clblasC)
+{
+    delete[] clblasC;
+    delete[] blasC;
+    delete[] B;
+    delete[] A;
+}
+
+/* Run clblas xGEMM for element type T and compare the device result with the
+ * host reference GEMM.  The test is skipped when the device lacks double
+ * precision, the case is filtered out, or a device buffer cannot be created. */
+template <typename T>
+void gemmCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *B, *blasC, *clblasC;
+    T alpha, beta;
+    cl_mem bufA, bufB, bufC;
+    clMath::BlasBase *base;
+    bool useAlpha, useBeta, isComplex;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    isComplex = ((typeid(T) == typeid(FloatComplex)) ||
+                 (typeid(T) == typeid(DoubleComplex)));
+
+    if (canCaseBeSkipped(params, isComplex)) {
+        std::cerr << ">> Test is skipped because it has no importance for this "
+                     "level of coverage" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    useAlpha = base->useAlpha();
+    useBeta = base->useBeta();
+    alpha = ZERO<T>();
+    beta = ZERO<T>();
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Host data: A and B are inputs; blasC feeds the reference, clblasC the device.
+    A = new T[params->rowsA * params->columnsA];
+    B = new T[params->rowsB * params->columnsB];
+    blasC = new T[params->rowsC * params->columnsC];
+    clblasC = new T[params->rowsC * params->columnsC];
+
+    srand(params->seed);
+    if (useAlpha) {
+        alpha = convertMultiplier<T>(params->alpha);
+    }
+    if (useBeta) {
+        beta = convertMultiplier<T>(params->beta);
+    }
+
+    ::std::cerr << "Generating input data... ";
+    randomGemmMatrices<T>(params->order, params->transA, params->transB,
+        params->M, params->N, params->K, useAlpha, &alpha, A, params->lda,
+        B, params->ldb, useBeta, &beta, blasC, params->ldc);
+    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xGEMM routine... ";
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB,
+                          params->M, params->N, params->K, alpha, A,
+                          params->lda, B, params->ldb, beta, blasC, params->ldc);
+    }
+    else {
+        T *reorderedA = new T[params->rowsA * params->columnsA];
+        T *reorderedB = new T[params->rowsB * params->columnsB];
+        T *reorderedC = new T[params->rowsC * params->columnsC];
+        // The reference GEMM is only invoked column-major, so reorder the operands first.
+        reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA,
+                         A, reorderedA);
+        reorderMatrix<T>(clblasRowMajor, params->rowsB, params->columnsB,
+                         B, reorderedB);
+        reorderMatrix<T>(clblasRowMajor, params->rowsC, params->columnsC,
+                         blasC, reorderedC);
+        ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB,
+                          params->M, params->N, params->K, alpha, reorderedA,
+                          params->rowsA, reorderedB, params->rowsB,
+                          beta, reorderedC, params->rowsC);
+        reorderMatrix<T>(clblasColumnMajor, params->rowsC, params->columnsC,
+                         reorderedC, blasC);
+
+        delete[] reorderedC;
+        delete[] reorderedB;
+        delete[] reorderedA;
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
+                                        sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB *
+                                        sizeof(*B), params->offBX * sizeof(*B),
+                                     CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
+                                              sizeof(*clblasC),
+                                     params->offCY * sizeof(*clblasC),
+                                     CL_MEM_READ_WRITE);
+    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xGEMM routine... ";
+    err = (cl_int)::clMath::clblas::gemm(params->order, params->transA,
+        params->transB, params->M, params->N, params->K, alpha, bufA,
+        params->offA, params->lda, bufB, params->offBX, params->ldb, beta,
+        bufC, params->offCY, params->ldc, params->numCommandQueues,
+        base->commandQueues(), 0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GEMM() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read of the result; a failure is reported non-fatally so cleanup still runs.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE,
+                              params->offCY * sizeof(*clblasC),
+                              params->rowsC * params->columnsC * sizeof(*clblasC),
+                              clblasC, 0, NULL, NULL);
+    EXPECT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+    releaseMemObjects(bufA, bufB, bufC);
+    compareMatrices<T>(params->order, params->M, params->N, blasC, clblasC,
+                       params->ldc);
+    deleteBuffers<T>(A, B, blasC, clblasC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(GEMM, sgemm) {
+    // GEMM correctness check with T = cl_float on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemmCorrectnessTest<cl_float>(&testParams);
+}
+
+TEST_P(GEMM, dgemm) {
+    // GEMM correctness check with T = cl_double on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemmCorrectnessTest<cl_double>(&testParams);
+}
+
+TEST_P(GEMM, cgemm) {
+    // GEMM correctness check with T = FloatComplex on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemmCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(GEMM, zgemm) {
+    // GEMM correctness check with T = DoubleComplex on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemmCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-gemm2.cpp b/src/tests/correctness/corr-gemm2.cpp
new file mode 100644
index 0000000..2730d42
--- /dev/null
+++ b/src/tests/correctness/corr-gemm2.cpp
@@ -0,0 +1,256 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <gemm-2.h>
+
+/*
+ * Release the three device buffers used by the GEMM2 test.
+ * NULL handles are skipped.
+ */
+static void
+releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC)
+{
+    cl_mem memObjs[] = { objA, objB, objC };
+
+    for (size_t i = 0; i < sizeof(memObjs) / sizeof(memObjs[0]); i++) {
+        if (memObjs[i] == NULL) {
+            continue;
+        }
+        clReleaseMemObject(memObjs[i]);
+    }
+}
+
+/*
+ * Free the host arrays of the GEMM2 test.  Every argument must come from
+ * new[] or be NULL; NULL pointers are skipped.
+ */
+template <typename T> static void
+deleteBuffers(T *A, T *B, T *blasC, T *clblasC)
+{
+    if (A) {
+        delete[] A;
+    }
+    if (B) {
+        delete[] B;
+    }
+    if (blasC) {
+        delete[] blasC;
+    }
+    if (clblasC) {
+        delete[] clblasC;
+    }
+}
+
+/* Run ::clMath::clblas::gemm2 for element type T and compare the device result
+ * with the host reference GEMM.  The test is skipped when the device lacks
+ * double precision or when host/device buffers cannot be set up. */
+template <typename T>
+void gemm2CorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *B, *blasC, *clblasC;
+    cl_mem bufA, bufB, bufC;
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    bool useAlpha = base->useAlpha();
+    bool useBeta = base->useBeta();
+    T alpha = ZERO<T>();
+    T beta = ZERO<T>();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    cl_event *events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    A = new T[params->rowsA * params->columnsA];
+    B = new T[params->rowsB * params->columnsB];
+    blasC = new T[params->rowsC * params->columnsC];
+    clblasC = new T[params->rowsC * params->columnsC];
+    // Only reachable if new[] is non-throwing; plain new[] throws std::bad_alloc.
+    if ((A == NULL) || (B == NULL) || (blasC == NULL) || (clblasC == NULL)) {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers(A, B, blasC, clblasC);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+    if (useAlpha) {
+        alpha = convertMultiplier<T>(params->alpha);
+    }
+    if (useBeta) {
+        beta = convertMultiplier<T>(params->beta);
+    }
+
+    ::std::cerr << "Generating input data... ";
+    randomGemmMatrices<T>(params->order, params->transA, params->transB,
+        params->M, params->N, params->K, useAlpha, &alpha, A, params->lda,
+        B, params->ldb, useBeta, &beta, blasC, params->ldc);
+    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xGEMM routine... ";
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB,
+                          params->M, params->N, params->K, alpha, A,
+                          params->lda, B, params->ldb, beta, blasC, params->ldc);
+    }
+    else {
+        T *reorderedA = new T[params->rowsA * params->columnsA];
+        T *reorderedB = new T[params->rowsB * params->columnsB];
+        T *reorderedC = new T[params->rowsC * params->columnsC];
+
+        if ((reorderedA == NULL) || (reorderedB == NULL) || (reorderedC == NULL)) {
+            ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+            delete[] reorderedC;
+            delete[] reorderedB;
+            delete[] reorderedA;
+            deleteBuffers<T>(A, B, blasC, clblasC);
+            delete[] events;
+            SUCCEED();
+            return;
+        }
+
+        reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA,
+                         A, reorderedA);
+        reorderMatrix<T>(clblasRowMajor, params->rowsB, params->columnsB,
+                         B, reorderedB);
+        reorderMatrix<T>(clblasRowMajor, params->rowsC, params->columnsC,
+                         blasC, reorderedC);
+        ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB,
+                          params->M, params->N, params->K, alpha, reorderedA,
+                          params->rowsA, reorderedB, params->rowsB,
+                          beta, reorderedC, params->rowsC);
+        reorderMatrix<T>(clblasColumnMajor, params->rowsC, params->columnsC,
+                         reorderedC, blasC);
+
+        delete[] reorderedC;
+        delete[] reorderedB;
+        delete[] reorderedA;
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
+                                        sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB *
+                                        sizeof(*B), params->offBX * sizeof(*B),
+                                     CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
+                                              sizeof(*clblasC),
+                                     params->offCY * sizeof(*clblasC),
+                                     CL_MEM_READ_WRITE);
+    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xGEMM routine... ";
+    err = (cl_int)::clMath::clblas::gemm2(params->order, params->transA,
+        params->transB, params->M, params->N, params->K, alpha, bufA,
+        params->offA, params->lda, bufB, params->offBX, params->ldb, beta,
+        bufC, params->offCY, params->ldc, params->numCommandQueues,
+        base->commandQueues(), 0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GEMM() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read of the result; a failure is reported non-fatally so cleanup still runs.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE,
+                              params->offCY * sizeof(*clblasC),
+                              params->rowsC * params->columnsC * sizeof(*clblasC),
+                              clblasC, 0, NULL, NULL);
+    EXPECT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+    releaseMemObjects(bufA, bufB, bufC);
+    compareMatrices<T>(params->order, params->M, params->N, blasC, clblasC,
+                       params->ldc);
+    deleteBuffers<T>(A, B, blasC, clblasC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(GEMM2, sgemm2) {
+    // GEMM2 correctness check with T = cl_float on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemm2CorrectnessTest<cl_float>(&testParams);
+}
+
+TEST_P(GEMM2, dgemm2) {
+    // GEMM2 correctness check with T = cl_double on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemm2CorrectnessTest<cl_double>(&testParams);
+}
+
+TEST_P(GEMM2, cgemm2) {
+    // GEMM2 correctness check with T = FloatComplex on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemm2CorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(GEMM2, zgemm2) {
+    // GEMM2 correctness check with T = DoubleComplex on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemm2CorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-gemv.cpp b/src/tests/correctness/corr-gemv.cpp
new file mode 100644
index 0000000..c1a564c
--- /dev/null
+++ b/src/tests/correctness/corr-gemv.cpp
@@ -0,0 +1,246 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <gemv.h>
+
+#include "tcase-filter.h"
+
+/* Release the three GEMV device buffers; NULL handles (buffers never created) are skipped. */
+static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    if (objA != NULL) { clReleaseMemObject(objA); }
+    if (objX != NULL) { clReleaseMemObject(objX); }
+    if (objY != NULL) { clReleaseMemObject(objY); }
+}
+
+/* Free the four host arrays of the GEMV test; each must come from new[] or be NULL. */
+template <typename T> static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY)
+{
+    delete[] clblasY;
+    delete[] blasY;
+    delete[] X;
+    delete[] A;
+}
+
+/* Run clblas xGEMV for element type T and compare the resulting vector with the
+ * host reference GEMV.  The test is skipped when the device lacks double
+ * precision, the case is filtered out, or a device buffer cannot be created. */
+template <typename T>
+void gemvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *B, *blasC, *clblasC, *X, *Y;
+    T alpha, beta;
+    cl_mem bufA, bufB, bufC;
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    bool useAlpha, useBeta;
+    cl_event *events;
+    size_t lenY, lenX;
+    bool isComplex;
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    isComplex = ((typeid(T) == typeid(FloatComplex)) ||
+                 (typeid(T) == typeid(DoubleComplex)));
+    if (canCaseBeSkipped(params, isComplex)) {
+        std::cerr << ">> Test is skipped because it has no importance for this "
+                     "level of coverage" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    useAlpha = base->useAlpha();
+    useBeta = base->useBeta();
+    beta = ZERO<T>();
+    alpha = ZERO<T>();
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // B and blasC are allocated as whole matrices; X and Y point at the vectors inside them.
+    A = new T[params->rowsA * params->columnsA];
+    B = new T[params->rowsB * params->columnsB];
+    blasC = new T[params->rowsC * params->columnsC];
+    clblasC = new T[params->rowsC * params->columnsC];
+    X = &B[params->offBX];
+    Y = &blasC[params->offCY];
+
+    srand(params->seed);
+    if (useAlpha) {
+        alpha = convertMultiplier<T>(params->alpha);
+    }
+    if (useBeta) {
+        beta = convertMultiplier<T>(params->beta);
+    }
+
+    if (params->transA == clblasNoTrans) {
+        lenX = params->N;
+        lenY = params->M;
+    }
+    else {
+        lenX = params->M;
+        lenY = params->N;
+    }
+
+    ::std::cerr << "Generating input data... ";
+    setNans<T>(params->rowsA * params->columnsA, A);
+    setNans<T>(params->rowsB * params->columnsB, B);
+    setNans<T>(params->rowsC * params->columnsC, blasC);
+
+    randomGemmxMatrices<T>(params->order, params->transA, params->transB,
+                           params->transC, lenY, params->K, lenX, useAlpha,
+                           &alpha, A, params->lda, B, params->ldb, useBeta,
+                           &beta, blasC, params->ldc);
+
+    // set to NAN elements which must not be accessed
+    // in matrix B containing vector X
+    setVectorNans<T>(params->offBX, abs(params->incx), B, lenX,
+                  params->columnsB * params->rowsB);
+    // in matrix C containing vector Y
+    setVectorNans<T>(params->offCY, abs(params->incy), blasC, lenY,
+                  params->columnsC * params->rowsC);
+    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*clblasC));
+
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xGEMV routine... ";
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::gemv(clblasColumnMajor, params->transA,
+                          params->M, params->N, alpha, A, params->lda,
+                          X, params->incx, beta, Y, params->incy);
+    }
+    else {
+        T *reorderedA = new T[params->rowsA * params->columnsA];
+
+        reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA,
+                         A, reorderedA);
+        ::clMath::blas::gemv(clblasColumnMajor, params->transA,
+                         params->M, params->N, alpha, reorderedA, params->rowsA,
+                         X, params->incx, beta, Y, params->incy);
+
+        delete[] reorderedA;
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
+                                     sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB *
+                                     sizeof(*B), 0, CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
+                                     sizeof(*clblasC), 0, CL_MEM_READ_WRITE);
+    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xGEMV routine... ";
+    err = (cl_int)::clMath::clblas::gemv(params->order, params->transA,
+        params->M, params->N, alpha, bufA, params->offA, params->lda,
+        bufB, params->offBX, params->incx, beta, bufC, params->offCY,
+        params->incy, params->numCommandQueues, base->commandQueues(), 0,
+        NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GEMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read of the whole C buffer; a failure is reported non-fatally so cleanup still runs.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0,
+                              params->rowsC * params->columnsC * sizeof(*clblasC),
+                              clblasC, 0, NULL, NULL);
+    EXPECT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+    releaseMemObjects(bufA, bufB, bufC);
+
+    compareVectors(params->offCY, lenY, abs(params->incy),
+                   params->columnsC * params->rowsC, blasC, clblasC);
+
+    deleteBuffers<T>(A, B, blasC, clblasC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(GEMV, sgemv) {
+    // GEMV correctness check with T = cl_float on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemvCorrectnessTest<cl_float>(&testParams);
+}
+
+TEST_P(GEMV, dgemv) {
+    // GEMV correctness check with T = cl_double on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemvCorrectnessTest<cl_double>(&testParams);
+}
+
+
+TEST_P(GEMV, cgemv) {
+    // GEMV correctness check with T = FloatComplex on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemvCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(GEMV, zgemv) {
+    // GEMV correctness check with T = DoubleComplex on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gemvCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-ger.cpp b/src/tests/correctness/corr-ger.cpp
new file mode 100644
index 0000000..1ffe440
--- /dev/null
+++ b/src/tests/correctness/corr-ger.cpp
@@ -0,0 +1,265 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <ger.h>
+#include<cltypes.h>
+
+/* Release the device buffers of A, x and y used by the GER test; NULL handles are skipped. */
+static void releaseMemObjects(cl_mem objA, cl_mem objx, cl_mem objy)
+{
+    cl_mem memObjs[] = { objA, objx, objy };
+    for (size_t i = 0; i < sizeof(memObjs) / sizeof(memObjs[0]); i++) {
+        if (memObjs[i] != NULL) {
+            clReleaseMemObject(memObjs[i]);
+        }
+    }
+}
+
+/*
+ * Free the host arrays used by the GER test: the matrix A, its copy backA
+ * and the vectors x and y.  Every argument must come from new[] or be NULL;
+ * NULL pointers are skipped.
+ */
+template <typename T> static void
+deleteBuffers(T *A, T *x, T *y, T *backA)
+{
+    if (A) {
+        delete[] A;
+    }
+    if (backA) {
+        delete[] backA;
+    }
+    if (x) {
+        delete[] x;
+    }
+    if (y) {
+        delete[] y;
+    }
+}
+
+/*
+ * Run clblas xGER for element type T and compare the updated matrix with the
+ * one produced by the host reference routine.  The test is skipped when the
+ * device lacks double precision or when host/device buffers cannot be set up.
+ */
+template <typename T>
+void
+gerCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *x, *y, *backA;
+    // Zero by default so alpha_ is never read uninitialised when useAlpha is false.
+    T alpha_ = ZERO<T>();
+    cl_mem bufA, bufx, bufy;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t lengthA;
+    if (params->order == clblasColumnMajor) {
+        lengthA = params->N * params->lda;
+    } else {
+        lengthA = params->M * params->lda;
+    }
+
+    size_t lengthx = (1 + (((params->M)-1) * abs(params->incx)));
+    size_t lengthy = (1 + (((params->N)-1) * abs(params->incy)));
+
+    bool useAlpha = base->useAlpha();
+    if (useAlpha) {
+        alpha_ = convertMultiplier<T>(params->alpha);
+    }
+
+    A     = new T[lengthA + params->offa];
+    x     = new T[lengthx + params->offBX];
+    y     = new T[lengthy + params->offCY];
+    backA = new T[lengthA + params->offa];
+
+    // Only reachable if new[] is non-throwing; plain new[] throws std::bad_alloc.
+    if ((A == NULL) || (backA == NULL) || (x == NULL) || (y == NULL)) {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    int creationFlags = 0;
+    creationFlags =  creationFlags | RANDOM_INIT;
+    creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
+    BlasRoutineID BlasFn = CLBLAS_GER;
+
+    populate( (A + params->offa), params->M, params->N, params-> lda, BlasFn, creationFlags);
+    populate( (x + params->offBX), lengthx, 1, lengthx, BlasFn );
+    populate( (y + params->offCY), lengthy, 1, lengthy, BlasFn );
+
+    // Snapshot A (offset prefix included); the device result is later read back into backA.
+    memcpy(backA, A, (lengthA + params->offa) * sizeof(T));
+
+    // Upload A, x and y before the reference routine below is given A to update.
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE);
+    bufx = base->createEnqueueBuffer(x, (lengthx + params->offBX) * sizeof(*x), 0, CL_MEM_READ_ONLY);
+    bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Done" << ::std::endl;
+    ::std::cerr << "Calling reference xGER routine... ";
+
+    clblasOrder fOrder;
+    size_t fN, fM;
+    size_t fOffx, fOffy;
+    int fIncx, fIncy;
+    T *fX, *fY;
+    fOrder = params->order;
+    fM = params->M;
+    fN = params->N;
+    fIncx = params->incx;
+    fIncy = params->incy;
+    fX = x;
+    fY = y;
+    fOffx = params->offBX;
+    fOffy = params->offCY;
+
+    // The reference is only called column-major: for row-major, swap M/N and x/y.
+    if (fOrder != clblasColumnMajor) {
+        fOrder = clblasColumnMajor;
+        fM = params->N;
+        fN = params->M;
+        fX = y;
+        fY = x;
+        fIncx = params->incy;
+        fIncy = params->incx;
+        fOffx = params->offCY;
+        fOffy = params->offBX;
+    }
+
+    // Call reference blas routine
+    clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy,  A, params->offa, params->lda);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufx, bufy);
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xGER routine... ";
+
+    err = (cl_int)::clMath::clblas::ger(params->order, params->M, params->N, alpha_,
+        bufx, params->offBX, params->incx, bufy, params->offCY, params->incy, bufA, params->offa, params->lda,
+        params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufx, bufy);
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GER() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufx, bufy);
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read of the device result; a failure is reported non-fatally so cleanup still runs.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
+        (lengthA + params->offa)* sizeof(*backA), backA, 0,
+        NULL, NULL);
+    EXPECT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+
+    releaseMemObjects(bufA, bufx, bufy);
+
+    // Compare the reference result (A) with the device result (backA), passing order and lda.
+    compareMatrices<T>(params->order, params->M , params->N, A+ params->offa, backA + params->offa, params->lda);
+    deleteBuffers<T>(A, x, y, backA);
+    delete[] events;
+}
+
+// Instantiate the test
+
+
+TEST_P(GER, sger) {
+    // GER correctness check with T = cl_float on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gerCorrectnessTest<cl_float>(&testParams);
+}
+
+TEST_P(GER, dger) {
+    // GER correctness check with T = cl_double on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gerCorrectnessTest<cl_double>(&testParams);
+}
+
+
+TEST_P(GER, cgeru) {
+    // GER correctness check with T = FloatComplex on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gerCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(GER, zgeru) {
+    // GER correctness check with T = DoubleComplex on the parameters from getParams().
+    TestParams testParams;
+    getParams(&testParams);
+    gerCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-gerc.cpp b/src/tests/correctness/corr-gerc.cpp
new file mode 100644
index 0000000..0070a77
--- /dev/null
+++ b/src/tests/correctness/corr-gerc.cpp
@@ -0,0 +1,252 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <gerc.h>
+#include<cltypes.h>
+
+// Releases the OpenCL buffers of the GERC test in the order A, x, y.
+// Handles that are NULL are skipped.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objx, cl_mem objy)
+{
+    cl_mem handles[] = { objA, objx, objy };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] != NULL) {
+            clReleaseMemObject(handles[i]);
+        }
+    }
+}
+
+// Frees the host arrays of the GERC test (A, backA, x, y).
+// delete[] on a null pointer does nothing, so NULL arguments are accepted.
+template <typename T> static void
+deleteBuffers(T *A, T *x, T *y, T *backA)
+{
+    delete[] A;
+    delete[] backA;
+    delete[] x;
+    delete[] y;
+}
+
+/*
+ * Correctness test for GERC with element type T.
+ *
+ * Fills host arrays A, x and y with generated data, computes the expected
+ * result into A with the reference BLAS wrapper, runs clMath::clblas::gerc()
+ * on OpenCL buffers created from the same input data, reads the device
+ * result back into backA and compares A against backA with compareMatrices().
+ *
+ * params supplies order, M, N, lda, incx, incy, alpha, the element offsets
+ * offa/offBX/offCY, the random seed and the number of command queues.
+ *
+ * The test is skipped (reported via SUCCEED()) when T is a double precision
+ * type the device does not support, or when a host array or device buffer
+ * cannot be set up. Failures of the clblas call, of waiting for its events
+ * or of reading the result back fail the test through ASSERT_EQ after all
+ * resources have been released.
+ */
+template <typename T>
+void
+gercCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *x, *y, *backA;
+    T alpha_;
+    cl_mem bufA, bufx, bufy;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Host size of A: N * lda elements for column-major, M * lda otherwise.
+    size_t lengthA;
+    if (params->order == clblasColumnMajor) {
+        lengthA = params->N * params->lda;
+    }
+    else {
+        lengthA = params->M * params->lda;
+    }
+
+    // x carries M and y carries N elements, spaced |incx| / |incy| apart.
+    size_t lengthx = (1 + (((params->M)-1) * abs(params->incx)));
+    size_t lengthy = (1 + (((params->N)-1) * abs(params->incy)));
+
+    // alpha is always taken from the test parameters. Assigning it only when
+    // base->useAlpha() is set left alpha_ uninitialized for both calls below.
+    alpha_ = convertMultiplier<T>(params->alpha);
+
+    A     = new T[lengthA + params->offa];
+    x     = new T[lengthx + params->offBX];
+    y     = new T[lengthy + params->offCY];
+    backA = new T[lengthA + params->offa];
+
+    if ((A == NULL) || (backA == NULL) || (x == NULL) || (y == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    int creationFlags = 0;
+    creationFlags = creationFlags | RANDOM_INIT;
+    creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
+    BlasRoutineID BlasFn = CLBLAS_GER;
+
+    populate( (A + params->offa), params->M, params->N, params-> lda, BlasFn, creationFlags);
+    populate( (x + params->offBX), lengthx, 1, lengthx, BlasFn );
+    populate( (y + params->offCY), lengthy, 1, lengthy, BlasFn );
+
+    // Copy A to backA: backA is the host destination for the device result.
+    memcpy(backA, A, (lengthA + params->offa) * sizeof(T));
+
+    // Create the device buffers from the host data (checked for NULL below).
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE);
+    bufx = base->createEnqueueBuffer(x, (lengthx + params->offBX) * sizeof(*x), 0, CL_MEM_READ_ONLY);
+    bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Done" << ::std::endl;
+    ::std::cerr << "Calling reference xGERC routine... ";
+
+    clblasOrder fOrder;
+    size_t fN, fM;
+    size_t fOffx, fOffy;
+    int fIncx, fIncy;
+    T *fX, *fY;
+    fOrder = params->order;
+    fM = params->M;
+    fN = params->N;
+    fIncx = params->incx;
+    fIncy = params->incy;
+    fX = x;
+    fY = y;
+    fOffx = params->offBX;
+    fOffy = params->offCY;
+
+    if (fOrder != clblasColumnMajor) {
+        // Row-major input: conjugate y in place on the host, swap the roles
+        // of x/y and of M/N, and call the column-major ger() reference.
+        doConjugate( (y + params->offCY), lengthy, 1, 1 );
+        fOrder = clblasColumnMajor;
+        fM = params->N;
+        fN = params->M;
+        fX = y;
+        fY = x;
+        fIncx = params->incy;
+        fIncy = params->incx;
+        fOffx = params->offCY;
+        fOffy = params->offBX;
+        // Note this according to the Legacy guide
+        clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy,  A, params->offa, params->lda);
+    }
+    else {
+        clMath::blas::gerc(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy,  A, params->offa, params->lda);
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufx, bufy);
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xGERC routine... ";
+
+    err = (cl_int)::clMath::clblas::gerc( params->order, params->M, params->N, alpha_,
+                            bufx, params->offBX, params->incx, bufy, params->offCY, params->incy, bufA, params->offa, params->lda,
+                            params->numCommandQueues, base->commandQueues(), 0, NULL, events );
+
+    // Resources are released before ASSERT_EQ because a failed ASSERT_EQ
+    // returns from this function immediately.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufx, bufy);
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GERC() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufx, bufy);
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read of the device result. If it fails, backA still holds the
+    // untouched copy of the input, so fail here instead of comparing it.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
+        (lengthA + params->offa) * sizeof(*backA), backA, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufx, bufy);
+        deleteBuffers<T>(A, x, y, backA);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+    }
+
+    releaseMemObjects(bufA, bufx, bufy);
+
+    // handle lda correctly based on row-major/col-major..
+    compareMatrices<T>(params->order, params->M , params->N, A+ params->offa, backA + params->offa, params->lda);
+    deleteBuffers<T>(A, x, y, backA);
+    delete[] events;
+}
+
+// Instantiate the test
+
+
+// Parameterized GERC case: runs the GERC correctness check with FloatComplex elements.
+TEST_P(GERC, cgerc) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    gercCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+// Parameterized GERC case: runs the GERC correctness check with DoubleComplex elements.
+TEST_P(GERC, zgerc) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    gercCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-hbmv.cpp b/src/tests/correctness/corr-hbmv.cpp
new file mode 100644
index 0000000..9b7ff8e
--- /dev/null
+++ b/src/tests/correctness/corr-hbmv.cpp
@@ -0,0 +1,223 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <hbmv.h>
+#include <gbmv.h>
+
+// Releases the OpenCL buffers of the HBMV test in the order A, X, Y.
+// Handles that are NULL are skipped.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    cl_mem handles[] = { objA, objX, objY };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] == NULL) {
+            continue;
+        }
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+// Frees the host arrays of the HBMV test: the matrix A, the vector X, the
+// reference result blasY and the array holding the clblas result, clblasY.
+// delete[] on a null pointer does nothing, so NULL arguments are accepted.
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *blasY, T *clblasY)
+{
+    delete[] A;
+    delete[] X;
+    delete[] blasY;
+    delete[] clblasY;
+}
+
+/*
+ * Correctness test for HBMV with element type T.
+ *
+ * Generates A, X and Y with randomGbmvMatrices(), computes the expected Y
+ * (blasY) with the reference BLAS wrapper, runs clMath::clblas::hbmv() on
+ * OpenCL buffers created from the same input data, reads the device result
+ * back into clblasY and compares both vectors with compareMatrices().
+ *
+ * params supplies order, uplo, N, K, lda, incx, incy, alpha, beta, the
+ * element offsets offA/offBX/offCY, the random seed and the number of
+ * command queues.
+ *
+ * The test is skipped (reported via SUCCEED()) when T is a double precision
+ * type the device does not support, or when a host array or device buffer
+ * cannot be set up. Failures of the clblas call or of waiting for its events
+ * fail the test through ASSERT_EQ after all resources have been released.
+ * A failed read-back is only logged; the comparison still runs.
+ */
+template <typename T>
+void
+hbmvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *X, *blasY, *clblasY;
+    cl_mem bufA, bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+    T alpha, beta;
+    size_t lengthX, lengthY, lengthA;
+
+    base = clMath::BlasBase::getInstance();
+
+    if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // A is sized N * lda; X and Y carry N elements spaced |incx| / |incy| apart.
+    lengthA = params->N * params->lda;
+    lengthX = (params->N - 1)*abs(params->incx) + 1;
+    lengthY = (params->N - 1)*abs(params->incy) + 1;
+
+    A       = new T[lengthA + params->offA ];
+    X       = new T[lengthX + params->offBX ];
+    blasY   = new T[lengthY + params->offCY ];
+    clblasY = new T[lengthY + params->offCY ];
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    if ((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL))
+    {
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    alpha = convertMultiplier<T>(params->alpha);
+    beta = convertMultiplier<T>(params->beta);
+
+    randomGbmvMatrices(params->order, clblasNoTrans, params->N, params->N, &alpha, &beta,
+                        (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy );
+    // Copy blasY to clblasY so both computations start from the same Y.
+    memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Create the device buffers from the host data (checked for NULL below).
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+    bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xHBMV routine... ";
+
+    clblasOrder fOrder;
+    clblasUplo fUplo;
+    fOrder = params->order;
+    fUplo = params->uplo;
+    size_t fN = params->N, fK = params->K;
+
+    if (fOrder != clblasColumnMajor)
+    {
+        // Row-major input: call the reference as column-major with the
+        // opposite triangle, after conjugating the host copy of A in place.
+        fOrder = clblasColumnMajor;
+        fUplo = (params->uplo == clblasLower)? clblasUpper : clblasLower;
+        doConjugate( (A + params->offA), params->N, params->lda, params->lda );
+    }
+
+    clMath::blas::hbmv(fOrder, fUplo, fN, fK, alpha, A, params->offA, params->lda,
+                            X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        // Skip the test, the most probable reason is
+        //     matrix too big for a device.
+
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHBMV routine... ";
+
+    err = (cl_int)clMath::clblas::hbmv(params->order, params->uplo, params->N, params->K,
+                                        alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx,
+                                        beta, bufY, params->offCY, params->incy,
+                                        params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    // Resources are released before ASSERT_EQ because a failed ASSERT_EQ
+    // returns from this function immediately.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HBMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read of the device result into clblasY.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "HBMV: Reading results failed...." << std::endl;
+    }
+
+    releaseMemObjects(bufA, bufX, bufY);
+    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY),
+                       lengthY);
+    deleteBuffers<T>(A, X, blasY, clblasY);
+    delete[] events;
+}
+
+// Instantiate the test
+// Parameterized HBMV case: runs the HBMV correctness check with FloatComplex elements.
+TEST_P(HBMV, chbmv) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    hbmvCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+// Parameterized HBMV case: runs the HBMV correctness check with DoubleComplex elements.
+TEST_P(HBMV, zhbmv) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    hbmvCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-hemm.cpp b/src/tests/correctness/corr-hemm.cpp
new file mode 100644
index 0000000..feb5f2a
--- /dev/null
+++ b/src/tests/correctness/corr-hemm.cpp
@@ -0,0 +1,256 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <hemm.h>
+#include<cltypes.h>
+
+// Releases the OpenCL buffers of the HEMM test in the order A, B, C.
+// Handles that are NULL are skipped.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC)
+{
+    if (objA != NULL) { clReleaseMemObject(objA); }
+    if (objB != NULL) { clReleaseMemObject(objB); }
+    if (objC != NULL) { clReleaseMemObject(objC); }
+}
+
+// Frees the host arrays of the HEMM test: A, B, the reference result C and
+// backC (the copy of the original C that later receives the clblas result).
+// delete[] on a null pointer does nothing, so NULL arguments are accepted.
+template <typename T> static void
+deleteBuffers(T *A, T *B, T *C, T *backC)
+{
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    delete[] backC;
+}
+
+/*
+ * Correctness test for HEMM with element type T.
+ *
+ * Fills host matrices A, B and C with generated data, keeps a copy of C in
+ * backC, computes the expected result into C with the reference BLAS
+ * wrapper, runs clMath::clblas::hemm() on OpenCL buffers created from A, B
+ * and backC, reads the device result back into backC and compares C with
+ * backC through compareMatrices().
+ *
+ * The test is skipped (reported via SUCCEED()) when T is DoubleComplex and
+ * the device lacks double precision support, or when a host array or device
+ * buffer cannot be set up. Failures of the clblas call or of waiting for its
+ * events fail the test through ASSERT_EQ after all resources have been
+ * released. A failed read-back is only logged.
+ */
+template <typename T>
+void
+hemmCorrectnessTest(TestParams *params)
+{
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    cl_event *events = new cl_event[params->numCommandQueues];
+    if (events == NULL)
+    {
+        std::cerr << ">> WARNING: Unable to allocate memory for events"  <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // A is ka x ka: ka is M when A multiplies from the left, N otherwise.
+    const size_t ka = (params->side == clblasLeft) ? params->M : params->N;
+    // B and C are sized kbc * ld: kbc is N for column-major, M otherwise.
+    const size_t kbc = (params->order == clblasColumnMajor) ? params->N : params->M;
+
+    const size_t lengthA = ka  * params->lda;
+    const size_t lengthB = kbc * params->ldb;
+    const size_t lengthC = kbc * params->ldc;
+
+    T alpha_ = convertMultiplier<T>(params->alpha);
+    T beta_  = convertMultiplier<T>(params->beta);
+
+    T *A     = new T[ lengthA + params->offA ];
+    T *B     = new T[ lengthB + params->offBX ];
+    T *C     = new T[ lengthC + params->offCY ];
+    T *backC = new T[ lengthC + params->offCY ];
+
+    if ((A == NULL) || (B == NULL) || (C == NULL) || (backC == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(A, B, C, backC);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... " << std::endl;
+
+    // B and C use the plain flags; A additionally selects the stored triangle.
+    int creationFlags = RANDOM_INIT;
+    if (params->order == clblasRowMajor) {
+        creationFlags = creationFlags | ROW_MAJOR_ORDER;
+    }
+    int AcreationFlags;
+    if (params->uplo == clblasLower) {
+        AcreationFlags = creationFlags | LOWER_HALF_ONLY;
+    }
+    else {
+        AcreationFlags = creationFlags | UPPER_HALF_ONLY;
+    }
+    BlasRoutineID BlasFn = CLBLAS_HEMM;
+
+    populate( A + params->offA , ka, ka, params->lda, BlasFn, AcreationFlags);
+    populate( B + params->offBX , params->M, params->N, params->ldb, BlasFn, creationFlags);
+    populate( C + params->offCY , params->M, params->N, params->ldc, BlasFn, creationFlags);
+
+    // backC keeps the original C; the device buffer for C is created from it.
+    memcpy(backC, C, (lengthC + params->offCY) * sizeof(T));
+
+    // Create the device buffers from the host data (checked for NULL below).
+    cl_mem bufA = base->createEnqueueBuffer(A, (lengthA + params->offA) * sizeof(T), 0, CL_MEM_READ_ONLY);
+    cl_mem bufB = base->createEnqueueBuffer(B, (lengthB + params->offBX) * sizeof(T), 0, CL_MEM_READ_ONLY);
+    cl_mem bufC = base->createEnqueueBuffer(backC, (lengthC + params->offCY) * sizeof(T), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Done" << ::std::endl;
+    ::std::cerr << "Calling reference xHEMM routine... ";
+
+    clblasOrder refOrder = params->order;
+    clblasUplo  refUplo  = params->uplo;
+    clblasSide  refSide  = params->side;
+    size_t      refM     = params->M;
+    size_t      refN     = params->N;
+
+    if (refOrder != clblasColumnMajor) {
+        // Row-major input: call the reference as column-major with M/N
+        // swapped and with the opposite side and triangle.
+        refOrder = clblasColumnMajor;
+        refM = params->N;
+        refN = params->M;
+        refSide = (params->side == clblasLeft) ? clblasRight : clblasLeft;
+        refUplo = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;
+    }
+
+    // Call reference blas routine
+    clMath::blas::hemm(refOrder, refSide, refUplo, refM, refN, alpha_,
+                       A, params->offA, params->lda, B, params->offBX, params->ldb, beta_, C, params->offCY, params->ldc);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, C, backC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHEMM routine... ";
+
+    cl_int err = (cl_int)::clMath::clblas::hemm( params->order, params->side, params->uplo, params->M, params->N, alpha_,
+                            bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta_, bufC, params->offCY, params->ldc,
+                            params->numCommandQueues, base->commandQueues(), 0, NULL, events );
+
+    // Resources are released before ASSERT_EQ because a failed ASSERT_EQ
+    // returns from this function immediately.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, C, backC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HEMM() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, C, backC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read of the device result into backC; a failure is only logged.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0,
+        (lengthC + params->offCY) * sizeof(T), backC, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "WARNING: corr-hemm: Erorr reading buffer..." << err << ::std::endl;
+    }
+
+    releaseMemObjects(bufA, bufB, bufC);
+
+    // Compare the reference result C against the device result in backC.
+    compareMatrices<T>(params->order, params->M , params->N, (C + params->offCY), (backC + params->offCY), params->ldc);
+    deleteBuffers<T>(A, B, C, backC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+// Parameterized HEMM case: runs the HEMM correctness check with FloatComplex elements.
+TEST_P(HEMM, chemm) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    hemmCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+// Parameterized HEMM case: runs the HEMM correctness check with DoubleComplex elements.
+TEST_P(HEMM, zhemm) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    hemmCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-hemv.cpp b/src/tests/correctness/corr-hemv.cpp
new file mode 100644
index 0000000..41bcb62
--- /dev/null
+++ b/src/tests/correctness/corr-hemv.cpp
@@ -0,0 +1,256 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <hemv.h>
+
+// Releases the OpenCL buffers of the HEMV test in the order A, X, Y.
+// Handles that are NULL are skipped.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    cl_mem handles[] = { objA, objX, objY };
+    const size_t count = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (handles[i] != NULL) {
+            clReleaseMemObject(handles[i]);
+        }
+    }
+}
+
+// Frees the host arrays of the HEMV test: the matrix A, the vector X, the
+// reference result blasY and the array holding the clblas result, clblasY.
+// delete[] on a null pointer does nothing, so NULL arguments are accepted.
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *blasY, T *clblasY)
+{
+    delete[] A;
+    delete[] X;
+    delete[] blasY;
+    delete[] clblasY;
+}
+/*
+template <typename T> static
+void printVector(T *data, size_t length)
+{
+	for(int i =0; i < length; i ++)
+	{
+		printf("(%20f, %20f)\n", data[i].s[0], data[i].s[1]);
+	}
+}
+*/
+/*
+ * Correctness test for HEMV with element type T.
+ *
+ * Generates A, X and Y with randomHemvMatrices(), computes the expected Y
+ * (blasY) with the reference BLAS wrapper, runs clMath::clblas::hemv() on
+ * OpenCL buffers created from the same input data, reads the device result
+ * back into clblasY and compares both vectors with compareMatrices().
+ *
+ * The test is skipped (reported via SUCCEED()) when T is DoubleComplex and
+ * the device lacks double precision support, or when a host array or device
+ * buffer cannot be set up. Failures of the clblas call or of waiting for its
+ * events fail the test through ASSERT_EQ after all resources have been
+ * released. A failed read-back is only logged.
+ */
+template <typename T>
+void
+hemvCorrectnessTest(TestParams *params)
+{
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    cl_event *events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // A is sized N * lda; X and Y carry N elements spaced |incx| / |incy| apart.
+    const size_t lengthA = params->N * params->lda;
+    const size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    const size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));
+
+    T *A       = new T[lengthA + params->offA ];
+    T *X       = new T[lengthX + params->offBX ];
+    T *blasY   = new T[lengthY + params->offCY ];
+    T *clblasY = new T[lengthY + params->offCY ];
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    if ((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL))
+    {
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    T alpha = convertMultiplier<T>(params->alpha);
+    T beta  = convertMultiplier<T>(params->beta);
+
+    randomHemvMatrices(params->order, params->uplo, params->N, true, &alpha, (A + params->offA), params->lda,
+                       (X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy);
+    // Copy blasY to clblasY so both computations start from the same Y.
+    memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Create the device buffers from the host data (checked for NULL below).
+    cl_mem bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY);
+    cl_mem bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+    cl_mem bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xHEMV routine... ";
+
+    clblasOrder refOrder = params->order;
+    clblasUplo  refUplo  = params->uplo;
+
+    if (refOrder != clblasColumnMajor)
+    {
+        // Row-major input: call the reference as column-major with the
+        // opposite triangle, after conjugating the host copy of A in place.
+        refOrder = clblasColumnMajor;
+        refUplo = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;
+        doConjugate( (A + params->offA), params->N, params->N, params->lda );
+    }
+    ::clMath::blas::hemv( refOrder, refUplo, params->N, alpha, A, params->offA, params->lda, X, params->offBX, params->incx,
+                          beta, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        // Skip the test, the most probable reason is
+        //     matrix too big for a device.
+
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHEMV routine... ";
+
+    cl_int err = (cl_int)::clMath::clblas::hemv(params->order, params->uplo, params->N, alpha, bufA,
+                        params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy,
+                        params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    // Resources are released before ASSERT_EQ because a failed ASSERT_EQ
+    // returns from this function immediately.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HEMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read of the device result into clblasY; a failure is only logged.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "HEMV: Reading results failed...." << std::endl;
+    }
+
+    releaseMemObjects(bufA, bufX, bufY);
+
+    // Both results are compared as lengthY x 1 column-major matrices.
+    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY),
+                       lengthY);
+    deleteBuffers<T>(A, X, blasY, clblasY);
+    delete[] events;
+}
+
+// Instantiate the test
+
+// Parameterized HEMV case: runs the HEMV correctness check with FloatComplex elements.
+TEST_P(HEMV, chemv) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    hemvCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+// Parameterized HEMV case: runs the HEMV correctness check with DoubleComplex elements.
+TEST_P(HEMV, zhemv) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    hemvCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-her.cpp b/src/tests/correctness/corr-her.cpp
new file mode 100644
index 0000000..34b5799
--- /dev/null
+++ b/src/tests/correctness/corr-her.cpp
@@ -0,0 +1,210 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <her.h>
+
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX)
+{
+    // Release whichever of the two OpenCL buffers exist; NULL handles are skipped.
+    cl_mem handles[] = { objA, objX };
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] != NULL) clReleaseMemObject(handles[i]);
+    }
+}
+
+/*
+ * Frees the three host-side arrays used by the HER correctness test.
+ * Each argument must be NULL or a pointer obtained from new[];
+ * NULL arguments are ignored.
+ */
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *backA)
+{
+    T *arrays[] = { A, X, backA };
+
+    for (size_t i = 0; i < sizeof(arrays) / sizeof(arrays[0]); i++) {
+        if (arrays[i] != NULL) {
+            delete[] arrays[i];
+        }
+    }
+}
+
+/*
+ * xHER correctness test for element type T: the host result of the reference
+ * clMath::blas::her (in A) is compared with the clblas her result read back
+ * from the device (into backA). Skips via SUCCEED() when the device lacks
+ * double precision or a host/device buffer cannot be created.
+ */
+template <typename T> void
+herCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *X, *backA;
+	T alpha_;
+    cl_mem bufA, bufX;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One zero-initialised cl_event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Sizes in elements: N * lda for the matrix, N entries |incx| apart for X.
+    size_t lengthA = params->N * params->lda;
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    alpha_ = convertMultiplier<T>(params->alpha);
+
+    A 	    = new T[lengthA + params->offa ];
+    backA 	= new T[lengthA + params->offa ];
+    X		= new T[lengthX + params->offBX ];
+
+	if((A == NULL) || (backA == NULL) || (X == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(A, X, backA);
+        delete[] events;
+		SUCCEED();
+        return;
+    }
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+	randomHerMatrices( params->order, params->uplo, params->N, &alpha_, (A + params->offa), params->lda, (X + params->offBX), params->incx );
+    memcpy(backA, A, (lengthA + params->offa)* sizeof(*A));
+	::std::cerr << "Done" << ::std::endl;
+
+	// Create the device buffers from A and X before the reference call below modifies A
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX) * sizeof(*X), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xHER routine... ";
+    // The reference always runs column-major: row-major input is mapped by conjugating X and flipping uplo.
+    clblasOrder fOrder;
+    clblasUplo fUplo;
+    fOrder = params->order;
+    fUplo = params->uplo;
+
+    if (fOrder != clblasColumnMajor) {
+        doConjugate( (X + params->offBX), (1 + (params->N-1) * abs(params->incx)), 1, 1 );
+        fOrder = clblasColumnMajor;
+		fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
+	}
+	clMath::blas::her( fOrder, fUplo, params->N, CREAL(alpha_), X , params->offBX, params->incx, A, params->offa, params->lda );
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL) ) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(backA, A, X);
+        delete[] events;
+		if(bufA == NULL)
+		{
+			::std::cerr << "BufA is null, lengthA is " << lengthA << ::std::endl;
+		}
+		if(bufX == NULL)
+		{
+			::std::cerr << "BufX is null, lengthX is  " << lengthX << ::std::endl;
+		}
+
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHER routine... ";
+
+    err = (cl_int)::clMath::clblas::her( params->order, params->uplo, params->N, CREAL(alpha_),
+						bufX, params->offBX, params->incx, bufA, params->offa, params->lda,
+						params->numCommandQueues, base->commandQueues(),
+    					0, NULL, events);
+    // Resources are freed before ASSERT_EQ because a fatal assertion returns from this function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(backA, A, X);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(backA, A, X);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read (CL_TRUE) of the device matrix, including its leading offset, into backA.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
+        (lengthA + params->offa) * sizeof(*A), backA, 0,
+        NULL, NULL);
+    // A failed read leaves backA stale: record a non-fatal failure so the cleanup below still runs.
+    EXPECT_EQ(CL_SUCCESS, err) << "HER: Reading results failed....";
+
+    releaseMemObjects(bufA, bufX);
+	printf("Comparing the results\n");
+	compareMatrices<T>(params->order, params->N , params->N, (A + params->offa), (backA + params->offa),
+                       params->lda);
+
+	deleteBuffers<T>( A, backA, X);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(HER, cher) {
+    // Run the HER correctness check with FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    herCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(HER, zher) {
+    // Run the HER correctness check with DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    herCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-her2.cpp b/src/tests/correctness/corr-her2.cpp
new file mode 100644
index 0000000..5d18e1d
--- /dev/null
+++ b/src/tests/correctness/corr-her2.cpp
@@ -0,0 +1,224 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <her2.h>
+
+/*
+ * Releases the OpenCL buffers of the HER2 test in the order A, X, Y.
+ * NULL handles are skipped.
+ */
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    cl_mem handles[] = { objA, objX, objY };
+    const size_t count = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (handles[i] != NULL) {
+            clReleaseMemObject(handles[i]);
+        }
+    }
+}
+
+/*
+ * Frees the host arrays of the HER2 test.
+ *
+ * blasA, clblasA - the two matrix arrays
+ * X, Y           - the two vector arrays
+ *
+ * Every argument must be NULL or come from new[]; NULL is skipped.
+ */
+template <typename T> static void
+deleteBuffers(T *blasA, T *clblasA, T *X, T *Y)
+{
+    T *arrays[] = { blasA, clblasA, X, Y };
+    const size_t count = sizeof(arrays) / sizeof(arrays[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (arrays[i] != NULL) {
+            delete[] arrays[i];
+        }
+    }
+}
+
+/*
+ * xHER2 correctness test for element type T: the reference
+ * ::clMath::blas::her2 updates host matrix blasA, the clblas her2 updates a
+ * device copy of the same input, and the device result is read back into
+ * clblasA and compared with blasA. Skips via SUCCEED() on unsupported setups.
+ */
+template <typename T> void
+her2CorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *blasA, *clblasA, *X, *Y;
+    cl_mem bufA, bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+	T alpha;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double2)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One zero-initialised cl_event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Sizes in elements: N * lda for the matrix, N entries |inc| apart for each vector.
+    size_t lengthA = params->N * params->lda;
+    size_t lengthX = (1 + ((params->N - 1) * abs(params->incx)));
+	size_t lengthY = (1 + ((params->N - 1) * abs(params->incy)));
+
+    blasA 		= new T[lengthA + params->offa ];
+    clblasA 	= new T[lengthA + params->offa ];
+    X		 	= new T[lengthX + params->offBX ];
+	Y			= new T[lengthY + params->offCY ];
+
+    srand(params->seed);
+
+	if((blasA == NULL) || (clblasA == NULL) || (X == NULL) || (Y == NULL))
+	{
+		deleteBuffers<T>(blasA, clblasA, X, Y);
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+	}
+	alpha =  convertMultiplier<T>(params->alpha);
+
+    ::std::cerr << "Generating input data... ";
+
+    randomHer2Matrices<T>(params->order, params->uplo, params->N, &alpha, (blasA + params->offa), params->lda,
+							(X + params->offBX), params->incx, (Y + params->offCY), params->incy);
+
+	// Copy blasA to clblasA so both routines start from the same matrix
+    memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA));
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Allocate buffers
+    bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa)* sizeof(*clblasA), 0,CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+	bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xHER2 routine... ";
+    // The reference always runs column-major: row-major input conjugates X and Y, flips uplo and swaps X/Y.
+	clblasOrder order;
+    clblasUplo fUplo;
+	order = params->order;
+    fUplo = params->uplo;
+
+	if (order != clblasColumnMajor)
+    {
+		doConjugate( (X + params->offBX), 1, (1 + (params->N-1) * abs(params->incx)), (1 + (params->N-1) * abs(params->incx)) );
+        doConjugate( (Y + params->offCY), 1, (1 + (params->N-1) * abs(params->incy)), (1 + (params->N-1) * abs(params->incy)) );
+        order = clblasColumnMajor;
+        fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
+		::clMath::blas::her2( order, fUplo, params->N, alpha, Y, params->offCY, params->incy, X, params->offBX, params->incx, blasA, params->offa, params->lda);
+    }
+	else {
+		::clMath::blas::her2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasA, params->offa, params->lda);
+	}
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(blasA, clblasA, X, Y);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHER2 routine... ";
+
+    err = (cl_int)::clMath::clblas::her2( params->order, params->uplo, params->N, alpha,
+						bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufA, params->offa, params->lda,
+						params->numCommandQueues, base->commandQueues(),
+    					0, NULL, events);
+    // Resources are freed before ASSERT_EQ because a fatal assertion returns from this function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(blasA, clblasA, X, Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER2() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(blasA, clblasA, X, Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read (CL_TRUE) of the device matrix, including its leading offset, into clblasA.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
+        (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0,
+        NULL, NULL);
+    // A failed read leaves clblasA stale: record a non-fatal failure so the cleanup below still runs.
+    EXPECT_EQ(CL_SUCCESS, err) << "HER2: Reading results failed....";
+
+    releaseMemObjects(bufA, bufX, bufY);
+	compareMatrices<T>(params->order, params->N , params->N, (blasA + params->offa), (clblasA + params->offa),
+                       params->lda);
+
+	deleteBuffers<T>(blasA, clblasA, X, Y);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(HER2, cher2) {
+    // Run the HER2 correctness check with FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    her2CorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(HER2, zher2) {
+    // Run the HER2 correctness check with DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    her2CorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-her2k.cpp b/src/tests/correctness/corr-her2k.cpp
new file mode 100644
index 0000000..d7db83a
--- /dev/null
+++ b/src/tests/correctness/corr-her2k.cpp
@@ -0,0 +1,212 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <her2k.h>
+
+// Releases up to three OpenCL buffers (in parameter order); NULL handles are skipped.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objC, cl_mem objB)
+{
+    cl_mem handles[3] = { objA, objC, objB };
+
+    for (int idx = 0; idx < 3; ++idx) {
+        if (handles[idx]) clReleaseMemObject(handles[idx]);
+    }
+}
+
+// Frees the four host arrays of the HER2K test; NULL pointers are skipped.
+template <typename T> static void
+deleteBuffers(T *A, T *B, T *blasC, T *clblasC)
+{
+    T *arrays[4] = { A, B, blasC, clblasC };
+    for (int idx = 0; idx < 4; ++idx) {
+        if (arrays[idx] == NULL) {
+            continue;
+        }
+        delete[] arrays[idx];
+    }
+}
+
+// xHER2K correctness test for element type T: runs the reference ::clMath::blas::her2k on
+// blasC and the clblas her2k on a device copy, then compares blasC with the result read
+// back into clblasC. Skips via SUCCEED() for clblasTrans and for unsupported/failed setups.
+template <typename T> void
+her2kCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *B, *blasC, *clblasC;
+    T alpha, beta;
+    cl_mem bufA, bufC, bufB;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    if (params->transA == clblasTrans) {
+        ::std::cerr << ">> her2k(TRANSPOSE) for complex numbers "
+                           "is not allowed." << ::std::endl <<
+                           ">> Test skipped." << ::std::endl;
+            SUCCEED();
+            return;
+        }
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    A = new T[params->rowsA * params->columnsA];
+    B = new T[params->rowsB * params->columnsB];
+    blasC = new T[params->rowsC * params->columnsC];
+    clblasC = new T[params->rowsC * params->columnsC];
+
+	if((A == NULL) || (B == NULL) || (blasC == NULL) || (clblasC == NULL))
+	{
+		deleteBuffers<T>(A, B, blasC, clblasC);
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+	}
+
+    srand(params->seed);
+
+    alpha = convertMultiplier<T>(params->alpha);
+    beta  = convertMultiplier<T>(params->beta);
+
+    ::std::cerr << "Generating input data... ";
+
+    clblasTranspose ftransB = (params->transA==clblasNoTrans)? clblasConjTrans: clblasNoTrans;
+
+    randomGemmMatrices<T>(params->order, params->transA, ftransB,
+                                params->N, params->N, params->K, true, &alpha, A, params->lda,
+                                B, params->ldb, true, &beta, blasC, params->ldc);
+
+    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));
+    ::std::cerr << "Done" << ::std::endl;
+
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*B), params->offBX * sizeof(*B),
+                                     CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC),
+                                     params->offCY * sizeof(*clblasC),
+                                     CL_MEM_READ_WRITE);
+
+    if ((bufA == NULL) || (bufB == NULL)|| (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling reference xHER2K routine... ";
+    T fAlpha = alpha;
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::her2k(clblasColumnMajor, params->uplo, params->transA,
+                                params->N, params->K, fAlpha, A, 0, params->lda, B, 0, params->ldb,
+                                CREAL(beta), blasC, 0, params->ldc);
+    }
+    else {
+		CIMAG( fAlpha ) *= -1.0;        // According to netlib C- interface
+        clblasTranspose fTransA = (params->transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;
+		clblasUplo      fUplo   = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;
+
+		::clMath::blas::her2k(clblasColumnMajor, fUplo, fTransA, params->N, params->K, fAlpha,
+						        A, 0, params->lda, B, 0, params->ldb, CREAL(beta), blasC, 0, params->ldc);
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling clblas xHER2K routine... ";
+    err = (cl_int)::clMath::clblas::her2k(params->order, params->uplo,
+                                         params->transA, params->N, params->K,
+                                         alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb,
+                                         CREAL(beta), bufC, params->offCY,
+                                         params->ldc, params->numCommandQueues,
+                                         base->commandQueues(), 0, NULL,
+                                         events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER2K() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read (CL_TRUE) into clblasC; a failed read is recorded as a non-fatal failure so cleanup still runs.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC),
+                        params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL);
+    EXPECT_EQ(CL_SUCCESS, err) << "HER2K: Reading results failed....";
+
+    releaseMemObjects(bufA, bufB, bufC);
+    compareMatrices<T>(params->order, params->N, params->N, blasC, clblasC, params->ldc);
+
+    deleteBuffers<T>(A, B, blasC, clblasC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(HER2K, cher2k) {
+    // Run the HER2K correctness check with FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    her2kCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(HER2K, zher2k) {
+    // Run the HER2K correctness check with DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    her2kCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-herk.cpp b/src/tests/correctness/corr-herk.cpp
new file mode 100644
index 0000000..2b5d8ab
--- /dev/null
+++ b/src/tests/correctness/corr-herk.cpp
@@ -0,0 +1,240 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <herk.h>
+
+/*
+ * Releases the A and C device buffers of the HERK test.
+ * Handles that are NULL are left alone.
+ */
+static void
+releaseMemObjects(cl_mem objA, cl_mem objC)
+{
+    cl_mem handles[2] = { objA, objC };
+    for (int idx = 0; idx < 2; ++idx) {
+        if (handles[idx] != NULL) clReleaseMemObject(handles[idx]);
+    }
+}
+
+/*
+ * Frees the three host-side arrays used by the HERK correctness test.
+ * Each argument must be NULL or a pointer obtained from new[];
+ * NULL arguments are ignored.
+ */
+template <typename T> static void
+deleteBuffers(T *A, T *blasC, T *clblasC)
+{
+    T *arrays[] = { A, blasC, clblasC };
+
+    for (size_t i = 0; i < sizeof(arrays) / sizeof(arrays[0]); i++) {
+        if (arrays[i] != NULL) {
+            delete[] arrays[i];
+        }
+    }
+}
+
+/*
+ * xHERK correctness test for element type T.
+ *
+ * Runs the reference ::clMath::blas::herk on the host matrix blasC and the
+ * clblas herk on a device copy of the same input, then compares blasC with
+ * the result read back into clblasC. Skips via SUCCEED() for clblasTrans,
+ * missing double-precision support, or a buffer that cannot be created.
+ */
+template <typename T> void
+herkCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *blasC, *clblasC;
+    T alpha, beta;
+    cl_mem bufA, bufC;
+    clMath::BlasBase *base;
+    bool useAlpha;
+    bool useBeta;
+    cl_event *events;
+
+    if (params->transA == clblasTrans) {
+        ::std::cerr << ">> herk(TRANSPOSE) for complex numbers "
+                           "is not allowed." << ::std::endl <<
+                           ">> Test skipped." << ::std::endl;
+            SUCCEED();
+            return;
+        }
+
+    base = clMath::BlasBase::getInstance();
+    alpha = ZERO<T>();
+    beta = ZERO<T>();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    // One zero-initialised cl_event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Host matrices; clblasC becomes a copy of blasC once the input is generated.
+    A = new T[params->rowsA * params->columnsA];
+    blasC = new T[params->rowsC * params->columnsC];
+    clblasC = new T[params->rowsC * params->columnsC];
+
+	if((A == NULL) || (blasC == NULL) || (clblasC == NULL))
+	{
+		deleteBuffers<T>(A, blasC, clblasC);
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+	}
+
+    srand(params->seed);
+
+	useAlpha = true;
+	useBeta  = true;
+
+    alpha = convertMultiplier<T>(params->alpha);
+    beta  = convertMultiplier<T>(params->beta);
+
+    ::std::cerr << "Generating input data... ";
+
+    randomGemmMatrices<T>(params->order, params->transA, clblasNoTrans,
+        params->N, params->N, params->K, useAlpha, &alpha, A, params->lda,
+        NULL, 0, useBeta, &beta, blasC, params->ldc);
+    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xHERK routine... ";
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::herk(clblasColumnMajor, params->uplo, params->transA,
+                          params->N, params->K, CREAL(alpha), A, params->lda,
+                          CREAL(beta), blasC, params->ldc);
+    }
+    else {
+		// Row-major input: the reference is still called as column-major, with
+		// uplo flipped and NoTrans/ConjTrans exchanged. (An earlier variant that
+		// physically reordered A and C via reorderMatrix() was disabled in favour
+		// of this mapping.)
+		clblasTranspose fTransA = (params->transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;
+		clblasUplo      fUplo   = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;
+
+		::clMath::blas::herk(clblasColumnMajor, fUplo, fTransA, params->N, params->K, CREAL(alpha),
+						 A, params->lda, CREAL(beta), blasC, params->ldc);
+
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Device buffers are built from A and from clblasC, which still holds the
+    // generated input; blasC already contains the reference result here.
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
+                                     sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
+                                     sizeof(*clblasC),
+                                     params->offCY * sizeof(*clblasC),
+                                     CL_MEM_READ_WRITE);
+
+    if ((bufA == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufC);
+        deleteBuffers<T>(A, blasC, clblasC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHERK routine... ";
+    err = (cl_int)::clMath::clblas::herk(params->order, params->uplo,
+                                         params->transA, params->N, params->K,
+                                         CREAL(alpha), bufA, params->offA, params->lda,
+                                         CREAL(beta), bufC, params->offCY,
+                                         params->ldc, params->numCommandQueues,
+                                         base->commandQueues(), 0, NULL,
+                                         events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufC);
+        deleteBuffers<T>(A, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HERK() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufC);
+        deleteBuffers<T>(A, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of C back into clblasC. A failed read would leave
+    // clblasC stale, so it is recorded as a non-fatal failure and cleanup still runs.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE,
+                        params->offCY * sizeof(*clblasC),
+                        params->rowsC * params->columnsC * sizeof(*clblasC),
+                        clblasC, 0, NULL, NULL);
+    EXPECT_EQ(CL_SUCCESS, err) << "HERK: Reading results failed....";
+
+    releaseMemObjects(bufA, bufC);
+    compareMatrices<T>(params->order, params->N, params->N, blasC, clblasC,
+                       params->ldc);
+
+    deleteBuffers<T>(A, blasC, clblasC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(HERK, cherk) {
+    // Run the HERK correctness check with FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    herkCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(HERK, zherk) {
+    // Run the HERK correctness check with DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    herkCorrectnessTest<DoubleComplex>(&testParams);
+}
+
diff --git a/src/tests/correctness/corr-hpmv.cpp b/src/tests/correctness/corr-hpmv.cpp
new file mode 100644
index 0000000..9531774
--- /dev/null
+++ b/src/tests/correctness/corr-hpmv.cpp
@@ -0,0 +1,221 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <hpmv.h>
+
+/* Release each non-NULL OpenCL memory object used by the HPMV test. */
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    cl_mem handles[3];
+
+    handles[0] = objA;
+    handles[1] = objX;
+    handles[2] = objY;
+    for (size_t i = 0; i < 3; i++) {
+        if (handles[i] == NULL) {
+            continue;
+        }
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+/*
+ * Free the host-side arrays of the HPMV test.
+ *
+ *   A       - matrix array
+ *   X       - input vector
+ *   blasY   - result vector of the reference BLAS call
+ *   clblasY - result vector of the clblas HPMV call
+ *
+ * Any argument may be NULL: delete[] on a null pointer does nothing, so no
+ * explicit guards are required.
+ */
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *blasY, T *clblasY)
+{
+    T *hostArrays[] = { A, X, blasY, clblasY };
+
+    for (size_t i = 0; i < sizeof(hostArrays) / sizeof(hostArrays[0]); i++) {
+        delete[] hostArrays[i];
+    }
+}
+
+/*
+ * Correctness test for xHPMV: feeds the same randomly generated AP, X and Y
+ * to the reference BLAS wrapper and to clblas, then compares both Y results.
+ *
+ * params - test parameters; order, uplo, N, alpha, beta, lda, offA/offBX/
+ *          offCY, incx/incy, seed and numCommandQueues are read here.
+ * The test is skipped (reported as success) when DoubleComplex is requested
+ * on a device without double precision, or when buffer creation fails.
+ */
+template <typename T>
+void
+hpmvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *AP, *X, *blasY, *clblasY;
+    cl_mem bufAP, bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+    T alpha, beta;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t lengthA = (params->N * (params->N + 1)) / 2;
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));
+
+    AP      = new T[lengthA + params->offA ];
+    X       = new T[lengthX + params->offBX ];
+    blasY   = new T[lengthY + params->offCY ];
+    clblasY = new T[lengthY + params->offCY ];
+
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+    if((AP == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL))
+    {
+        deleteBuffers<T>(AP, X, blasY, clblasY);
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    alpha = convertMultiplier<T>(params->alpha);
+    beta = convertMultiplier<T>(params->beta);
+
+    randomHemvMatrices(params->order, params->uplo, params->N, true, &alpha, (AP + params->offA), params->lda,
+                        (X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy);
+    // Copy blasY to clblasY so both implementations start from the same Y
+    memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufAP = base->createEnqueueBuffer(AP, (lengthA + params->offA)* sizeof(*AP), 0, CL_MEM_READ_ONLY);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+    bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xHPMV routine... ";
+
+    clblasOrder order = params->order;
+    clblasUplo fUplo = params->uplo;
+
+    // Non-column-major input is handed to the reference routine as
+    // column-major with the opposite triangle and a conjugated AP.
+    if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+        doConjugate( (AP + params->offA), lengthA, 1, 1 );
+    }
+    ::clMath::blas::hpmv( order, fUplo, params->N, alpha, AP, params->offA, X, params->offBX, params->incx,
+                        beta, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        // Skip the test, the most probable reason is
+        //     matrix too big for a device.
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(AP, X, blasY, clblasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHPMV routine... ";
+    err = (cl_int)::clMath::clblas::hpmv(params->order, params->uplo, params->N, alpha, bufAP,
+                        params->offA, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy,
+                        params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(AP, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HPMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(AP, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0,
+        NULL, NULL);
+    // A failed read-back leaves clblasY holding its initial contents, so
+    // record a test failure rather than only logging. EXPECT (not ASSERT)
+    // keeps the cleanup below reachable.
+    EXPECT_EQ(CL_SUCCESS, err) << "HPMV: Reading results failed....";
+    releaseMemObjects(bufAP, bufX, bufY);
+
+    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY),
+                       lengthY);
+    deleteBuffers<T>(AP, X, blasY, clblasY);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(HPMV, chpmv) {
+    // Run the HPMV correctness check for single-precision complex data.
+    TestParams caseParams;
+    getParams(&caseParams);
+    hpmvCorrectnessTest<FloatComplex>(&caseParams);
+}
+
+TEST_P(HPMV, zhpmv) {
+    // Run the HPMV correctness check for double-precision complex data.
+    TestParams caseParams;
+    getParams(&caseParams);
+    hpmvCorrectnessTest<DoubleComplex>(&caseParams);
+}
diff --git a/src/tests/correctness/corr-hpr.cpp b/src/tests/correctness/corr-hpr.cpp
new file mode 100644
index 0000000..7a513c3
--- /dev/null
+++ b/src/tests/correctness/corr-hpr.cpp
@@ -0,0 +1,209 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <hpr.h>
+
+/* Release the matrix and vector device buffers; NULL handles are skipped. */
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX)
+{
+    cl_mem handles[2] = { objA, objX };
+    for (size_t i = 0; i < 2; i++)
+        if (handles[i] != NULL) clReleaseMemObject(handles[i]);
+}
+
+/*
+ * Free the three host arrays owned by the HPR test.
+ *
+ *   A     - matrix array
+ *   X     - vector array
+ *   backA - second matrix array
+ *
+ * delete[] accepts null pointers, so NULL arguments are simply ignored.
+ */
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *backA)
+{
+    delete[] A;
+    delete[] X;
+    delete[] backA;
+}
+
+/*
+ * Correctness test for xHPR: applies the reference BLAS wrapper to the host
+ * copy of AP and clblas to a device copy, then compares the two matrices.
+ *
+ * params - test parameters; order, uplo, N, alpha, lda, offa, offBX, incx,
+ *          seed and numCommandQueues are read here.
+ * The test is skipped (reported as success) when cl_double2 is requested on
+ * a device without double precision, or when buffer creation fails.
+ */
+template <typename T>
+void
+hprCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *AP, *X, *backA;
+    T alpha_;
+    cl_mem bufAP, bufX;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double2)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t lengthAP = (params->N *( params->N + 1 ))/2 ;
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    alpha_ = convertMultiplier<T>(params->alpha);
+
+    AP      = new T[lengthAP + params->offa ];
+    backA   = new T[lengthAP + params->offa ];
+    X       = new T[lengthX + params->offBX ];
+
+    if((AP == NULL) || (backA == NULL) || (X == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(AP, X, backA);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+    randomHerMatrices( params->order, params->uplo, params->N, &alpha_, (AP + params->offa), params->lda, (X + params->offBX), params->incx );
+    memcpy(backA, AP, (lengthAP + params->offa)* sizeof(T));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufAP = base->createEnqueueBuffer(AP, (lengthAP + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX) * sizeof(*X), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xHPR routine... ";
+    clblasOrder fOrder = params->order;
+    clblasUplo fUplo = params->uplo;
+
+    // Non-column-major input is handed to the reference routine as
+    // column-major with the opposite triangle and a conjugated X.
+    if (fOrder != clblasColumnMajor) {
+        doConjugate( (X + params->offBX), (1 + (params->N-1) * abs(params->incx)), 1, 1 );
+        fOrder = clblasColumnMajor;
+        fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
+    }
+    clMath::blas::hpr( fOrder, fUplo, params->N, CREAL(alpha_), X , params->offBX, params->incx, AP, params->offa);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufAP == NULL) || (bufX == NULL) ) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufAP, bufX);
+        deleteBuffers<T>(backA, AP, X);
+        delete[] events;
+        if(bufAP == NULL)
+        {
+            ::std::cerr << "BufA is null, lengthA is " << lengthAP << ::std::endl;
+        }
+        if(bufX == NULL)
+        {
+            ::std::cerr << "BufX is null, lengthX is  " << lengthX << ::std::endl;
+        }
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHPR routine... ";
+    err = (cl_int)::clMath::clblas::hpr( params->order, params->uplo, params->N, CREAL(alpha_),
+                        bufX, params->offBX, params->incx, bufAP, params->offa,
+                        params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX);
+        deleteBuffers<T>(backA, AP, X);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HPR() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX);
+        deleteBuffers<T>(backA, AP, X);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0,
+        (lengthAP + params->offa) * sizeof(T), backA, 0,
+        NULL, NULL);
+    // A failed read-back leaves backA holding the untouched input copy, so
+    // record a test failure rather than only logging. EXPECT (not ASSERT)
+    // keeps the cleanup below reachable.
+    EXPECT_EQ(CL_SUCCESS, err) << "HPR: Reading results failed....";
+    releaseMemObjects(bufAP, bufX);
+    printf("Comparing the results\n");
+
+    compareMatrices<T>(clblasColumnMajor, lengthAP, 1, (AP + params->offa), (backA + params->offa), lengthAP);
+
+    deleteBuffers<T>( AP, backA, X);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(HPR, chpr) {
+    // HPR correctness check on single-precision complex data.
+    TestParams runParams;
+    getParams(&runParams);
+    hprCorrectnessTest<FloatComplex>(&runParams);
+}
+
+TEST_P(HPR, zhpr) {
+    // HPR correctness check on double-precision complex data.
+    TestParams runParams;
+    getParams(&runParams);
+    hprCorrectnessTest<DoubleComplex>(&runParams);
+}
diff --git a/src/tests/correctness/corr-hpr2.cpp b/src/tests/correctness/corr-hpr2.cpp
new file mode 100644
index 0000000..313f167
--- /dev/null
+++ b/src/tests/correctness/corr-hpr2.cpp
@@ -0,0 +1,222 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <hpr2.h>
+
+/*
+ * Release the HPR2 device buffers; NULL handles are skipped.
+ */
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    if (objA) {
+        clReleaseMemObject(objA);
+    }
+    if (objX) {
+        clReleaseMemObject(objX);
+    }
+    if (objY) {
+        clReleaseMemObject(objY);
+    }
+}
+
+/*
+ * Free the host arrays of the HPR2 test.
+ *
+ *   blasA   - first matrix array
+ *   clblasA - second matrix array
+ *   X, Y    - vector arrays
+ *
+ * NULL arguments are harmless because delete[] ignores null pointers.
+ */
+template <typename T> static void
+deleteBuffers(T *blasA, T *clblasA, T *X, T *Y)
+{
+    T *owned[4] = { blasA, clblasA, X, Y };
+    size_t idx = 0;
+
+    while (idx < 4) {
+        delete[] owned[idx];
+        ++idx;
+    }
+}
+
+/*
+ * Correctness test for xHPR2: updates one copy of AP with the reference BLAS
+ * wrapper and another with clblas, then compares the two matrices.
+ *
+ * params - test parameters; order, uplo, N, alpha, lda, offa, offBX, offCY,
+ *          incx, incy, seed and numCommandQueues are read here.
+ * The test is skipped (reported as success) when cl_double2 is requested on
+ * a device without double precision, or when buffer creation fails.
+ */
+template <typename T>
+void
+hpr2CorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *blasAP, *clblasAP, *X, *Y;
+    cl_mem bufAP, bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+    T alpha;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double2)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t lengthAP = (params->N *( params->N + 1 ))/2 ;
+    size_t lengthX = (1 + ((params->N - 1) * abs(params->incx)));
+    size_t lengthY = (1 + ((params->N - 1) * abs(params->incy)));
+
+    blasAP      = new T[lengthAP + params->offa ];
+    clblasAP    = new T[lengthAP + params->offa ];
+    X           = new T[lengthX + params->offBX ];
+    Y           = new T[lengthY + params->offCY ];
+
+    srand(params->seed);
+    if((blasAP == NULL) || (clblasAP == NULL) || (X == NULL) || (Y == NULL))
+    {
+        deleteBuffers<T>(blasAP, clblasAP, X, Y);
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+    alpha =  convertMultiplier<T>(params->alpha);
+
+    ::std::cerr << "Generating input data... ";
+    randomHer2Matrices<T>(params->order, params->uplo, params->N, &alpha, (blasAP + params->offa), params->lda,
+                            (X + params->offBX), params->incx, (Y + params->offCY), params->incy);
+    // Copy blasA to clblasA
+    memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa)* sizeof(*clblasAP), 0,CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+    bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xHPR2 routine... ";
+    clblasOrder order = params->order;
+    clblasUplo fUplo = params->uplo;
+
+    // Non-column-major input is handed to the reference routine as column-major
+    // with the opposite triangle, conjugated X and Y, and the vectors swapped.
+    if (order != clblasColumnMajor)
+    {
+        doConjugate( (X + params->offBX), 1, (1 + (params->N-1) * abs(params->incx)), (1 + (params->N-1) * abs(params->incx)) );
+        doConjugate( (Y + params->offCY), 1, (1 + (params->N-1) * abs(params->incy)), (1 + (params->N-1) * abs(params->incy)) );
+        order = clblasColumnMajor;
+        fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
+        ::clMath::blas::hpr2( order, fUplo, params->N, alpha, Y, params->offCY, params->incy, X, params->offBX, params->incx, blasAP, params->offa);
+    }
+    else {
+        ::clMath::blas::hpr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasAP, params->offa);
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(blasAP, clblasAP, X, Y);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xHPR2 routine... ";
+    err = (cl_int)::clMath::clblas::hpr2( params->order, params->uplo, params->N, alpha,
+                        bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufAP, params->offa,
+                        params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(blasAP, clblasAP, X, Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HPR2() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(blasAP, clblasAP, X, Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0,
+        (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0,
+        NULL, NULL);
+    // A failed read-back leaves clblasAP holding the untouched input copy,
+    // so record a test failure rather than only logging. EXPECT (not
+    // ASSERT) keeps the cleanup below reachable.
+    EXPECT_EQ(CL_SUCCESS, err) << "HPR2: Reading results failed....";
+    releaseMemObjects(bufAP, bufX, bufY);
+
+    compareMatrices<T>(clblasColumnMajor, lengthAP, 1, (blasAP + params->offa), (clblasAP + params->offa), lengthAP);
+    deleteBuffers<T>(blasAP, clblasAP, X, Y);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(HPR2, chpr2) {
+    // Exercise HPR2 with single-precision complex elements.
+    TestParams fixtureParams;
+    getParams(&fixtureParams);
+    hpr2CorrectnessTest<FloatComplex>(&fixtureParams);
+}
+
+TEST_P(HPR2, zhpr2) {
+    // Exercise HPR2 with double-precision complex elements.
+    TestParams fixtureParams;
+    getParams(&fixtureParams);
+    hpr2CorrectnessTest<DoubleComplex>(&fixtureParams);
+}
diff --git a/src/tests/correctness/corr-iamax.cpp b/src/tests/correctness/corr-iamax.cpp
new file mode 100644
index 0000000..81f2bd3
--- /dev/null
+++ b/src/tests/correctness/corr-iamax.cpp
@@ -0,0 +1,206 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <iamax.h>
+
+/* Release the X, result and scratch device buffers of the iAMAX test.
+ * NULL handles are skipped. */
+static void
+releaseMemObjects(cl_mem objX, cl_mem objiAmax, cl_mem objScratch)
+{
+    cl_mem toRelease[] = { objX, objiAmax, objScratch };
+    const size_t count = sizeof(toRelease) / sizeof(toRelease[0]);
+
+    for (size_t slot = 0; slot < count; ++slot)
+    {
+        if (toRelease[slot] != NULL)
+        {
+            clReleaseMemObject(toRelease[slot]);
+        }
+    }
+}
+
+/*
+ * Free the host-side arrays of the iAMAX test; NULL arguments are ignored.
+ *
+ *   blasX       - input vector
+ *   blasiAmax   - reference result, allocated by the test as new int[1]
+ *   clblasiAmax - clblas result array read back from the device
+ *
+ * Every array comes from new[], so each is released with delete[].
+ */
+template <typename T> static void
+deleteBuffers(T *blasX, int *blasiAmax=NULL, int *clblasiAmax=NULL)
+{
+    delete[] blasX;
+    delete[] clblasiAmax;
+    delete[] blasiAmax;
+}
+
+/*
+ * Correctness test for xiAMAX: computes the index with the reference BLAS
+ * wrapper and with clblas on the same random vector and compares the two.
+ *
+ * params - test parameters; N, incx, offBX, offa, seed and numCommandQueues
+ *          are read here.  The test is skipped (reported as success) without
+ *          double precision for double types or when buffer creation fails.
+ */
+template <typename T>
+void
+iamaxCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *blasX;
+    int *clblasiAmax, *blasiAmax;
+    cl_mem bufX, bufiAmax, scratchBuff;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+
+    blasX       = new T[lengthX + params->offBX ];
+    blasiAmax   = new int[1];
+    clblasiAmax = new int[1 + params->offa];
+
+    if((blasX == NULL) || (clblasiAmax == NULL) || (blasiAmax == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(blasX, blasiAmax, clblasiAmax);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+    randomVectors<T>(params->N, (blasX + params->offBX), params->incx, NULL, 0);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_ONLY);
+    bufiAmax = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(int), 0, CL_MEM_READ_WRITE);
+    scratchBuff = base->createEnqueueBuffer(NULL, (2 * lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xiAMAX routine... ";
+    *blasiAmax = ::clMath::blas::iamax( params->N, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufX == NULL) || (bufiAmax == NULL) || (scratchBuff == NULL)) {
+        releaseMemObjects(bufX, bufiAmax, scratchBuff);
+        deleteBuffers<T>(blasX, blasiAmax, clblasiAmax);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xiAMAX routine... ";
+    DataType type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE;
+
+    // Should use bufXTemp as well
+    err = (cl_int)::clMath::clblas::iamax( type, params->N, bufiAmax, params->offa,
+                                           bufX, params->offBX, params->incx, scratchBuff,
+                                            params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufiAmax, scratchBuff);
+        deleteBuffers<T>(blasX, blasiAmax, clblasiAmax);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::iAMAX() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufiAmax, scratchBuff);
+        deleteBuffers<T>(blasX, blasiAmax, clblasiAmax);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufiAmax, CL_TRUE, 0,
+        (1 + params->offa) * sizeof(*clblasiAmax), clblasiAmax, 0, NULL, NULL);
+    // A failed read-back leaves clblasiAmax uninitialised, so record a test
+    // failure rather than only logging. EXPECT (not ASSERT) keeps the
+    // cleanup below reachable.
+    EXPECT_EQ(CL_SUCCESS, err) << "iAMAX: Reading results failed....";
+    compareValues<int>((blasiAmax), (clblasiAmax+params->offa), 0);
+    releaseMemObjects(bufX, bufiAmax, scratchBuff);
+    deleteBuffers<T>(blasX, blasiAmax, clblasiAmax);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(iAMAX, isamax) {
+    // iAMAX on single-precision real vectors.
+    TestParams vecParams;
+    getParams(&vecParams);
+    iamaxCorrectnessTest<cl_float>(&vecParams);
+}
+
+TEST_P(iAMAX, idamax) {
+    // iAMAX on double-precision real vectors.
+    TestParams vecParams;
+    getParams(&vecParams);
+    iamaxCorrectnessTest<cl_double>(&vecParams);
+}
+
+TEST_P(iAMAX, icamax) {
+    // iAMAX on single-precision complex vectors.
+    TestParams vecParams;
+    getParams(&vecParams);
+    iamaxCorrectnessTest<FloatComplex>(&vecParams);
+}
+
+TEST_P(iAMAX, izamax) {
+    // iAMAX on double-precision complex vectors.
+    TestParams vecParams;
+    getParams(&vecParams);
+    iamaxCorrectnessTest<DoubleComplex>(&vecParams);
+}
diff --git a/src/tests/correctness/corr-nrm2.cpp b/src/tests/correctness/corr-nrm2.cpp
new file mode 100644
index 0000000..588ee82
--- /dev/null
+++ b/src/tests/correctness/corr-nrm2.cpp
@@ -0,0 +1,218 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <nrm2.h>
+#include "delta.h"
+
+/*
+ * Release the device buffers of the NRM2 test.
+ *
+ *   objX       - input vector buffer
+ *   objNrm2    - result buffer
+ *   objScratch - scratch buffer
+ *
+ * A NULL handle is skipped.
+ */
+static void
+releaseMemObjects(cl_mem objX, cl_mem objNrm2, cl_mem objScratch)
+{
+    if (objX != NULL)       clReleaseMemObject(objX);
+    if (objNrm2 != NULL)    clReleaseMemObject(objNrm2);
+    if (objScratch != NULL) clReleaseMemObject(objScratch);
+}
+
+/*
+ * Free up to three host arrays; NULL arguments are ignored.
+ *
+ * The test also calls this as deleteBuffers<T2>(blasNRM2, clblasNRM2), so
+ * the second parameter can receive an array allocated with new[].  Every
+ * pointer is therefore released with delete[]; scalar delete on a new[]
+ * array is undefined behaviour.
+ *
+ */
+template <typename T> static void
+deleteBuffers(T *blasX, T *blasNRM2=NULL, T *clblasNRM2=NULL)
+{
+    delete[] blasX;
+    delete[] clblasNRM2;
+    delete[] blasNRM2;
+}
+
+/*
+ * Correctness test for xNRM2: computes the norm of one random vector with
+ * the reference BLAS wrapper and with clblas and compares within a delta.
+ *
+ * T1 is the vector element type, T2 the type of the result.
+ * params - test parameters; N, incx, offBX, offa, seed and numCommandQueues
+ *          are read here.  Skipped (reported as success) on set-up failure.
+ */
+template <typename T1, typename T2>
+void
+nrm2CorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T1 *blasX;
+    T2 *clblasNRM2, *blasNRM2;
+    cl_mem bufX, bufNRM2, scratchBuff;
+    clMath::BlasBase *base;
+    cl_event *events;
+    cl_double deltaForType = 0.0;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T1) == typeid(cl_double) ||
+         typeid(T1) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+
+    blasX      = new T1[lengthX + params->offBX ];
+    blasNRM2   = new T2[1];
+    clblasNRM2 = new T2[1 + params->offa];
+
+    if((blasX == NULL) || (clblasNRM2 == NULL) || (blasNRM2 == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T1>(blasX);
+        deleteBuffers<T2>(blasNRM2,  clblasNRM2);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+    randomVectors<T1>(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE);
+    bufNRM2 = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE);
+    scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * 2 * sizeof(T1)), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xNRM2 routine... ";
+    *blasNRM2  = ::clMath::blas::nrm2( params->N, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufX == NULL) || (bufNRM2 == NULL) || (scratchBuff == NULL)) {
+        releaseMemObjects(bufX, bufNRM2, scratchBuff);
+        deleteBuffers<T1>(blasX);
+        deleteBuffers<T2>(blasNRM2,  clblasNRM2);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xNRM2 routine... ";
+    DataType type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE;
+
+    err = (cl_int)::clMath::clblas::nrm2( type, params->N,  bufNRM2, params->offa, bufX,
+                        params->offBX, params->incx, scratchBuff, params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufNRM2, scratchBuff);
+        deleteBuffers<T1>(blasX);
+        deleteBuffers<T2>(blasNRM2,  clblasNRM2);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::NRM2() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufNRM2, scratchBuff);
+        deleteBuffers<T1>(blasX);
+        deleteBuffers<T2>(blasNRM2,  clblasNRM2);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufNRM2, CL_TRUE, 0,
+            (1 + params->offa) * sizeof(*clblasNRM2), clblasNRM2, 0, NULL, NULL);
+    // A failed read-back leaves clblasNRM2 uninitialised: record a failure
+    // (non-fatal, so the cleanup below still runs) rather than only logging.
+    EXPECT_EQ(CL_SUCCESS, err) << "NRM2: Reading results failed....";
+    releaseMemObjects(bufX, bufNRM2, scratchBuff);
+
+    deltaForType = DELTA_0<T1>();
+    // Since every element of X encounters a division, delta would be sum of deltas for every element in X
+    // X is strided: logical element i is stored at index i * |incx|, the same layout lengthX is sized for.
+    cl_double delta = 0;
+    for(unsigned int i=0; i<(params->N); i++) {
+        delta += deltaForType * returnMax<T1>(blasX[params->offBX + i * abs(params->incx)]);
+    }
+    compareValues<T2>( (blasNRM2), (clblasNRM2+params->offa), delta);
+    deleteBuffers<T1>(blasX);
+    deleteBuffers<T2>(blasNRM2,  clblasNRM2);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(NRM2, snrm2) {
+    // NRM2: float vector, float result.
+    TestParams normParams;
+    getParams(&normParams);
+    nrm2CorrectnessTest<cl_float, cl_float>(&normParams);
+}
+
+TEST_P(NRM2, dnrm2) {
+    // NRM2: double vector, double result.
+    TestParams normParams;
+    getParams(&normParams);
+    nrm2CorrectnessTest<cl_double, cl_double>(&normParams);
+}
+
+TEST_P(NRM2, scnrm2) {
+    // NRM2: single-precision complex vector, float result.
+    TestParams normParams;
+    getParams(&normParams);
+    nrm2CorrectnessTest<FloatComplex, cl_float>(&normParams);
+}
+
+TEST_P(NRM2, dznrm2) {
+    // NRM2: double-precision complex vector, double result.
+    TestParams normParams;
+    getParams(&normParams);
+    nrm2CorrectnessTest<DoubleComplex, cl_double>(&normParams);
+}
diff --git a/src/tests/correctness/corr-rot.cpp b/src/tests/correctness/corr-rot.cpp
new file mode 100644
index 0000000..c9df97a
--- /dev/null
+++ b/src/tests/correctness/corr-rot.cpp
@@ -0,0 +1,234 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <rot.h>
+#include <matrix.h>
+//#include "delta.h"
+
+// Releases the two device buffers used by the ROT test.
+// A NULL handle (buffer that was never created) is skipped.
+static void
+releaseMemObjects(cl_mem bufX, cl_mem bufY)
+{
+    cl_mem handles[] = { bufX, bufY };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] == NULL) {
+            continue;
+        }
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+// Frees the host-side arrays of the ROT test. Every argument must be
+// either NULL or a pointer obtained from new[].
+template <typename T> static void
+deleteBuffers(T *X, T *Y, T *back_X, T *back_Y)
+{
+    // delete[] on a null pointer does nothing, so no NULL guards are needed.
+    delete[] X;
+    delete[] Y;
+    delete[] back_X;
+    delete[] back_Y;
+}
+
+/*
+ * Correctness test for xROT with element type T.
+ *
+ * Fills X and Y with random data, applies the reference ::clMath::blas::rot
+ * to host copies (back_X, back_Y) and ::clMath::clblas::rot to device
+ * buffers holding the same data, then reads the device buffers back and
+ * compares both vectors with compareMatrices.
+ *
+ * The test is reported as skipped (SUCCEED() + return) when T needs double
+ * precision that the device lacks, or when a device buffer cannot be created.
+ *
+ * params: reads N, incx, incy, offa (offset of X), offb (offset of Y),
+ *         alpha, beta, seed and numCommandQueues.
+ */
+template <typename T>
+void
+rotCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *X, *Y, *back_X, *back_Y;
+    T alpha, beta;
+    cl_mem bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    // Double-precision variants are skipped on devices without support.
+    if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision())
+    {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zeroed before use.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Number of elements spanned by N entries at the given stride.
+    size_t lengthx = 1 + (params->N - 1) * abs(params->incx);
+    size_t lengthy = 1 + (params->N - 1) * abs(params->incy);
+
+    // Host arrays include the leading offset so they mirror the device layout.
+    X 	= new T[lengthx + params->offa];
+    Y 	= new T[lengthy + params->offb];
+
+    back_X 	= new T[lengthx + params->offa];
+    back_Y 	= new T[lengthy + params->offb];
+
+    // NOTE(review): plain new[] normally reports failure by throwing
+    // std::bad_alloc rather than returning NULL, so this branch is
+    // presumably unreachable - confirm against the build flags.
+	if((X == NULL) || (Y == NULL) ||
+        (back_X == NULL) || (back_Y == NULL))
+	{
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(X, Y, back_X, back_Y);
+		delete[] events;
+		SUCCEED();
+        return;
+	}
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Fill X and Y (past their offsets) with random values
+    randomVectors(params->N, (X + params->offa), params->incx, (Y+params->offb), params->incy);
+
+    // alpha and beta are the two scalar arguments handed to both rot calls.
+    alpha = convertMultiplier<T>(params->alpha);
+	beta = convertMultiplier<T>(params->beta);
+
+    // Keep identical host copies; the reference routine works on these.
+    memcpy(back_X, X, (lengthx + params->offa) * sizeof(T));
+    memcpy(back_Y, Y, (lengthy + params->offb) * sizeof(T));
+
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Allocate buffers
+    bufX = base->createEnqueueBuffer(X, (lengthx + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    bufY = base->createEnqueueBuffer(Y, (lengthy + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xROT routine... ";
+
+	::clMath::blas::rot(params->N, back_X, params->offa, params->incx, back_Y, params->offb, params->incy,
+                 alpha, beta);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Skip the test if either device buffer could not be created
+
+    if ((bufX == NULL) || (bufY == NULL))
+    {
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers(X, Y, back_X, back_Y);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xROT routine... ";
+
+
+    err = (cl_int)::clMath::clblas::rot( params->N, bufX, params->offa, params->incx, bufY, params->offb, params->incy,
+                              alpha, beta, params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    // err != CL_SUCCESS inside the branch, so ASSERT_EQ records a fatal
+    // failure and returns; all resources are freed before it for that reason.
+    if (err != CL_SUCCESS)
+    {
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers(X, Y, back_X, back_Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROT() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS)
+    {
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers(X, Y,  back_X, back_Y );
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+
+    // Blocking reads (CL_TRUE) of both result vectors; the status codes are OR-ed together.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        (lengthx + params->offa) * sizeof(T), X, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthy + params->offb) * sizeof(T), Y, 0, NULL, NULL);
+
+    // A failed read is only logged; the comparison below still runs.
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "ROT: Reading results failed...." << std::endl;
+	}
+
+    releaseMemObjects(bufX, bufY);
+
+
+    // Each vector is compared as a lengthx-by-1 (resp. lengthy-by-1) matrix.
+    compareMatrices<T>(clblasRowMajor, lengthx , 1, (back_X + params->offa), (X + params->offa), 1);
+    compareMatrices<T>(clblasRowMajor, lengthy , 1, (back_Y + params->offb), (Y + params->offb), 1);
+
+    deleteBuffers<T>(X, Y, back_X, back_Y);
+    delete[] events;
+}
+
+// Instantiate the test
+// srot: cl_float vectors.
+TEST_P(ROT, srot) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotCorrectnessTest<cl_float>(&testArgs);
+}
+
+// drot: cl_double vectors.
+TEST_P(ROT, drot) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotCorrectnessTest<cl_double>(&testArgs);
+}
+
+// csrot: FloatComplex vectors.
+TEST_P(ROT, csrot) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+// zdrot: DoubleComplex vectors.
+TEST_P(ROT, zdrot) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotCorrectnessTest<DoubleComplex>(&testArgs);
+}
+
+
diff --git a/src/tests/correctness/corr-rotg.cpp b/src/tests/correctness/corr-rotg.cpp
new file mode 100644
index 0000000..e26e7cd
--- /dev/null
+++ b/src/tests/correctness/corr-rotg.cpp
@@ -0,0 +1,292 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/***********************************************************************
+**  Copyright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
+***********************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <rotg.h>
+#include <matrix.h>
+#include "delta.h"
+
+// Releases the four device buffers used by the ROTG test.
+// A NULL handle (buffer that was never created) is skipped.
+static void
+releaseMemObjects(cl_mem bufSA, cl_mem bufSB, cl_mem bufC, cl_mem bufS)
+{
+    cl_mem handles[] = { bufSA, bufSB, bufC, bufS };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] == NULL) {
+            continue;
+        }
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+// Frees up to six host-side arrays of the same element type. Every
+// argument must be either NULL or a pointer obtained from new[]; the
+// last four default to NULL so callers may pass as few as two.
+template <typename T> static void
+deleteBuffers(T *A, T *B, T *C = NULL, T *D = NULL, T *E = NULL, T *F = NULL)
+{
+    // delete[] on a null pointer does nothing, so no NULL guards are needed.
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    delete[] D;
+    delete[] E;
+    delete[] F;
+}
+
+/*
+ * Correctness test for xROTG.
+ *
+ * Type T1 is the basic element type (used for SA, SB and S), while T2 is
+ * the type of buffer C; C is not complex for complex types.
+ *
+ * Generates one random SA/SB pair, runs the reference
+ * ::clMath::blas::rotg on host copies (back_*) and ::clMath::clblas::rotg
+ * on device buffers, reads all four device buffers back and compares each
+ * value with compareValues using a tolerance of DELTA_0<T1>() scaled by
+ * returnMax of the reference value.
+ *
+ * The test is reported as skipped (SUCCEED() + return) when T1 needs double
+ * precision that the device lacks, or when a device buffer cannot be created.
+ *
+ * params: reads offBX (offset of SA), offCY (offset of SB), offa (offset
+ *         of C), offb (offset of S), seed and numCommandQueues.
+ */
+template <typename T1, typename T2>
+void
+rotgCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T1 *SA, *SB, *S, *back_SA, *back_SB, *back_S;
+    T2 *C, *back_C;
+    cl_mem bufSA, bufSB, bufC, bufS;
+    clMath::BlasBase *base;
+    cl_event *events;
+    cl_double deltaForType = 0.0;
+
+    base = clMath::BlasBase::getInstance();
+
+    // Double-precision variants are skipped on devices without support.
+    if ((typeid(T1) == typeid(cl_double) ||
+         typeid(T1) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision())
+    {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zeroed before use.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t length = 1;//only one element need to be accessed always
+
+    // Each host array holds its leading offset plus the single element.
+    SA 	= new T1[length + params->offBX ];
+    SB 	= new T1[length + params->offCY ];
+    C   = new T2[length + params->offa ];
+    S   = new T1[length + params->offb ];
+
+    back_SA 	= new T1[length + params->offBX ];
+    back_SB 	= new T1[length + params->offCY ];
+    back_C      = new T2[length + params->offa ];
+    back_S      = new T1[length + params->offb ];
+
+    // NOTE(review): plain new[] normally reports failure by throwing
+    // std::bad_alloc rather than returning NULL, so this branch is
+    // presumably unreachable - confirm against the build flags.
+	if((SA == NULL) || (SB == NULL) || (C == NULL) || (S == NULL) ||
+        (back_SA == NULL) || (back_SB == NULL) || (back_C == NULL) || (back_S == NULL))
+	{
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
+        deleteBuffers<T2>(C, back_C);
+		delete[] events;
+		SUCCEED();
+        return;
+	}
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    //Filling random values for SA and SB. C & S are only for output sake
+    randomVectors(1, (SA+params->offBX), 1, (SB+params->offCY), 1);
+    S[params->offb] =  back_S[params->offb] = ZERO<T1>();
+    C[params->offa] = back_C[params->offa] = ZERO<T2>();
+
+    // Only the element at the offset is copied into the reference arrays.
+    back_SA[params->offBX] = SA[params->offBX];
+    back_SB[params->offCY] = SB[params->offCY];
+    ::std::cerr << "Done" << ::std::endl;
+
+    //printing the inputs, as they change after processing
+    ::std::cerr << "A = ";
+    printElement<T1>(SA[params->offBX]);
+    ::std::cerr << "\tB = ";
+    printElement<T1>(SB[params->offCY]);
+    ::std::cerr << "\tC = ";
+    printElement<T2>(C[params->offa]);
+    ::std::cerr << "\tS = ";
+    printElement<T1>(S[params->offb]);
+    // NOTE(review): the labels above go to std::cerr but the line break goes
+    // to std::cout - confirm whether that is intended.
+    ::std::cout << std::endl << std::endl;
+
+	// Allocate buffers
+    bufSA = base->createEnqueueBuffer(SA, (length + params->offBX) * sizeof(T1), 0, CL_MEM_READ_WRITE);
+    bufSB = base->createEnqueueBuffer(SB, (length + params->offCY) * sizeof(T1), 0, CL_MEM_READ_WRITE);
+    bufC  = base->createEnqueueBuffer(C,  (length + params->offa ) * sizeof(T2), 0, CL_MEM_WRITE_ONLY);
+    bufS  = base->createEnqueueBuffer(S,  (length + params->offb ) * sizeof(T1), 0, CL_MEM_WRITE_ONLY);
+
+    ::std::cerr << "Calling reference xROTG routine... ";
+
+	::clMath::blas::rotg(back_SA, params->offBX, back_SB, params->offCY, back_C, params->offa, back_S, params->offb);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Skip the test if any device buffer could not be created
+
+    if ((bufSA == NULL) || (bufSB == NULL) || (bufC == NULL) || (bufS == NULL))
+    {
+        releaseMemObjects(bufSA, bufSB, bufC, bufS);
+        deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
+        deleteBuffers<T2>(C, back_C);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xROTG routine... ";
+
+    // Map T1 to the DataType tag the wrapper expects; anything that is not
+    // cl_float, cl_double or cl_float2 is treated as TYPE_COMPLEX_DOUBLE.
+    // NOTE(review): presumably FloatComplex is the same type as cl_float2 - confirm.
+    DataType type;
+    type = ( typeid(T1) == typeid(cl_float)) ? TYPE_FLOAT :
+           ( typeid(T1) == typeid(cl_double)) ? TYPE_DOUBLE:
+           ( typeid(T1) == typeid(cl_float2)) ? TYPE_COMPLEX_FLOAT:
+            TYPE_COMPLEX_DOUBLE;
+
+    err = (cl_int)::clMath::clblas::rotg( type, bufSA, params->offBX, bufSB, params->offCY,
+                                         bufC, params->offa, bufS, params->offb,
+                                         params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    // err != CL_SUCCESS inside the branch, so ASSERT_EQ records a fatal
+    // failure and returns; all resources are freed before it for that reason.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufSA, bufSB, bufC, bufS);
+        deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
+        deleteBuffers<T2>(C, back_C);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROTG() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufSA, bufSB, bufC, bufS);
+        deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
+        deleteBuffers<T2>(C, back_C);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+
+    // Blocking reads (CL_TRUE) of all four buffers; the status codes are OR-ed together.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufSA, CL_TRUE, 0,
+        (length + params->offBX) * sizeof(T1), SA, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufSB, CL_TRUE, 0,
+        (length + params->offCY) * sizeof(T1), SB, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0,
+        (length + params->offa) * sizeof(T2), C, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufS, CL_TRUE, 0,
+        (length + params->offb) * sizeof(T1), S, 0, NULL, NULL);
+
+    // A failed read is only logged; the comparisons below still run.
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "ROTG: Reading results failed...." << std::endl;
+	}
+
+    releaseMemObjects(bufSA, bufSB, bufC, bufS);
+
+    // Per-value tolerance: the type's base delta scaled by the reference value.
+    deltaForType = DELTA_0<T1>();
+    cl_double delta;
+
+    delta = deltaForType * returnMax<T1>(back_SA[params->offBX]);
+    compareValues<T1>( (back_SA + params->offBX), (SA + params->offBX), delta);
+
+    delta = deltaForType * returnMax<T1>(back_SB[params->offCY]);
+    compareValues<T1>( (back_SB + params->offCY), (SB + params->offCY), delta);
+
+    delta = deltaForType * returnMax<T2>(back_C[params->offa]);
+    compareValues<T2>( (back_C + params->offa), (C + params->offa), delta);
+
+    delta = deltaForType * returnMax<T1>(back_S[params->offb]);
+    compareValues<T1>( (back_S + params->offb), (S + params->offb), delta);
+
+    deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
+    deleteBuffers<T2>(C, back_C);
+    delete[] events;
+}
+
+// Instantiate the test
+
+// srotg: T1 = cl_float, C buffer type T2 = cl_float.
+TEST_P(ROTG, srotg)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotgCorrectnessTest<cl_float, cl_float>(&testArgs);
+}
+
+// drotg: T1 = cl_double, C buffer type T2 = cl_double.
+TEST_P(ROTG, drotg)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotgCorrectnessTest<cl_double, cl_double>(&testArgs);
+}
+
+// crotg: T1 = FloatComplex, C buffer type T2 = cl_float.
+TEST_P(ROTG, crotg)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotgCorrectnessTest<FloatComplex, cl_float>(&testArgs);
+}
+
+// zrotg: T1 = DoubleComplex, C buffer type T2 = cl_double.
+TEST_P(ROTG, zrotg)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotgCorrectnessTest<DoubleComplex, cl_double>(&testArgs);
+}
+
diff --git a/src/tests/correctness/corr-rotm.cpp b/src/tests/correctness/corr-rotm.cpp
new file mode 100644
index 0000000..4a1a02e
--- /dev/null
+++ b/src/tests/correctness/corr-rotm.cpp
@@ -0,0 +1,232 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <rotm.h>
+#include <matrix.h>
+
+// Releases the three device buffers used by the ROTM test.
+// A NULL handle (buffer that was never created) is skipped.
+static void
+releaseMemObjects(cl_mem bufX, cl_mem bufY, cl_mem bufParam)
+{
+    cl_mem handles[] = { bufX, bufY, bufParam };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] == NULL) {
+            continue;
+        }
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+// Frees the host-side arrays of the ROTM test. Every argument must be
+// either NULL or a pointer obtained from new[].
+template <typename T> static void
+deleteBuffers(T *X, T *Y, T *PARAM, T *back_X, T *back_Y, T *back_PARAM)
+{
+    // delete[] on a null pointer does nothing, so no NULL guards are needed.
+    delete[] X;
+    delete[] Y;
+    delete[] PARAM;
+    delete[] back_X;
+    delete[] back_Y;
+    delete[] back_PARAM;
+}
+
+/*
+ * Correctness test for xROTM with element type T.
+ *
+ * Fills X, Y and the 5-element PARAM array, applies the reference
+ * ::clMath::blas::rotm to host copies (back_*) and ::clMath::clblas::rotm
+ * to device buffers holding the same data, then reads X and Y back from
+ * the device and compares them with compareMatrices. PARAM is not read back.
+ *
+ * The test is reported as skipped (SUCCEED() + return) when T is cl_double
+ * and the device lacks double precision, or when a device buffer cannot be
+ * created.
+ *
+ * params: reads N, incx, incy, offa (offset of X), offb (offset of Y),
+ *         offc (offset of PARAM), alpha (first PARAM element), seed and
+ *         numCommandQueues.
+ */
+template <typename T>
+void
+rotmCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *X, *Y, *back_X, *back_Y;
+    T *PARAM, *back_PARAM;
+    T sflagParam;
+    cl_mem bufX, bufY, bufParam;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    // The double-precision variant is skipped on devices without support.
+    if ((typeid(T) == typeid(cl_double)) &&
+        !base->isDevSupportDoublePrecision())
+    {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zeroed before use.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Number of elements spanned by N entries at the given stride.
+    size_t lengthx = 1 + (params->N - 1) * abs(params->incx);
+    size_t lengthy = 1 + (params->N - 1) * abs(params->incy);
+
+    // Host arrays include the leading offset so they mirror the device layout.
+    X 	= new T[lengthx + params->offa];
+    Y 	= new T[lengthy + params->offb];
+    PARAM   = new T[5 + params->offc]; //params always has 5 elements
+
+    back_X 	= new T[lengthx + params->offa];
+    back_Y 	= new T[lengthy + params->offb];
+    back_PARAM   = new T[5 + params->offc]; //params always has 5 elements
+
+    // NOTE(review): plain new[] normally reports failure by throwing
+    // std::bad_alloc rather than returning NULL, so this branch is
+    // presumably unreachable - confirm against the build flags.
+	if((X == NULL) || (Y == NULL) || (PARAM == NULL) ||
+        (back_X == NULL) || (back_Y == NULL) || (back_PARAM == NULL))
+	{
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(X, Y, PARAM, back_X, back_Y, back_PARAM);
+		delete[] events;
+		SUCCEED();
+        return;
+	}
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Random X and Y, plus random values for PARAM elements 1..4.
+    randomVectors(params->N, (X + params->offa), params->incx, (Y+params->offb), params->incy);
+    randomVectors(4, (PARAM + params->offc + 1), 1); //1st element is initialized separately
+
+    // PARAM element 0 is taken from the test's alpha value.
+    sflagParam = convertMultiplier<T>(params->alpha);
+    PARAM[params->offc] = sflagParam; // initializing first element
+
+    // Keep identical host copies; the reference routine works on these.
+    memcpy(back_X, X, (lengthx + params->offa)*sizeof(T));
+    memcpy(back_Y, Y, (lengthy + params->offb)*sizeof(T));
+    memcpy(back_PARAM, PARAM, (params->offc + 5)*sizeof(T));
+
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Allocate buffers
+    bufX = base->createEnqueueBuffer(X, (lengthx + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    bufY = base->createEnqueueBuffer(Y, (lengthy + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    bufParam  = base->createEnqueueBuffer(PARAM,  (5 + params->offc) * sizeof(T), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xROTM routine... ";
+
+	::clMath::blas::rotm(params->N, back_X, params->offa, params->incx, back_Y, params->offb, params->incy,
+                 back_PARAM, params->offc);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Skip the test if any device buffer could not be created.
+    if ((bufX == NULL) || (bufY == NULL) || (bufParam == NULL))
+    {
+        releaseMemObjects(bufX, bufY, bufParam);
+        deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xROTM routine... ";
+
+    // Any T other than cl_float is tagged TYPE_DOUBLE.
+    DataType type;
+    type = ( typeid(T) == typeid(cl_float)) ? TYPE_FLOAT :
+            TYPE_DOUBLE;
+
+    err = (cl_int)::clMath::clblas::rotm( type, params->N, bufX, params->offa, params->incx, bufY, params->offb, params->incy,
+                              bufParam, params->offc, params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    // err != CL_SUCCESS inside the branch, so ASSERT_EQ records a fatal
+    // failure and returns; all resources are freed before it for that reason.
+    if (err != CL_SUCCESS)
+    {
+        releaseMemObjects(bufX, bufY, bufParam);
+        deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROTM() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS)
+    {
+        releaseMemObjects(bufX, bufY, bufParam);
+        deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+
+    // Blocking reads (CL_TRUE) of both result vectors; the status codes are OR-ed together.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        (lengthx + params->offa) * sizeof(T), X, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthy + params->offb) * sizeof(T), Y, 0, NULL, NULL);
+
+    // A failed read is only logged; the comparison below still runs.
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "ROTM: Reading results failed...." << std::endl;
+	}
+
+    releaseMemObjects(bufX, bufY, bufParam);
+
+    // Each vector is compared as a single column of lengthx (resp. lengthy) rows.
+    compareMatrices<T>(clblasColumnMajor, lengthx , 1, (back_X + params->offa), (X + params->offa), lengthx);
+    compareMatrices<T>(clblasColumnMajor, lengthy , 1, (back_Y + params->offb), (Y + params->offb), lengthy);
+
+    deleteBuffers<T>(X, Y, PARAM, back_X, back_Y, back_PARAM);
+    delete[] events;
+}
+
+// Instantiate the test
+// srotm: cl_float vectors and PARAM.
+TEST_P(ROTM, srotm) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotmCorrectnessTest<cl_float>(&testArgs);
+}
+
+// drotm: cl_double vectors and PARAM.
+TEST_P(ROTM, drotm) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotmCorrectnessTest<cl_double>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-rotmg.cpp b/src/tests/correctness/corr-rotmg.cpp
new file mode 100644
index 0000000..851310c
--- /dev/null
+++ b/src/tests/correctness/corr-rotmg.cpp
@@ -0,0 +1,283 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <rotmg.h>
+#include <matrix.h>
+#include "delta.h"
+
+// Releases the five device buffers used by the ROTMG test.
+// A NULL handle (buffer that was never created) is skipped.
+static void
+releaseMemObjects(cl_mem bufD1, cl_mem bufD2, cl_mem bufX, cl_mem bufY, cl_mem bufParam)
+{
+    cl_mem handles[] = { bufD1, bufD2, bufX, bufY, bufParam };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] == NULL) {
+            continue;
+        }
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+// Frees five host-side arrays of the ROTMG test. Every argument must be
+// either NULL or a pointer obtained from new[].
+template <typename T> static void
+deleteBuffers(T *D1, T *D2, T *X, T *Y, T *PARAM)
+{
+    // delete[] on a null pointer does nothing, so no NULL guards are needed.
+    delete[] D1;
+    delete[] D2;
+    delete[] X;
+    delete[] Y;
+    delete[] PARAM;
+}
+
+/*
+ * Correctness test for xROTMG with element type T.
+ *
+ * Generates D1, D2, X, Y and the 5-element PARAM array, applies the
+ * reference ::clMath::blas::rotmg to host copies (back_*) and
+ * ::clMath::clblas::rotmg to device buffers holding the same data, reads
+ * all five device buffers back and compares them against the reference
+ * using a tolerance of DELTA_0<T>() scaled by returnMax of each reference
+ * value. The scalar comparisons are compiled out when CORR_TEST_WITH_ACML
+ * is defined; PARAM is always compared.
+ *
+ * The test is reported as skipped (SUCCEED() + return) when T is cl_double
+ * and the device lacks double precision, or when a device buffer cannot be
+ * created.
+ *
+ * params: reads offa (offset of D1), offb (offset of D2), offBX (offset
+ *         of X), offCY (offset of Y), offc (offset of PARAM), alpha (first
+ *         PARAM element), seed and numCommandQueues.
+ */
+template <typename T>
+void
+rotmgCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *D1, *D2, *X, *Y, *PARAM;
+    T *back_D1, *back_D2, *back_X, *back_Y, *back_PARAM;
+    T sflagParam;
+    cl_mem bufD1, bufD2, bufX, bufY, bufParam;
+    clMath::BlasBase *base;
+    cl_event *events;
+    cl_double deltaForType = 0.0;
+
+    base = clMath::BlasBase::getInstance();
+
+    // The double-precision variant is skipped on devices without support.
+    if ((typeid(T) == typeid(cl_double)) &&
+        !base->isDevSupportDoublePrecision())
+    {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zeroed before use.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Each host array holds its leading offset plus the payload
+    // (one element, or five for PARAM).
+    X       = new T[1 + params->offBX];
+    Y       = new T[1 + params->offCY];
+    D1      = new T[1 + params->offa];
+    D2      = new T[1 + params->offb];
+    PARAM   = new T[5 + params->offc]; //params always has 5 elements
+
+    back_X      = new T[1 + params->offBX];
+    back_Y      = new T[1 + params->offCY];
+    back_D1     = new T[1 + params->offa];
+    back_D2     = new T[1 + params->offb];
+    back_PARAM  = new T[5 + params->offc]; //params always has 5 elements
+
+    // NOTE(review): plain new[] normally reports failure by throwing
+    // std::bad_alloc rather than returning NULL, so this branch is
+    // presumably unreachable - confirm against the build flags.
+    if((D1 == NULL) || (D2 == NULL) || (X == NULL) || (Y == NULL) || (PARAM == NULL) ||
+        (back_D1 == NULL) || (back_D2 == NULL) ||(back_X == NULL) || (back_Y == NULL) || (back_PARAM == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(D1, D2, X, Y, PARAM);
+        deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Fill D1, D2, X, Y and PARAM (past their offsets) with test values.
+    randomRotmg( (D1 + params->offa), (D2 + params->offb),
+                (X + params->offBX), (Y + params->offCY), (PARAM + params->offc) );
+
+    // PARAM element 0 is overwritten with the test's alpha value.
+    sflagParam = convertMultiplier<T>(params->alpha);
+    PARAM[params->offc] = sflagParam; // initializing first element
+
+    // Keep identical host copies; the reference routine works on these.
+    memcpy(back_X, X, (1 + params->offBX)*sizeof(T));
+    memcpy(back_Y, Y, (1 + params->offCY)*sizeof(T));
+    memcpy(back_D1, D1, (1 + params->offa)*sizeof(T));
+    memcpy(back_D2, D2, (1 + params->offb)*sizeof(T));
+    memcpy(back_PARAM, PARAM, (params->offc + 5)*sizeof(T));
+
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufD1 = base->createEnqueueBuffer(D1, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    bufD2 = base->createEnqueueBuffer(D2, (1 + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (1 + params->offBX) * sizeof(T), 0, CL_MEM_READ_WRITE);
+    bufY = base->createEnqueueBuffer(Y, (1 + params->offCY) * sizeof(T), 0, CL_MEM_READ_ONLY);
+    bufParam  = base->createEnqueueBuffer(PARAM,  (5 + params->offc) * sizeof(T), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xROTMG routine... ";
+
+    ::clMath::blas::rotmg(back_D1, params->offa, back_D2, params->offb, back_X, params->offBX, back_Y, params->offCY,
+                 back_PARAM, params->offc);
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Skip the test if any device buffer could not be created.
+    if ((bufD1 == NULL) || (bufD2 == NULL) || (bufX == NULL) || (bufY == NULL) || (bufParam == NULL))
+    {
+        releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam);
+        deleteBuffers<T>(D1, D2, X, Y, PARAM);
+        deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xROTMG routine... ";
+
+    // Any T other than cl_float is tagged TYPE_DOUBLE.
+    DataType type;
+    type = ( typeid(T) == typeid(cl_float)) ? TYPE_FLOAT :
+            TYPE_DOUBLE;
+
+    err = (cl_int)::clMath::clblas::rotmg(  type, bufD1, params->offa, bufD2, params->offb, bufX, params->offBX,
+                                            bufY, params->offCY, bufParam, params->offc,
+                                            params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    // err != CL_SUCCESS inside the branch, so ASSERT_EQ records a fatal
+    // failure and returns; all resources are freed before it for that reason.
+    if (err != CL_SUCCESS)
+    {
+        releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam);
+        deleteBuffers<T>(D1, D2, X, Y, PARAM);
+        deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROTMG() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS)
+    {
+        releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam);
+        deleteBuffers<T>(D1, D2, X, Y, PARAM);
+        deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking reads (CL_TRUE) of all five buffers. Every status after the
+    // first is OR-ed into err so that a failure of any read is still
+    // visible in the check below (plain assignment would discard it).
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufD1, CL_TRUE, 0,
+        (1 + params->offa) * sizeof(T), D1, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufD2, CL_TRUE, 0,
+        (1 + params->offb) * sizeof(T), D2, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        (1 + params->offBX) * sizeof(T), X, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (1 + params->offCY) * sizeof(T), Y, 0, NULL, NULL);
+
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufParam, CL_TRUE, 0,
+        (5 + params->offc) * sizeof(T), PARAM, 0, NULL, NULL);
+
+    // A failed read is only logged; the comparisons below still run.
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "ROTMG: Reading results failed...." << std::endl;
+    }
+
+    releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam);
+
+    deltaForType = DELTA_0<T>();
+
+    #ifndef CORR_TEST_WITH_ACML
+    // Acml doesn't store answer in D1, D2 and X1. So skipping those checks
+    // (the Y check sits inside the same guard and is skipped with them).
+        cl_double delta;
+        delta = deltaForType * returnMax<T>(back_D1[params->offa]);
+        compareValues<T>( (back_D1 + params->offa), (D1 + params->offa), delta);
+
+        delta = deltaForType * returnMax<T>(back_D2[params->offb]);
+        compareValues<T>( (back_D2 + params->offb), (D2 + params->offb), delta);
+
+        delta = deltaForType * returnMax<T>(back_X[params->offBX]);
+        compareValues<T>( (back_X + params->offBX), (X + params->offBX), delta);
+
+        delta = deltaForType * returnMax<T>(back_Y[params->offCY]);
+        compareValues<T>( (back_Y + params->offCY), (Y + params->offCY), delta);
+    #endif
+
+    // Creating delta array for PARAM array: one tolerance per element.
+    cl_double deltaArr[5];
+    for(int i=0; i<5; i++) {
+        deltaArr[i] = deltaForType * returnMax<T>(back_PARAM[i + (params->offc)]);
+    }
+    compareMatrices<T>(clblasColumnMajor, 5 , 1, (back_PARAM + params->offc), (PARAM + params->offc), 5, deltaArr);
+
+    deleteBuffers<T>(D1, D2, X, Y, PARAM);
+    deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM);
+
+    delete[] events;
+}
+
+// Instantiate the test
+// srotmg: cl_float scalars and PARAM.
+TEST_P(ROTMG, srotmg) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotmgCorrectnessTest<cl_float>(&testArgs);
+}
+
+// drotmg: cl_double scalars and PARAM.
+TEST_P(ROTMG, drotmg) {
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    rotmgCorrectnessTest<cl_double>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-sbmv.cpp b/src/tests/correctness/corr-sbmv.cpp
new file mode 100644
index 0000000..b17f669
--- /dev/null
+++ b/src/tests/correctness/corr-sbmv.cpp
@@ -0,0 +1,224 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <sbmv.h>
+#include <gbmv.h>
+
+// Releases the OpenCL memory objects created for one SBMV test run.
+// NULL handles are skipped; the rest are released in the order A, X, Y.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    const cl_mem handles[] = { objA, objX, objY };
+    const size_t count = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (handles[i] == NULL) {
+            continue;
+        }
+        // The status returned by clReleaseMemObject is not inspected.
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+// Frees the host-side arrays of one SBMV test run.
+//
+//   A       - matrix data
+//   X       - input vector
+//   blasY   - vector passed to the reference BLAS call
+//   clblasY - vector holding the clblas SBMV call results
+//
+// Every pointer is released with delete[] and must therefore come from
+// new[] or be NULL.
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *blasY, T *clblasY)
+{
+    // delete[] on a null pointer is a no-op, so the pointers are handed
+    // to it directly without explicit NULL guards.
+    delete[] A;
+    delete[] X;
+
+    delete[] blasY;
+    delete[] clblasY;
+}
+
+template <typename T> // Correctness check of clblas xSBMV against the reference BLAS xSBMV for element type T.
+void
+sbmvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *X, *blasY, *clblasY;
+    cl_mem bufA, bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+	T alpha, beta;
+	size_t lengthX, lengthY, lengthA;
+    // Sizes, offsets, increments, multipliers, the seed and the queue count are all read from *params.
+    base = clMath::BlasBase::getInstance();
+    // The cl_double instantiation is skipped when the device reports no double-precision support.
+    if (((typeid(T) == typeid(cl_double))) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One zero-initialised event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Element counts without offsets: N * lda for the matrix, (N - 1) * |inc| + 1 for each vector.
+    lengthA =  params->N  * params->lda;
+    lengthX = (params->N - 1)*abs(params->incx) + 1;
+    lengthY = (params->N - 1)*abs(params->incy) + 1;
+    // Each host array is allocated with its offset added to the element count.
+    A 	= new T[ lengthA + params->offA ];
+    X 	= new T[ lengthX + params->offBX ];
+    blasY  		= new T[ lengthY + params->offCY ];
+	clblasY 	= new T[ lengthY + params->offCY ];
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+    // NOTE(review): plain new[] throws std::bad_alloc rather than returning NULL, so this branch is presumably unreachable.
+	if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL))
+	{
+		deleteBuffers<T>(A, X, blasY, clblasY);
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+	}
+
+	alpha = convertMultiplier<T>(params->alpha);
+	beta = convertMultiplier<T>(params->beta);
+    // Input data comes from the GBMV random generator, called with params->N for both size arguments.
+    randomGbmvMatrices(params->order, clblasNoTrans, params->N, params->N, &alpha, &beta,
+                        (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy );
+    // Copy blasY to clblasY
+    memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY));
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Allocate buffers
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+    bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xSBMV routine... ";
+    // The reference routine is always called column-major; for any other order the uplo flag is flipped instead.
+	clblasOrder fOrder;
+	clblasUplo fUplo;
+	fOrder = params->order;
+	fUplo = params->uplo;
+	size_t fN = params->N, fK = params->K;
+
+	if (fOrder != clblasColumnMajor)
+    {
+        fOrder = clblasColumnMajor;
+        fUplo = (params->uplo == clblasLower)? clblasUpper : clblasLower;
+        fN = params->N;
+   	}
+
+	clMath::blas::sbmv(fOrder, fUplo, fN, fK, alpha, A, params->offA, params->lda,
+							X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        // Skip the test, the most probable reason is
+        //     matrix too big for a device.
+
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSBMV routine... ";
+
+    err = (cl_int)clMath::clblas::sbmv(params->order, params->uplo, params->N, params->K,
+                                        alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx,
+                                        beta, bufY, params->offCY, params->incy,
+                                        params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+    // Resources are freed before ASSERT_EQ because a failing fatal assertion leaves the function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SBMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(A, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the whole Y buffer, offset region included, back into clblasY.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0,
+        NULL, NULL);
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "SBMV: Reading results failed...." << std::endl;
+	}
+    // NOTE(review): a failed read is only logged; the comparison below still runs.
+    releaseMemObjects(bufA, bufX, bufY);
+    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY),
+                       lengthY);
+    deleteBuffers<T>(A, X, blasY, clblasY);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(SBMV, ssbmv) {
+    // Runs the SBMV correctness check instantiated for cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    sbmvCorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(SBMV, dsbmv) {
+    // Runs the SBMV correctness check instantiated for cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    sbmvCorrectnessTest<cl_double>(&testArgs);
+}
+
diff --git a/src/tests/correctness/corr-scal.cpp b/src/tests/correctness/corr-scal.cpp
new file mode 100644
index 0000000..ad156f6
--- /dev/null
+++ b/src/tests/correctness/corr-scal.cpp
@@ -0,0 +1,215 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <scal.h>
+
+// Releases the device buffer of one SCAL test run.
+// A NULL handle is ignored.
+static void
+releaseMemObjects(cl_mem objX)
+{
+    if (objX == NULL) { return; }
+    clReleaseMemObject(objX);
+}
+
+// Frees the host-side vectors of one SCAL test run.
+//   blasX   - vector passed to the reference BLAS call
+//   clblasX - vector used for the clblas call
+// Every pointer must come from new[] or be NULL.
+template <typename T> static void
+deleteBuffers(T *blasX, T *clblasX)
+{
+    // delete[] on a null pointer is a no-op, so no NULL guards are
+    // needed here.
+    delete[] blasX;
+    delete[] clblasX;
+}
+
+template <typename T> // Correctness check of clblas xSCAL against the reference BLAS xSCAL for element type T.
+void scalCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *blasX, *clblasX;
+    cl_mem bufX;
+    clMath::BlasBase *base;
+    cl_event *events;
+    T alpha;
+
+    base = clMath::BlasBase::getInstance();
+    // Double-based instantiations are skipped when the device reports no double-precision support.
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One zero-initialised event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // lengthX is the element count without the offset; params->K == 1 selects the csscal/zdscal variant.
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    bool is_css_zds = (params->K == 1)? true: false;        // K indicates csscal/zdscal
+
+    blasX = new T[lengthX + params->offBX ];
+    clblasX = new T[lengthX + params->offBX ];
+    // NOTE(review): plain new[] throws std::bad_alloc rather than returning NULL, so this branch is presumably unreachable.
+	if( (blasX == NULL) || (clblasX == NULL) )
+	{
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(blasX, clblasX);
+		delete[] events;
+		SUCCEED();
+        return;
+	}
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    randomVectors(params->N, (blasX+params->offBX), params->incx);
+    alpha = convertMultiplier<T>(params->alpha);
+    memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX));
+    ::std::cerr << "Done" << ::std::endl;
+    bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xSCAL routine... ";
+    // Both blas and clBlas wrapper functions consider the real part of alpha in case of css/zdscal
+    // This is to make sure both get the same scalar alpha. check wrapper functions
+    ::clMath::blas::scal(is_css_zds, params->N, alpha, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if (bufX == NULL) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufX);
+        deleteBuffers<T>(blasX, clblasX);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSCAL routine... ";
+    // Both blas and clBlas wrapper functions consider the real part of alpha in case of css/zdscal
+    // This is to make sure both get the same scalar alpha. check wrapper functions
+    err = (cl_int)::clMath::clblas::scal(is_css_zds, params->N, alpha, bufX, params->offBX,
+                    params->incx, params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+    // Resources are freed before ASSERT_EQ because a failing fatal assertion leaves the function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX);
+        deleteBuffers<T>(blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SCAL() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX);
+        deleteBuffers<T>(blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the whole X buffer, offset region included, back into clblasX.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+                   (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, NULL, NULL);
+	if (err != CL_SUCCESS) {
+		::std::cerr << "SCAL: Reading results failed...." << std::endl;
+	}
+    // NOTE(review): a failed read is only logged; the comparison below still runs.
+    releaseMemObjects(bufX);
+
+    compareMatrices<T>(clblasColumnMajor, lengthX , 1, (blasX + params->offBX),
+                        (clblasX + params->offBX), lengthX);
+    deleteBuffers<T>(blasX, clblasX);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(SCAL, sscal) {
+    // Runs the SCAL correctness check instantiated for cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    testArgs.K = 0;     // K == 1 would select the csscal/zdscal variant
+    scalCorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(SCAL, dscal) {
+    // Runs the SCAL correctness check instantiated for cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    testArgs.K = 0;     // K == 1 would select the csscal/zdscal variant
+    scalCorrectnessTest<cl_double>(&testArgs);
+}
+
+TEST_P(SCAL, cscal) {
+    // Runs the SCAL correctness check instantiated for FloatComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    testArgs.K = 0;     // K == 1 would select the csscal/zdscal variant
+    scalCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+TEST_P(SCAL, zscal) {
+    // Runs the SCAL correctness check instantiated for DoubleComplex.
+    TestParams testArgs;
+    getParams(&testArgs);
+    testArgs.K = 0;     // K == 1 would select the csscal/zdscal variant
+    scalCorrectnessTest<DoubleComplex>(&testArgs);
+}
+
+
+// For these 2 routines (csscal/zdscal) alpha is a real scalar
+TEST_P(SCAL, csscal) {
+    // Runs the SCAL correctness check for FloatComplex in csscal mode.
+    TestParams testArgs;
+    getParams(&testArgs);
+    testArgs.K = 1;     // K == 1 selects the csscal/zdscal variant
+    scalCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+TEST_P(SCAL, zdscal) {
+    // Runs the SCAL correctness check for DoubleComplex in zdscal mode.
+    TestParams testArgs;
+    getParams(&testArgs);
+    testArgs.K = 1;     // K == 1 selects the csscal/zdscal variant
+    scalCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-spmv.cpp b/src/tests/correctness/corr-spmv.cpp
new file mode 100644
index 0000000..dcbad3b
--- /dev/null
+++ b/src/tests/correctness/corr-spmv.cpp
@@ -0,0 +1,220 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <spmv.h>
+
+// Releases the OpenCL memory objects created for one SPMV test run.
+// NULL handles are skipped; the rest are released in the order A, X, Y.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    const cl_mem handles[] = { objA, objX, objY };
+    const size_t count = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (handles[i] == NULL) {
+            continue;
+        }
+        // The status returned by clReleaseMemObject is not inspected.
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+// Frees the host-side arrays of one SPMV test run.
+//
+//   A       - packed matrix data
+//   X       - input vector
+//   blasY   - vector passed to the reference BLAS call
+//   clblasY - vector holding the clblas SPMV call results
+//
+// Every pointer is released with delete[] and must therefore come from
+// new[] or be NULL.
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *blasY, T *clblasY)
+{
+    // delete[] on a null pointer is a no-op, so the pointers are handed
+    // to it directly without explicit NULL guards.
+    delete[] A;
+    delete[] X;
+
+    delete[] blasY;
+    delete[] clblasY;
+}
+
+template <typename T> // Correctness check of clblas xSPMV against the reference BLAS xSPMV for element type T.
+void
+spmvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *AP, *X, *blasY, *clblasY;
+    cl_mem bufAP, bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+	T alpha, beta;
+    // Sizes, offsets, increments, multipliers, the seed and the queue count are all read from *params.
+    base = clMath::BlasBase::getInstance();
+    // The cl_double instantiation is skipped when the device reports no double-precision support.
+    if ((typeid(T) == typeid(cl_double)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One zero-initialised event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Element counts without offsets: N * (N + 1) / 2 for the packed matrix, 1 + (N - 1) * |inc| per vector.
+    size_t lengthA = (params->N * (params->N + 1)) / 2;
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+	size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));
+
+    AP 	= new T[lengthA + params->offA ];
+    X 	= new T[lengthX + params->offBX ];
+    blasY  		= new T[lengthY + params->offCY ];
+	clblasY 	= new T[lengthY + params->offCY ];
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+    // NOTE(review): plain new[] throws std::bad_alloc rather than returning NULL, so this branch is presumably unreachable.
+	if((AP == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL))
+	{
+		deleteBuffers<T>(AP, X, blasY, clblasY);
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+	}
+
+	alpha = convertMultiplier<T>(params->alpha);
+	beta = convertMultiplier<T>(params->beta);
+
+    randomSpmvMatrices(params->order, params->uplo, params->N, true, &alpha, (AP + params->offA),
+						(X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy);
+    // Copy blasY to clblasY
+    memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY));
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Allocate buffers
+    bufAP = base->createEnqueueBuffer(AP, (lengthA + params->offA)* sizeof(*AP), 0, CL_MEM_READ_ONLY);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+    bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xSPMV routine... ";
+    // The reference routine is always called column-major; for any other order the uplo flag is flipped instead.
+	clblasOrder order;
+    clblasUplo fUplo;
+
+	order = params->order;
+    fUplo = params->uplo;
+
+	if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+    }
+	::clMath::blas::spmv( order, fUplo, params->N, alpha, AP, params->offA, X, params->offBX, params->incx,
+						beta, blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        // Skip the test, the most probable reason is
+        //     matrix too big for a device.
+
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(AP, X, blasY, clblasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSPMV routine... ";
+
+    err = (cl_int)::clMath::clblas::spmv(params->order, params->uplo, params->N, alpha, bufAP,
+    					params->offA, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy,
+						params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+    // Resources are freed before ASSERT_EQ because a failing fatal assertion leaves the function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(AP, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(AP, X, blasY, clblasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the whole Y buffer, offset region included, back into clblasY.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0,
+        NULL, NULL);
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "SPMV: Reading results failed...." << std::endl;
+	}
+    // NOTE(review): a failed read is only logged; the comparison below still runs.
+    releaseMemObjects(bufAP, bufX, bufY);
+
+    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY),
+                       lengthY);
+    deleteBuffers<T>(AP, X, blasY, clblasY);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(SPMV, sspmv) {
+    // Runs the SPMV correctness check instantiated for cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    spmvCorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(SPMV, dspmv) {
+    // Runs the SPMV correctness check instantiated for cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    spmvCorrectnessTest<cl_double>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-spr.cpp b/src/tests/correctness/corr-spr.cpp
new file mode 100644
index 0000000..4b00a02
--- /dev/null
+++ b/src/tests/correctness/corr-spr.cpp
@@ -0,0 +1,228 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <spr.h>
+
+// Releases the OpenCL memory objects created for one SPR test run.
+// NULL handles are skipped; objAP is released before objX.
+static void
+releaseMemObjects(cl_mem objAP, cl_mem objX)
+{
+    const cl_mem handles[] = { objAP, objX };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] != NULL) {
+            clReleaseMemObject(handles[i]);
+        }
+    }
+}
+
+// Frees the host-side arrays of one SPR test run.
+//
+//   blasAP   - packed matrix passed to the reference BLAS call
+//   clblasAP - packed matrix used for the clblas call
+//   X        - input vector
+//
+// Every pointer must come from new[] or be NULL.
+template <typename T> static void
+deleteBuffers(T *blasAP, T *clblasAP, T *X)
+{
+    // delete[] on a null pointer is a no-op, so no NULL guards are
+    // needed here.
+    delete[] blasAP;
+    delete[] clblasAP;
+    delete[] X;
+}
+
+// Correctness test for xSPR on element type T: fills the packed matrix and the
+// vector with random data, runs the reference BLAS spr() on the host and the
+// clblas spr() on the device, and hands both results to compareMatrices().
+// Sizes, offsets, increments, alpha, seed and queue count come from *params.
+template <typename T>
+void
+sprCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *blasAP, *clblasAP, *X;
+    cl_mem bufAP, bufX;
+    clMath::BlasBase *base;
+    cl_event *events;
+    bool useAlpha;
+    T alpha;
+
+    base = clMath::BlasBase::getInstance();
+
+    // The cl_double instantiation is skipped when the device lacks support.
+    if ((typeid(T) == typeid(cl_double)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Element counts without offsets: packed triangle and strided vector.
+    size_t lengthAP = ( ( params->N*( params->N + 1 ) )/2 );
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+
+    blasAP   = new T[lengthAP + params->offa];
+    clblasAP = new T[lengthAP + params->offa];
+    X        = new T[lengthX + params->offBX];
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Check the allocations before any of the arrays is written to.
+    if((blasAP == NULL) || (X == NULL) || (clblasAP == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(blasAP, clblasAP, X);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    // Pre-fill every byte of the host arrays with 0xFF. The byte counts are
+    // scaled by the element size so the whole array is covered, not a prefix.
+    memset(blasAP, -1, (lengthAP + params->offa) * sizeof(*blasAP));
+    memset(clblasAP, -1, (lengthAP + params->offa) * sizeof(*clblasAP));
+    memset(X, -1, (lengthX + params->offBX) * sizeof(*X));
+
+    alpha =  convertMultiplier<T>(params->alpha);
+    useAlpha = true;
+
+    #ifdef DEBUG_SPR
+    printf("ALPHA in CORR_SPR.CPP %f\n", alpha);
+    #endif
+
+    randomSyrMatrices<T>(params->order, params->uplo, params->N, useAlpha, &alpha,
+                        (blasAP + params->offa), 0, (X + params->offBX), params->incx);
+
+    // Both implementations start from identical matrix contents.
+    memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP));
+    ::std::cerr << "Done" << ::std::endl;
+
+    bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa) * sizeof(*clblasAP), 0, CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xSPR routine... ";
+    // The reference routine is always called column-major; for any other
+    // order the uplo flag is flipped instead.
+    clblasUplo fUplo = params->uplo;
+
+    if (params->order != clblasColumnMajor)
+    {
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+        if( params->transA == clblasConjTrans )
+            doConjugate( (blasAP +params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 );
+    }
+
+    clMath::blas::spr( clblasColumnMajor, fUplo, params->N, alpha, X, params->offBX, params->incx, blasAP, params->offa);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufAP == NULL) || (bufX == NULL) ) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufAP, bufX);
+        deleteBuffers<T>(blasAP, clblasAP, X);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSPR routine... ";
+
+    err = (cl_int)::clMath::clblas::spr( params->order, params->uplo, params->N, alpha,
+                        bufX, params->offBX, params->incx, bufAP, params->offa,
+                        params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+
+    // Resources are freed first because a failing ASSERT_EQ leaves the function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX);
+        deleteBuffers<T>(blasAP, clblasAP, X);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPR() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX);
+        deleteBuffers<T>(blasAP, clblasAP, X);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read of the whole packed matrix (offset region included).
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0,
+        (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "SPR: Reading results failed...." << std::endl;
+    }
+
+    releaseMemObjects(bufAP, bufX);
+    printf("Comparing the results\n");
+    compareMatrices<T>(clblasColumnMajor, lengthAP , 1, (blasAP + params->offa), (clblasAP + params->offa),
+                       lengthAP);
+
+    deleteBuffers<T>(blasAP, clblasAP, X);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(SPR, sspr) {
+    // Runs the SPR correctness check instantiated for cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    sprCorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(SPR, dspr) {
+    // Runs the SPR correctness check instantiated for cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    sprCorrectnessTest<cl_double>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-spr2.cpp b/src/tests/correctness/corr-spr2.cpp
new file mode 100644
index 0000000..c000e64
--- /dev/null
+++ b/src/tests/correctness/corr-spr2.cpp
@@ -0,0 +1,216 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <spr2.h>
+
+// Releases the OpenCL memory objects created for one SPR2 test run.
+// NULL handles are skipped; the rest are released in the order A, X, Y.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    const cl_mem handles[] = { objA, objX, objY };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] != NULL) {
+            clReleaseMemObject(handles[i]);
+        }
+    }
+}
+
+// Frees the host-side arrays of one SPR2 test run.
+//
+//   blasA   - packed matrix passed to the reference BLAS call
+//   clblasA - packed matrix used for the clblas call
+//   X, Y    - input vectors
+// Every pointer must come from new[] or be NULL.
+template <typename T> static void
+deleteBuffers(T *blasA, T *clblasA, T *X, T *Y)
+{
+    // delete[] on a null pointer is a no-op, so no NULL guards are
+    // needed here.
+    delete[] blasA;
+    delete[] clblasA;
+    delete[] X;
+    delete[] Y;
+}
+
+template <typename T> // Correctness check of clblas xSPR2 against the reference BLAS xSPR2 for element type T.
+void
+spr2CorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *blasAP, *clblasAP, *X, *Y;
+    cl_mem bufAP, bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+	bool useAlpha;
+	T alpha;
+    // Sizes, offsets, increments, alpha, the seed and the queue count are all read from *params.
+    base = clMath::BlasBase::getInstance();
+    // The cl_double instantiation is skipped when the device reports no double-precision support.
+    if ((typeid(T) == typeid(cl_double)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One zero-initialised event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Element counts without offsets: N * (N + 1) / 2 for the packed matrix, 1 + (N - 1) * |inc| per vector.
+    size_t lengthAP = (params->N *( params->N + 1 ))/2 ;
+    size_t lengthX = (1 + ((params->N - 1) * abs(params->incx)));
+	size_t lengthY = (1 + ((params->N - 1) * abs(params->incy)));
+
+    blasAP 		= new T[lengthAP + params->offa ];
+    clblasAP 	= new T[lengthAP + params->offa ];
+    X		 	= new T[lengthX + params->offBX ];
+	Y			= new T[lengthY + params->offCY ];
+
+    srand(params->seed);
+    // NOTE(review): plain new[] throws std::bad_alloc rather than returning NULL, so this branch is presumably unreachable.
+	if((blasAP == NULL) || (clblasAP == NULL) || (X == NULL) || (Y == NULL))
+    {
+        deleteBuffers<T>(blasAP, clblasAP, X, Y);
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+	alpha =  convertMultiplier<T>(params->alpha);
+	useAlpha = true;
+
+    ::std::cerr << "Generating input data... ";
+    randomSyr2Matrices<T>(params->order, params->uplo, params->N, useAlpha, &alpha, (blasAP + params->offa), params->lda,
+							(X + params->offBX), params->incx, (Y + params->offCY), params->incy);
+
+	// Copy blasAP to clblasAP
+    memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP));
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Allocate buffers
+    bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa)* sizeof(*clblasAP), 0,CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+	bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xSPR2 routine... ";
+    // The reference routine is always called column-major; for any other order the uplo flag is flipped instead.
+	clblasOrder order;
+    clblasUplo fUplo;
+
+	order = params->order;
+    fUplo = params->uplo;
+
+	if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+    }
+
+	::clMath::blas::spr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx,
+		                Y, params->offCY, params->incy, blasAP, params->offa);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(blasAP, clblasAP, X, Y);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSPR2 routine... ";
+
+    err = (cl_int)::clMath::clblas::spr2( params->order, params->uplo, params->N, alpha,
+						bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufAP, params->offa,
+						params->numCommandQueues, base->commandQueues(),
+    					0, NULL, events);
+    // Resources are freed before ASSERT_EQ because a failing fatal assertion leaves the function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(blasAP, clblasAP, X, Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPR2() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufY);
+        deleteBuffers<T>(blasAP, clblasAP, X, Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the whole packed matrix, offset region included, back into clblasAP.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0,
+                                (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0,
+                                NULL, NULL);
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "SPR2: Reading results failed...." << std::endl;
+	}
+    // NOTE(review): a failed read is only logged; the comparison below still runs.
+    releaseMemObjects(bufAP, bufX, bufY);
+
+    compareMatrices<T>(clblasColumnMajor, lengthAP, 1, (blasAP + params->offa), (clblasAP + params->offa), lengthAP);
+
+	deleteBuffers<T>(blasAP, clblasAP, X, Y);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(SPR2, sspr2) {
+    // Runs the SPR2 correctness check instantiated for cl_float.
+    TestParams testArgs;
+    getParams(&testArgs);
+    spr2CorrectnessTest<cl_float>(&testArgs);
+}
+
+TEST_P(SPR2, dspr2) {
+    // Runs the SPR2 correctness check instantiated for cl_double.
+    TestParams testArgs;
+    getParams(&testArgs);
+    spr2CorrectnessTest<cl_double>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-swap.cpp b/src/tests/correctness/corr-swap.cpp
new file mode 100644
index 0000000..e62f88a
--- /dev/null
+++ b/src/tests/correctness/corr-swap.cpp
@@ -0,0 +1,221 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <swap.h>
+
+// Release the two device buffers of the SWAP test. NULL handles are skipped, so
+// the helper may be called after only one of the buffers was created.
+static void
+releaseMemObjects(cl_mem objX,  cl_mem objY)
+{
+    cl_mem handles[] = { objX, objY };
+
+    for (size_t i = 0; i < sizeof(handles) / sizeof(handles[0]); i++) {
+        if (handles[i] == NULL) { continue; }
+        clReleaseMemObject(handles[i]);
+    }
+}
+
+/*
+ * Free the host-side arrays of the SWAP correctness test.
+ *
+ *   X, Y         - vectors that are uploaded to the device and read back
+ *   blasX, blasY - copies handed to the reference BLAS implementation
+ *
+ * Every argument must be NULL or a pointer obtained from new[]. delete[]
+ * ignores null pointers, so no explicit checks are required.
+ */
+template <typename T> static void
+deleteBuffers(T *X, T *Y,  T *blasX, T *blasY)
+{
+    T *hostArrays[] = { X, Y, blasX, blasY };
+    const size_t count = sizeof(hostArrays) / sizeof(hostArrays[0]);
+
+    // Freed in parameter order.
+    for (size_t i = 0; i < count; i++) {
+        delete[] hostArrays[i];
+    }
+}
+
+template <typename T>
+void
+swapCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *X, *Y, *blasX, *blasY;
+    cl_mem bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+    // Purpose: run xSWAP through the reference BLAS and through clBLAS on identical X/Y data and compare the results.
+    base = clMath::BlasBase::getInstance();
+    // Double-precision element types are only tested when the device reports support for them.
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One event slot per command queue, zero-filled before use.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Number of elements spanned by N entries that are |inc| apart.
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));
+    // Every host array also reserves the leading offBX / offCY elements.
+    X 		= new T[lengthX + params->offBX ];
+    Y 		= new T[lengthY + params->offCY ];
+    blasX 	= new T[lengthX + params->offBX ];
+    blasY	= new T[lengthY + params->offCY ];
+    // NOTE(review): plain new[] normally throws instead of returning NULL, so this branch is presumably never taken.
+	if((X == NULL) || (blasX == NULL) || (Y == NULL) || (blasY == NULL))
+	{
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(X, Y, blasX, blasY);
+		delete[] events;
+		SUCCEED();
+        return;
+	}
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Populate X and Y, starting at their offsets
+    randomVectors(params->N, (X+params->offBX), params->incx, (Y+params->offCY), params->incy);
+    // blasX / blasY are the copies the reference implementation works on.
+	memcpy(blasX, X, (lengthX + params->offBX) * sizeof(T));
+	memcpy(blasY, Y, (lengthY + params->offCY) * sizeof(T));
+
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Device buffers for X and Y (offset region included); a NULL result is handled just below
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_WRITE);
+    bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(T), 0, CL_MEM_READ_WRITE);
+
+	if ((bufX == NULL) || (bufY == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(X, Y, blasX, blasY);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling reference xSWAP routine... ";
+
+	::clMath::blas::swap( params->N, blasX, params->offBX, params->incx,
+						 blasY, params->offCY, params->incy);
+    ::std::cerr << "Done" << ::std::endl;
+
+
+    ::std::cerr << "Calling clblas xSWAP routine... ";
+    // Map the element type T to the DataType tag passed to the clBLAS wrapper.
+    DataType type;
+    type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : (( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: (( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE));
+
+    err = (cl_int)::clMath::clblas::swap( type, params->N, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy,
+										  params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+    // Resources are freed before asserting because a failing ASSERT_EQ returns from this function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(X, Y, blasX, blasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SWAP() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufX, bufY);
+        deleteBuffers<T>(X, Y, blasX, blasY);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking reads (CL_TRUE) of both device vectors back into X and Y; the two status codes are OR-ed together.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        (lengthX + params->offBX) * sizeof(T), X, 0, NULL, NULL);
+    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
+        (lengthY + params->offCY) * sizeof(T), Y, 0, NULL, NULL);
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "SWAP: Reading results failed...." << std::endl;
+	}
+
+    releaseMemObjects(bufX, bufY);
+
+    // Each vector is compared as a (length x 1) column-major matrix, skipping the offset region.
+    compareMatrices<T>(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (X + params->offBX), lengthX);
+    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (Y + params->offCY), lengthY);
+    deleteBuffers<T>(X, Y, blasX, blasY);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(SWAPXY, sswap) {
+    // cl_float instantiation of the SWAP correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    swapCorrectnessTest<cl_float>(&testCase);
+}
+
+TEST_P(SWAPXY, dswap) {
+    // cl_double instantiation of the SWAP correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    swapCorrectnessTest<cl_double>(&testCase);
+}
+
+TEST_P(SWAPXY, cswap) {
+    // FloatComplex instantiation of the SWAP correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    swapCorrectnessTest<FloatComplex>(&testCase);
+}
+
+TEST_P(SWAPXY, zswap) {
+    // DoubleComplex instantiation of the SWAP correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    swapCorrectnessTest<DoubleComplex>(&testCase);
+}
diff --git a/src/tests/correctness/corr-symm.cpp b/src/tests/correctness/corr-symm.cpp
new file mode 100644
index 0000000..cb74768
--- /dev/null
+++ b/src/tests/correctness/corr-symm.cpp
@@ -0,0 +1,281 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <symm.h>
+#include<cltypes.h>
+
+// Drop the references held on the three SYMM device buffers.
+// A NULL handle (a buffer that was never created) is skipped, which lets the
+// test call this helper on its buffer-creation failure path.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC)
+{
+    if (objA) {
+        clReleaseMemObject(objA);
+    }
+    if (objB) {
+        clReleaseMemObject(objB);
+    }
+    if (objC) {
+        clReleaseMemObject(objC);
+    }
+}
+
+/*
+ * Free the host matrices allocated by the SYMM correctness test.
+ *
+ *   A, B  - input operands
+ *   C     - matrix updated by the reference BLAS
+ *   backC - copy of C that receives the clBLAS result
+ *
+ * Each pointer is expected to be NULL or the result of new[].
+ * delete[] on a null pointer does nothing, which makes separate NULL
+ * tests unnecessary.
+ */
+template <typename T> static void
+deleteBuffers(T *A, T *B, T *C, T *backC)
+{
+    delete[] A;
+    delete[] B;
+
+    delete[] C;
+    delete[] backC;
+}
+
+// Correctness test for xSYMM: the reference BLAS updates C on the host, clBLAS updates a device
+// copy of the same matrix (read back into backC), and the two M x N results are compared.
+template <typename T>
+void
+symmCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *B, *C, *backC;
+    T alpha_, beta_;
+    cl_mem bufA, bufB, bufC;
+    clMath::BlasBase *base;
+    cl_event *events;
+    size_t ka, kbc;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+    // One zero-filled event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // A is the symmetric ka x ka operand: M x M when it is applied from the
+    // left, N x N when it is applied from the right.
+    ka = (params->side == clblasLeft) ? params->M : params->N;
+
+    // B and C are M x N; kbc is the number of ldb/ldc-sized columns
+    // (column-major) or rows (row-major) they occupy in memory.
+    kbc = (params->order == clblasColumnMajor) ? params->N : params->M;
+
+    size_t lengthA = ka  * params->lda;
+    size_t lengthB = kbc * params->ldb;
+    size_t lengthC = kbc * params->ldc;
+
+    alpha_ = convertMultiplier<T>(params->alpha);
+    beta_ = convertMultiplier<T>(params->beta);
+
+    A       = new T[ lengthA + params->offa ];
+    B       = new T[ lengthB + params->offb ];
+    C       = new T[ lengthC + params->offc ];
+    backC   = new T[ lengthC + params->offc ];
+    // NOTE(review): plain new[] throws on failure; this check presumably only matters for non-throwing allocators.
+    if((A == NULL) || (B == NULL) || (C == NULL) || (backC == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(A, B, C, backC);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+
+    int creationFlags = 0, AcreationFlags;
+    creationFlags =  creationFlags | RANDOM_INIT;
+    creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
+    AcreationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY);
+    BlasRoutineID BlasFn = CLBLAS_SYMM;
+
+#ifdef __TEST_CSYMM_ACML_NANBUG__
+    //
+    // NOTE: Whether this clearing to zero is present or not,
+    //       ACML returns "nan" for a few csymm cases. It is here
+    //       to make things easier and to rule out out-of-bound inputs.
+    //
+    memset(A, 0, (lengthA + params->offa)*sizeof(T));
+    memset(B, 0, (lengthB + params->offb)*sizeof(T));
+    memset(C, 0, (lengthC + params->offc)*sizeof(T));
+#else
+    populate( A + params->offa , ka, ka, params-> lda, BlasFn, AcreationFlags);
+    populate( B + params->offb , params-> M, params-> N, params-> ldb, BlasFn, creationFlags);
+    populate( C + params->offc , params-> M, params-> N, params-> ldc, BlasFn, creationFlags);
+#endif
+
+    // Copy C to backC: C is updated by the reference BLAS, backC travels to the device
+    memcpy(backC, C, (lengthC + params->offc) * sizeof(T));
+
+    // Allocate buffers (A and B are read-only inputs, C is read and written)
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(T), 0, CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(B, (lengthB + params->offb) * sizeof(T), 0, CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(backC, (lengthC + params->offc) * sizeof(T), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Done" << ::std::endl;
+    ::std::cerr << "Calling reference xSYMM routine... ";
+
+    clblasOrder fOrder;
+    clblasUplo fUplo;
+    clblasSide fSide;
+    size_t fN, fM;
+
+    fOrder = params->order;
+    fUplo = params->uplo;
+    fSide = params->side;
+    fM = params->M;
+    fN = params->N;
+    // The reference routine is always called as column-major.
+    if (fOrder != clblasColumnMajor) {
+        // Row-major data are left in place; M/N, side and uplo are swapped instead.
+        fOrder = clblasColumnMajor;
+        fM = params->N;
+        fN = params->M;
+        fSide = (params->side == clblasLeft)? clblasRight: clblasLeft;
+        fUplo = (params->uplo == clblasUpper)? clblasLower: clblasUpper;
+    }
+
+    // Call reference blas routine
+    clMath::blas::symm(fOrder, fSide, fUplo, fM, fN, alpha_,
+                            A, params->offa, params->lda, B, params->offb, params->ldb, beta_, C, params->offc, params->ldc);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, C, backC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSYMM routine... ";
+
+    err = (cl_int)::clMath::clblas::symm( params->order, params->side, params->uplo, params->M, params->N, alpha_,
+                            bufA, params->offa, params->lda, bufB, params->offb, params->ldb, beta_, bufC, params->offc, params->ldc,
+                            params->numCommandQueues, base->commandQueues(), 0, NULL, events );
+
+    if (err != CL_SUCCESS) {
+        // Free everything first: a failing ASSERT_EQ returns from this function.
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, C, backC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYMM() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        // Same cleanup-then-assert pattern as above.
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, C, backC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read of the clBLAS result into backC; a failed read is reported (non-fatally) instead of ignored.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0,
+        (lengthC + params->offc) * sizeof(T), backC, 0,
+        NULL, NULL);
+    EXPECT_EQ(CL_SUCCESS, err) << "SYMM: reading results failed";
+
+    releaseMemObjects(bufA, bufB, bufC);
+
+    // compareMatrices() receives the storage order together with ldc
+    compareMatrices<T>(params->order, params->M , params->N, (C + params->offc), (backC + params->offc), params->ldc);
+    deleteBuffers<T>(A, B, C, backC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+#ifndef __TEST_CSYMM_ACML_NANBUG__
+TEST_P(SYMM, ssymm) {
+    // cl_float instantiation of the SYMM correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    symmCorrectnessTest<cl_float>(&testCase);
+}
+
+TEST_P(SYMM, dsymm) {
+    // cl_double instantiation of the SYMM correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    symmCorrectnessTest<cl_double>(&testCase);
+}
+
+TEST_P(SYMM, csymm) {
+    // FloatComplex instantiation of the SYMM correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    symmCorrectnessTest<FloatComplex>(&testCase);
+}
+
+TEST_P(SYMM, zsymm) {
+    // DoubleComplex instantiation of the SYMM correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    symmCorrectnessTest<DoubleComplex>(&testCase);
+}
+#else
+TEST_P(SYMM, csymm) {
+    // Only test compiled when __TEST_CSYMM_ACML_NANBUG__ is defined (FloatComplex case).
+    TestParams testCase;
+    getParams(&testCase);
+    symmCorrectnessTest<FloatComplex>(&testCase);
+}
+
+#endif
+
diff --git a/src/tests/correctness/corr-symv.cpp b/src/tests/correctness/corr-symv.cpp
new file mode 100644
index 0000000..b5fb420
--- /dev/null
+++ b/src/tests/correctness/corr-symv.cpp
@@ -0,0 +1,223 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <symv.h>
+
+#include "tcase-filter.h"
+
+// Release the SYMV device buffers; NULL handles (buffers that could not be created) are skipped.
+static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    if (objA != NULL) { clReleaseMemObject(objA); }
+    if (objX != NULL) { clReleaseMemObject(objX); }
+    if (objY != NULL) { clReleaseMemObject(objY); }
+}
+
+template <typename T> static void
+deleteBuffers(T *A, T *X, T *blasY, T *clblasY)
+{
+    // Host arrays of the SYMV test, freed in parameter order.
+    T *hostArrays[] = { A, X, blasY, clblasY };
+    for (size_t i = 0; i < sizeof(hostArrays) / sizeof(hostArrays[0]); i++)
+        delete[] hostArrays[i];
+}
+
+// Correctness test for xSYMV (symmetric matrix-vector product). The reference BLAS and clBLAS
+// run on the same generated data and the resulting Y vectors are compared. X and Y live inside
+// the matrices B and C (blasC / clblasC) at the element offsets offBX and offCY.
+template <typename T>
+void
+symvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *B, *blasC, *clblasC, *X, *Y;
+    T alpha, beta;
+    cl_mem bufA, bufB, bufC;
+    clMath::BlasBase *base;
+    bool useAlpha, useBeta;
+    cl_event *events;
+    bool isComplex;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+    // Coverage filter: canCaseBeSkipped() decides whether this parameter set is run at all.
+    isComplex = ((typeid(T) == typeid(FloatComplex)) ||
+                 (typeid(T) == typeid(DoubleComplex)));
+    if (canCaseBeSkipped(params, isComplex)) {
+        std::cerr << ">> Test is skipped because it has no importance for this "
+                     "level of coverage" << std::endl;
+        SUCCEED();
+        return;
+    }
+    // alpha and beta stay zero unless the test base asks for them (see below).
+    useAlpha = base->useAlpha();
+    useBeta = base->useBeta();
+    alpha = ZERO<T>();
+    beta = ZERO<T>();
+    // One zero-filled event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    A = new T[params->rowsA * params->columnsA];
+    // X and Y are rows or columns in matrices B and C
+    B = new T[params->rowsB * params->columnsB];
+    blasC = new T[params->rowsC * params->columnsC];
+    clblasC = new T[params->rowsC * params->columnsC];
+    X = &B[params->offBX];
+    Y = &blasC[params->offCY];
+
+    srand(params->seed);
+    if (useAlpha) {
+        alpha = convertMultiplier<T>(params->alpha);
+    }
+    if (useBeta) {
+        beta = convertMultiplier<T>(params->beta);
+    }
+
+    ::std::cerr << "Generating input data... ";
+    setNans<T>(params->rowsA * params->columnsA, A);
+    setNans<T>(params->rowsB * params->columnsB, B);
+    setNans<T>(params->rowsC * params->columnsC, blasC);
+    randomGemmMatrices(params->order, clblasNoTrans, clblasNoTrans,
+                       params->N, params->N, params->N, useAlpha, &alpha, A,
+                       params->lda, B, params->ldb, useBeta, &beta, blasC,
+                       params->ldc);
+    // set to NAN elements which must not be accessed
+    // in matrix A
+    setTriangleNans<T>(params->order, params->uplo, params->N, A, params->lda);
+
+    // in matrix B containing vector X
+    setVectorNans<T>(params->offBX, abs(params->incx), B, params->N,
+                  params->columnsB * params->rowsB);
+    // in matrix C containing vector Y
+    setVectorNans<T>(params->offCY, abs(params->incy), blasC, params->N,
+                  params->columnsC * params->rowsC);
+    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*clblasC));
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xSYMV routine... ";
+    // The reference routine is always called as column-major; row-major A goes through reorderMatrix() first.
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::symv(clblasColumnMajor, params->uplo,
+                          params->N, alpha, A, params->lda,
+                          X, params->incx, beta, Y, params->incy);
+    }
+    else {
+        T *reorderedA = new T[params->rowsA * params->columnsA];
+        reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA,
+                         A, reorderedA);
+        ::clMath::blas::symv(clblasColumnMajor, params->uplo,
+                          params->N, alpha, reorderedA, params->rowsA,
+                          X, params->incx, beta, Y, params->incy);
+        delete[] reorderedA;
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
+                                     sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB *
+                                     sizeof(*X), 0, CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
+                                     sizeof(*clblasC), 0, CL_MEM_READ_WRITE);
+    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSYMV routine... ";
+    err = (cl_int)::clMath::clblas::symv(params->order, params->uplo,
+        params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX,
+        params->incx, beta, bufC, params->offCY, params->incy,
+        params->numCommandQueues, base->commandQueues(),
+        0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read of the clBLAS result; a failed read is reported (non-fatally) instead of ignored.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0,
+                        params->rowsC * params->columnsC * sizeof(*clblasC),
+                        clblasC, 0, NULL, NULL);
+    EXPECT_EQ(CL_SUCCESS, err) << "SYMV: reading results failed";
+
+    releaseMemObjects(bufA, bufB, bufC);
+    compareVectors(params->offCY, params->N, abs(params->incy),
+                   params->columnsC * params->rowsC, blasC, clblasC);
+
+    deleteBuffers<T>(A, B, blasC, clblasC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(SYMV, ssymv) {
+    // cl_float instantiation of the SYMV correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    symvCorrectnessTest<cl_float>(&testCase);
+}
+
+TEST_P(SYMV, dsymv) {
+    // cl_double instantiation of the SYMV correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    symvCorrectnessTest<cl_double>(&testCase);
+}
diff --git a/src/tests/correctness/corr-syr.cpp b/src/tests/correctness/corr-syr.cpp
new file mode 100644
index 0000000..12967c9
--- /dev/null
+++ b/src/tests/correctness/corr-syr.cpp
@@ -0,0 +1,266 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <syr.h>
+
+// Release the OpenCL buffers of the SYR test (named after matrix A and
+// vector X). A handle equal to NULL is left alone, so the helper is safe
+// to call when only some of the buffers exist.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX)
+{
+    if (NULL != objA) {
+        clReleaseMemObject(objA);
+    }
+    if (NULL != objX) {
+        clReleaseMemObject(objX);
+    }
+}
+
+/*
+ * Free the host arrays of the SYR test: the matrix given to the reference
+ * BLAS (blasA), the matrix exchanged with the device (clblasA) and the
+ * vector X. NULL pointers are accepted because delete[] ignores them.
+ */
+template <typename T> static void
+deleteBuffers(T *blasA, T *clblasA, T *X)
+{
+    T *owned[] = { blasA, clblasA, X };
+    const size_t count = sizeof(owned) / sizeof(owned[0]);
+
+    size_t i = 0;
+    while (i < count) {
+        delete[] owned[i++];
+    }
+}
+
+/*
+ * Correctness test for xSYR (symmetric rank-1 update of matrix A by vector X).
+ *
+ * The same generated A and X are processed by the reference BLAS (host copy
+ * blasA) and by clBLAS (device copy of clblasA); the two N x N results are
+ * then compared with compareMatrices().
+ *
+ * T is the element type; the tests in this file use cl_float and cl_double.
+ *
+ * Fields of params read here:
+ *   order, uplo      - storage order and triangle of A that is updated
+ *   N, lda           - dimension and leading dimension of A
+ *   alpha            - scalar multiplier, converted to T
+ *   incx             - stride of X (its absolute value sizes the array)
+ *   offa, offBX      - element offsets of A and X inside their arrays
+ *   seed             - value passed to srand()
+ *   numCommandQueues - number of queues (and events) handed to clBLAS
+ *
+ * The test is reported as passed without running when T is cl_double and
+ * the device lacks double-precision support, or when a device buffer could
+ * not be created.
+ */
+template <typename T>
+void
+syrCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *blasA, *clblasA, *X;
+    cl_mem bufA, bufX;
+    clMath::BlasBase *base;
+    cl_event *events;
+    bool useAlpha;
+    T alpha;
+
+    base = clMath::BlasBase::getInstance();
+
+    // cl_double needs native double-precision support on the device.
+    if ((typeid(T) == typeid(cl_double)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    // One event slot per command queue, zero-filled before use.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Element counts without the leading offsets; X spans 1 + (N-1)*|incx|
+    // elements because its entries are |incx| apart.
+    size_t lengthA = params->N * params->lda;
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+
+    blasA   = new T[lengthA + params->offa ];
+    clblasA = new T[lengthA + params->offa ];
+    X       = new T[lengthX + params->offBX ];
+
+    // NOTE(review): plain new[] throws on failure, so this branch presumably
+    // only matters when a non-throwing allocator is in use.
+    if((blasA == NULL) || (X == NULL) || (clblasA == NULL))
+    {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(blasA, clblasA, X);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    // Seed rand() from the test parameters.
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Poison every byte of the host arrays (0xFF in all bytes is a NaN bit
+    // pattern for float and double) before the generator fills them.
+    // memset() counts bytes, hence the sizeof(T) factor.
+    memset(blasA, -1, (lengthA + params->offa) * sizeof(T));
+    memset(clblasA, -1, (lengthA + params->offa) * sizeof(T));
+    memset(X, -1, (lengthX + params->offBX) * sizeof(T));
+
+    alpha =  convertMultiplier<T>(params->alpha);
+    useAlpha = true;
+
+    #ifdef DEBUG_SYR
+    printf("ALPHA in CORR_SYR.CPP %f\n", alpha);
+    #endif
+
+    randomSyrMatrices<T>(params->order, params->uplo, params->N, useAlpha, &alpha,
+                        (blasA + params->offa), params->lda, (X + params->offBX), params->incx);
+
+    // Copy blasA to clblasA so both implementations start from identical data
+    memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA));
+
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers: bufA is created from clblasA, bufX from X (offset
+    // regions included). A NULL result is handled after the reference call.
+    bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa) * sizeof(*clblasA), 0, CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xSYR routine... ";
+
+    clblasOrder order;
+    clblasUplo fUplo;
+
+    order = params->order;
+    fUplo = params->uplo;
+
+    // The reference routine is always invoked as column-major. Reading a
+    // row-major matrix as column-major yields its transpose, so for row-major
+    // input the opposite triangle is selected instead of reordering the data.
+    if (order != clblasColumnMajor)
+    {
+        fUplo = (fUplo == clblasUpper) ? clblasLower : clblasUpper;
+    }
+    ::clMath::blas::syr( clblasColumnMajor, fUplo, params->N, alpha, X, params->offBX, params->incx, blasA, params->offa, params->lda);
+
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL) ) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(blasA, clblasA, X);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSYR routine... ";
+
+    // The clBLAS call receives the caller's order and uplo unchanged; only
+    // the reference call above is expressed in column-major terms.
+    err = (cl_int)::clMath::clblas::syr( params->order, params->uplo, params->N, alpha,
+                        bufX, params->offBX, params->incx, bufA, params->offa, params->lda,
+                        params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+
+    // Resources are freed before asserting because a failing ASSERT_EQ
+    // returns from this function.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(blasA, clblasA, X);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYR() failed";
+    }
+
+    // waitForSuccessfulFinish() is expected to block until the queued work is done.
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(blasA, clblasA, X);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the updated matrix back into clblasA.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
+        (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        ::std::cerr << "SYR: Reading results failed...." << std::endl;
+    }
+
+    releaseMemObjects(bufA, bufX);
+
+    // Compare the N x N matrices (leading dimension lda) produced by the
+    // reference BLAS and by clBLAS, skipping the offset region.
+    printf("Comparing the results\n");
+    compareMatrices<T>(params->order, params->N , params->N, (blasA + params->offa), (clblasA + params->offa),
+                       params->lda);
+
+    deleteBuffers<T>(blasA, clblasA, X);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(SYR, ssyr) {
+    // cl_float instantiation of the SYR correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    syrCorrectnessTest<cl_float>(&testCase);
+}
+
+TEST_P(SYR, dsyr) {
+    // cl_double instantiation of the SYR correctness check.
+    TestParams testCase;
+    getParams(&testCase);
+    syrCorrectnessTest<cl_double>(&testCase);
+}
diff --git a/src/tests/correctness/corr-syr2.cpp b/src/tests/correctness/corr-syr2.cpp
new file mode 100644
index 0000000..4148ed5
--- /dev/null
+++ b/src/tests/correctness/corr-syr2.cpp
@@ -0,0 +1,218 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <syr2.h>
+
+// Releases objA, objX and objY, in that order. NULL handles are skipped, so
+// the function is safe to call when only some buffers were created.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY)
+{
+    cl_mem handles[3] = { objA, objX, objY };
+
+    for (size_t idx = 0; idx < 3; idx++) {
+        if (handles[idx] != NULL) {
+            clReleaseMemObject(handles[idx]);
+        }
+    }
+}
+
+// Frees the host arrays of the SYR2 test. Each argument must be NULL or a
+// pointer obtained from new[].
+//
+// delete[] does nothing for a null pointer, so no explicit NULL tests are
+// needed before the calls.
+template <typename T> static void
+deleteBuffers(T *blasA, T *clblasA, T *X, T *Y)
+{
+    // The two copies of matrix A.
+    delete[] blasA;
+    delete[] clblasA;
+
+    // The two vectors.
+    delete[] X;
+    delete[] Y;
+}
+
+template <typename T>   // T: element type (the tests below use cl_float and cl_double)
+void                    // Runs SYR2 through the reference BLAS and through clblas on the same random data and compares the A matrices.
+syr2CorrectnessTest(TestParams *params)   // params: order, uplo, N, alpha, lda, incx, incy, offa, offBX, offCY, seed, numCommandQueues
+{
+    cl_int err;
+    T *blasA, *clblasA, *X, *Y;   // blasA: reference result; clblasA: device result; X, Y: input vectors
+    cl_mem bufA, bufX, bufY;
+    clMath::BlasBase *base;
+    cl_event *events;
+	bool useAlpha;
+	T alpha;
+
+    base = clMath::BlasBase::getInstance();
+    // A cl_double run is skipped when the device reports no double precision support.
+    if ((typeid(T) == typeid(cl_double)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();   // the skipped case is recorded as a success
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One zero-initialised event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Element counts: A covers N * lda elements; a vector with increment inc covers 1 + (N - 1) * |inc| elements.
+    size_t lengthA = params->N * params->lda;
+    size_t lengthX = (1 + ((params->N - 1) * abs(params->incx)));
+	size_t lengthY = (1 + ((params->N - 1) * abs(params->incy)));
+    // Every host array is enlarged by its offset (offa / offBX / offCY); the generator below writes from that offset on.
+    blasA 		= new T[lengthA + params->offa ];
+    clblasA 	= new T[lengthA + params->offa ];
+    X		 	= new T[lengthX + params->offBX ];
+	Y			= new T[lengthY + params->offCY ];
+
+    srand(params->seed);   // seed the C library RNG from the test parameters
+
+	if((blasA == NULL) || (clblasA == NULL) || (X == NULL) || (Y == NULL))   // NOTE(review): plain new[] throws instead of returning NULL, so this branch looks unreachable - confirm
+    {
+        deleteBuffers<T>(blasA, clblasA, X, Y);
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+	alpha =  convertMultiplier<T>(params->alpha);
+	useAlpha = true;   // alpha always comes from params in this test
+
+    ::std::cerr << "Generating input data... ";
+
+    randomSyr2Matrices<T>(params->order, params->uplo, params->N, useAlpha, &alpha, (blasA + params->offa), params->lda,
+							(X + params->offBX), params->incx, (Y + params->offCY), params->incy);
+
+	// Copy blasA to clblasA so that both implementations start from the same matrix
+    memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA));
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Create the device buffers from the complete host arrays, offset area included (presumably this also uploads the data - see createEnqueueBuffer)
+    bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa)* sizeof(*clblasA), 0,CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
+	bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xSYR2 routine... ";
+    // The reference routine is always called column-major; any other order is mapped onto it by swapping uplo.
+	clblasOrder order;
+    clblasUplo fUplo;
+
+	order = params->order;
+    fUplo = params->uplo;
+
+	if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+    }
+
+	::clMath::blas::syr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx,
+		Y, params->offCY, params->incy, blasA, params->offa, params->lda);
+    ::std::cerr << "Done" << ::std::endl;
+    // The buffers created above are only checked here, after the reference call.
+    if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(blasA, clblasA, X, Y);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSYR2 routine... ";
+
+    err = (cl_int)::clMath::clblas::syr2( params->order, params->uplo, params->N, alpha,
+						bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufA, params->offa, params->lda,
+						params->numCommandQueues, base->commandQueues(),
+    					0, NULL, events);
+    // Resources are freed before ASSERT_EQ because the assertion is already known to fail inside the branch.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(blasA, clblasA, X, Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYR2() failed";   // records a fatal failure and returns from this function
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufY);
+        deleteBuffers<T>(blasA, clblasA, X, Y);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the whole A buffer, from byte offset 0, back into clblasA.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
+        (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0,
+        NULL, NULL);
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "SYR2: Reading results failed...." << std::endl;   // NOTE(review): the failure is only logged; the comparison below still runs
+	}
+
+    releaseMemObjects(bufA, bufX, bufY);
+    // Compare N x N elements with leading dimension lda, starting offa elements into each host array.
+    compareMatrices<T>(clblasColumnMajor, params->N , params->N, (blasA + params->offa), (clblasA + params->offa),
+                       params->lda);
+
+	deleteBuffers<T>(blasA, clblasA, X, Y);
+    delete[] events;
+}
+
+// Instantiate the test
+
+// SYR2 correctness check with cl_float elements; parameters come from getParams().
+TEST_P(SYR2, ssyr2) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syr2CorrectnessTest<cl_float>(&testArgs);
+}
+
+// SYR2 correctness check with cl_double elements; parameters come from getParams().
+TEST_P(SYR2, dsyr2) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syr2CorrectnessTest<cl_double>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-syr2k.cpp b/src/tests/correctness/corr-syr2k.cpp
new file mode 100644
index 0000000..d42c7f4
--- /dev/null
+++ b/src/tests/correctness/corr-syr2k.cpp
@@ -0,0 +1,260 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <syr2k.h>
+
+#include "tcase-filter.h"
+
+static void   /* Releases objA, objB and objC; a NULL handle (buffer never created) is skipped. */
+releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC)
+{
+    if (objA != NULL) { clReleaseMemObject(objA); }   // the skip path of the test passes NULL handles here
+    if (objB != NULL) { clReleaseMemObject(objB); }
+    if (objC != NULL) { clReleaseMemObject(objC); }
+}
+
+template <typename T> static void   // frees the four host arrays, in parameter order; NULL is allowed
+deleteBuffers(T *A, T *B, T *blasC, T *clblasC)
+{
+    T *hostArrays[] = { A, B, blasC, clblasC };
+    for (size_t idx = 0; idx < 4; idx++) {
+        delete[] hostArrays[idx];   // delete[] does nothing for a null pointer
+    }
+}
+
+template <typename T>   // T: element type (the tests below use cl_float, cl_double, FloatComplex and DoubleComplex)
+void                    // Runs SYR2K through the reference BLAS and through clblas on the same random data and compares the C matrices.
+syr2kCorrectnessTest(TestParams *params)   // params: order, uplo, transA, N, K, alpha, beta, lda/ldb/ldc, rows*/columns*, offA/offBX/offCY, seed, numCommandQueues
+{
+    cl_int err;
+    T *A, *B, *blasC, *clblasC;   // blasC: reference result; clblasC: device result
+    T alpha, beta, a;
+    cl_mem bufA, bufB, bufC;
+    clMath::BlasBase *base;
+    bool useAlpha;
+    bool useBeta;
+    cl_event *events;
+    clblasTranspose transB;
+    bool isComplex;
+
+    base = clMath::BlasBase::getInstance();
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+         !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();   // the skipped case is recorded as a success
+        return;
+    }
+    // Complex element types combined with clblasConjTrans are skipped as well.
+    if ((typeid(T) == typeid(FloatComplex)) ||
+        (typeid(T) == typeid(DoubleComplex))) {
+        if (params->transA == clblasConjTrans) {
+            ::std::cerr << ">> syr2k(CONJUGATE_TRANSPOSE) for complex numbers "
+                           "is not allowed." << ::std::endl <<
+                           ">> Test skipped." << ::std::endl;
+            SUCCEED();
+            return;
+        }
+    }
+
+    isComplex = ((typeid(T) == typeid(FloatComplex)) ||
+                 (typeid(T) == typeid(DoubleComplex)));
+    if (canCaseBeSkipped(params, isComplex)) {
+        std::cerr << ">> Test is skipped because it has no importance for this "
+                     "level of coverage" << std::endl;
+        SUCCEED();
+        return;
+    }
+    // alpha and beta start at zero; they are replaced below depending on useAlpha / useBeta.
+    useAlpha = base->useAlpha();
+    useBeta = base->useBeta();
+    alpha = ZERO<T>();
+    beta = ZERO<T>();
+    // One zero-initialised event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Host matrices hold rows * columns elements each; no extra room for the offsets is added here.
+    A = new T[params->rowsA * params->columnsA];
+    B = new T[params->rowsB * params->columnsB];
+    blasC = new T[params->rowsC * params->columnsC];
+    clblasC = new T[params->rowsC * params->columnsC];
+
+    srand(params->seed);   // seed the C library RNG from the test parameters
+    if (useAlpha) {
+        alpha = convertMultiplier<T>(params->alpha);
+    }
+    if (useBeta) {
+        beta = convertMultiplier<T>(params->beta);
+    }
+
+    ::std::cerr << "Generating input data... ";
+    if (!useAlpha) {
+        alpha = random<T>(100);
+        if (module(alpha) == 0.0) {
+            alpha = ONE<T>();   // a zero random multiplier is replaced by one
+        }
+    }
+    // The generator receives a = 2 * alpha. NOTE(review): presumably because SYR2K adds two alpha-scaled products - confirm against randomGemmMatrices.
+    a = alpha * 2;
+    transB = (params->transA == clblasNoTrans) ? clblasTrans :
+             clblasNoTrans;   // B is generated with the opposite transpose flag of A
+    randomGemmMatrices<T>(params->order, params->transA, transB,
+        params->N, params->N, params->K, true, &a, A, params->lda,
+        B, params->ldb, useBeta, &beta, blasC, params->ldc);
+    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));   // both implementations start from the same C
+    ::std::cerr << "Done" << ::std::endl;
+    // Reference result: the reference routine is always invoked as column-major.
+    ::std::cerr << "Calling reference xSYR2K routine... ";
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::syr2k(clblasColumnMajor, params->uplo, params->transA,
+                          params->N, params->K, alpha, A,
+                          params->lda, B, params->ldb, beta, blasC, params->ldc);
+    }
+    else {
+        T *reorderedA = new T[params->rowsA * params->columnsA];
+        T *reorderedB = new T[params->rowsB * params->columnsB];
+        T *reorderedC = new T[params->rowsC * params->columnsC];
+        // Row-major inputs go through reorderMatrix into temporaries that are used with leading dimension = row count.
+        reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA,
+                         A, reorderedA);
+        reorderMatrix<T>(clblasRowMajor, params->rowsB, params->columnsB,
+                         B, reorderedB);
+        reorderMatrix<T>(clblasRowMajor, params->rowsC, params->columnsC,
+                         blasC, reorderedC);
+        ::clMath::blas::syr2k(clblasColumnMajor, params->uplo, params->transA,
+                          params->N, params->K, alpha, reorderedA,
+                          params->rowsA, reorderedB, params->rowsB,
+                          beta, reorderedC, params->rowsC);
+        reorderMatrix<T>(clblasColumnMajor, params->rowsC, params->columnsC,
+                         reorderedC, blasC);   // result is written back into blasC
+
+        delete[] reorderedC;
+        delete[] reorderedB;
+        delete[] reorderedA;
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Device buffers: the third argument passes the element offset (offA / offBX / offCY) converted to bytes.
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
+                                     sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB *
+                                     sizeof(*B), params->offBX * sizeof(*B),
+                                     CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
+                                     sizeof(*clblasC),
+                                     params->offCY * sizeof(*clblasC),
+                                     CL_MEM_READ_WRITE);
+    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB, bufC);   // may receive NULL handles on this path
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSYR2K routine... ";
+    err = (cl_int)::clMath::clblas::syr2k(params->order, params->uplo,
+                                          params->transA, params->N, params->K,
+                                          alpha, bufA, params->offA,
+                                          params->lda, bufB, params->offBX,
+                                          params->ldb, beta, bufC,
+                                          params->offCY, params->ldc,
+                                          params->numCommandQueues,
+                                          base->commandQueues(),
+                                          0, NULL, events);
+    if (err != CL_SUCCESS) {   // resources are freed first because the assertion below is already known to fail
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYR2K() failed";   // records a fatal failure and returns from this function
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB, bufC);
+        deleteBuffers<T>(A, B, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read of C starting at byte offset offCY * sizeof(T). NOTE(review): the return code is not checked.
+    clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE,
+                        params->offCY * sizeof(*clblasC),
+                        params->rowsC * params->columnsC * sizeof(*clblasC),
+                        clblasC, 0, NULL, NULL);
+
+    releaseMemObjects(bufA, bufB, bufC);
+    compareMatrices<T>(params->order, params->N, params->N, blasC, clblasC,
+                       params->ldc);   // N x N comparison with leading dimension ldc
+
+    deleteBuffers<T>(A, B, blasC, clblasC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+// SYR2K correctness check with cl_float elements; parameters come from getParams().
+TEST_P(SYR2K, ssyr2k) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syr2kCorrectnessTest<cl_float>(&testArgs);
+}
+
+// SYR2K correctness check with cl_double elements; parameters come from getParams().
+TEST_P(SYR2K, dsyr2k) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syr2kCorrectnessTest<cl_double>(&testArgs);
+}
+
+
+// SYR2K correctness check with FloatComplex elements; parameters come from getParams().
+TEST_P(SYR2K, csyr2k) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syr2kCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+// SYR2K correctness check with DoubleComplex elements; parameters come from getParams().
+TEST_P(SYR2K, zsyr2k) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syr2kCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-syrk.cpp b/src/tests/correctness/corr-syrk.cpp
new file mode 100644
index 0000000..8977718
--- /dev/null
+++ b/src/tests/correctness/corr-syrk.cpp
@@ -0,0 +1,244 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <syrk.h>
+
+#include "tcase-filter.h"
+
+static void   /* Releases objA and objC; a NULL handle (buffer never created) is skipped. */
+releaseMemObjects(cl_mem objA, cl_mem objC)
+{
+    if (objA != NULL) { clReleaseMemObject(objA); }   // the skip path of the test passes NULL handles here
+    if (objC != NULL) { clReleaseMemObject(objC); }
+}
+
+// Frees the three host arrays in parameter order; delete[] ignores null pointers.
+template <typename T> static void
+deleteBuffers(T *A, T *blasC, T *clblasC)
+{
+    T *hostArrays[] = { A, blasC, clblasC };
+    for (size_t idx = 0; idx < 3; idx++) { delete[] hostArrays[idx]; }
+}
+
+template <typename T>   // T: element type (the tests below use cl_float, cl_double, FloatComplex and DoubleComplex)
+void                    // Runs SYRK through the reference BLAS and through clblas on the same random data and compares the C matrices.
+syrkCorrectnessTest(TestParams *params)   // params: order, uplo, transA, N, K, alpha, beta, lda/ldc, rows*/columns*, offA/offCY, seed, numCommandQueues
+{
+    cl_int err;
+    T *A, *blasC, *clblasC;   // blasC: reference result; clblasC: device result
+    T alpha, beta;
+    cl_mem bufA, bufC;
+    clMath::BlasBase *base;
+    bool useAlpha;
+    bool useBeta;
+    cl_event *events;
+    bool isComplex;
+
+    base = clMath::BlasBase::getInstance();
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+         !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();   // the skipped case is recorded as a success
+        return;
+    }
+    // Complex element types combined with clblasConjTrans are skipped as well.
+    if ((typeid(T) == typeid(FloatComplex)) ||
+        (typeid(T) == typeid(DoubleComplex))) {
+        if (params->transA == clblasConjTrans) {
+            ::std::cerr << ">> syrk(CONJUGATE_TRANSPOSE) for complex numbers "
+                           "is not allowed." << ::std::endl <<
+                           ">> Test skipped." << ::std::endl;
+            SUCCEED();
+            return;
+        }
+    }
+
+    isComplex = ((typeid(T) == typeid(FloatComplex)) ||
+                 (typeid(T) == typeid(DoubleComplex)));
+    if (canCaseBeSkipped(params, isComplex)) {
+        std::cerr << ">> Test is skipped because it has no importance for this "
+                     "level of coverage" << std::endl;
+        SUCCEED();
+        return;
+    }
+    // alpha and beta start at zero; they are replaced below depending on useAlpha / useBeta.
+    useAlpha = base->useAlpha();
+    useBeta = base->useBeta();
+    alpha = ZERO<T>();
+    beta = ZERO<T>();
+    // One zero-initialised event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Host matrices hold rows * columns elements each; no extra room for the offsets is added here.
+    A = new T[params->rowsA * params->columnsA];
+    blasC = new T[params->rowsC * params->columnsC];
+    clblasC = new T[params->rowsC * params->columnsC];
+
+    srand(params->seed);   // seed the C library RNG from the test parameters
+    if (useAlpha) {
+        alpha = convertMultiplier<T>(params->alpha);
+    }
+    if (useBeta) {
+        beta = convertMultiplier<T>(params->beta);
+    }
+
+    ::std::cerr << "Generating input data... ";
+    if (!useAlpha) {
+        alpha = random<T>(100);
+        if (module(alpha) == 0.0) {
+            alpha = ONE<T>();   // a zero random multiplier is replaced by one
+        }
+    }
+    // Data generation reuses the GEMM generator without a B matrix (NULL, ldb 0).
+    randomGemmMatrices<T>(params->order, params->transA, clblasNoTrans,
+        params->N, params->N, params->K, useAlpha, &alpha, A, params->lda,
+        NULL, 0, useBeta, &beta, blasC, params->ldc);
+    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));   // both implementations start from the same C
+    ::std::cerr << "Done" << ::std::endl;
+    // Reference result: the reference routine is always invoked as column-major.
+    ::std::cerr << "Calling reference xSYRK routine... ";
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::syrk(clblasColumnMajor, params->uplo, params->transA,
+                          params->N, params->K, alpha, A, params->lda,
+                          beta, blasC, params->ldc);
+    }
+    else {
+        T *reorderedA = new T[params->rowsA * params->columnsA];
+        T *reorderedC = new T[params->rowsC * params->columnsC];
+        // Row-major inputs go through reorderMatrix into temporaries that are used with leading dimension = row count.
+        reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA,
+                         A, reorderedA);
+        reorderMatrix<T>(clblasRowMajor, params->rowsC, params->columnsC,
+                         blasC, reorderedC);
+        ::clMath::blas::syrk(clblasColumnMajor, params->uplo, params->transA,
+                          params->N, params->K, alpha, reorderedA,
+                          params->rowsA,
+                          beta, reorderedC, params->rowsC);
+        reorderMatrix<T>(clblasColumnMajor, params->rowsC, params->columnsC,
+                         reorderedC, blasC);   // result is written back into blasC
+
+        delete[] reorderedC;
+        delete[] reorderedA;
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Device buffers: the third argument passes the element offset (offA / offCY) converted to bytes.
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
+                                     sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
+                                     sizeof(*clblasC),
+                                     params->offCY * sizeof(*clblasC),
+                                     CL_MEM_READ_WRITE);
+    if ((bufA == NULL) || (bufC == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufC);   // may receive NULL handles on this path
+        deleteBuffers<T>(A, blasC, clblasC);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xSYRK routine... ";
+    err = (cl_int)::clMath::clblas::syrk(params->order, params->uplo,
+                                         params->transA, params->N, params->K,
+                                         alpha, bufA, params->offA, params->lda,
+                                         beta, bufC, params->offCY,
+                                         params->ldc, params->numCommandQueues,
+                                         base->commandQueues(), 0, NULL,
+                                         events);
+    if (err != CL_SUCCESS) {   // resources are freed first because the assertion below is already known to fail
+        releaseMemObjects(bufA, bufC);
+        deleteBuffers<T>(A, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYRK() failed";   // records a fatal failure and returns from this function
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufC);
+        deleteBuffers<T>(A, blasC, clblasC);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+    // Blocking read of C starting at byte offset offCY * sizeof(T). NOTE(review): the return code is not checked.
+    clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE,
+                        params->offCY * sizeof(*clblasC),
+                        params->rowsC * params->columnsC * sizeof(*clblasC),
+                        clblasC, 0, NULL, NULL);
+
+    releaseMemObjects(bufA, bufC);
+    compareMatrices<T>(params->order, params->N, params->N, blasC, clblasC,
+                       params->ldc);   // N x N comparison with leading dimension ldc
+
+    deleteBuffers<T>(A, blasC, clblasC);
+    delete[] events;
+}
+
+// Instantiate the test
+
+// SYRK correctness check with cl_float elements; parameters come from getParams().
+TEST_P(SYRK, ssyrk) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syrkCorrectnessTest<cl_float>(&testArgs);
+}
+
+// SYRK correctness check with cl_double elements; parameters come from getParams().
+TEST_P(SYRK, dsyrk) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syrkCorrectnessTest<cl_double>(&testArgs);
+}
+
+// SYRK correctness check with FloatComplex elements; parameters come from getParams().
+TEST_P(SYRK, csyrk) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syrkCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+// SYRK correctness check with DoubleComplex elements; parameters come from getParams().
+TEST_P(SYRK, zsyrk) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    syrkCorrectnessTest<DoubleComplex>(&testArgs);
+}
+
diff --git a/src/tests/correctness/corr-tbmv.cpp b/src/tests/correctness/corr-tbmv.cpp
new file mode 100644
index 0000000..7b438ff
--- /dev/null
+++ b/src/tests/correctness/corr-tbmv.cpp
@@ -0,0 +1,233 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <tbmv.h>
+
+// Releases the OpenCL memory objects of the TBMV test.
+//
+// Handles are released in the order objA, objX, objXtemp. A NULL handle is
+// skipped, so the function can be called when only some buffers were created.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objXtemp)
+{
+    cl_mem handles[] = { objA, objX, objXtemp };
+    const size_t handleCount = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t idx = 0; idx < handleCount; idx++) {
+        if (handles[idx] == NULL) {
+            continue;
+        }
+        clReleaseMemObject(handles[idx]);
+    }
+}
+
+// Frees the host arrays of the TBMV test. Each argument must be NULL or a
+// pointer obtained from new[]:
+//   A       - the matrix
+//   blasX   - the vector given to the reference routine
+//   clblasX - the vector that holds the clblas TBMV call results
+//
+// delete[] does nothing for a null pointer, so no explicit NULL tests are
+// needed before the calls.
+template <typename T> static void
+deleteBuffers(T *A, T *blasX, T *clblasX)
+{
+    delete[] A;
+
+    delete[] blasX;
+    delete[] clblasX;
+}
+
+template <typename T>   // T: element type (the tests below use cl_float, cl_double, FloatComplex and DoubleComplex)
+void                    // Runs TBMV through the reference BLAS and through clblas on the same random data and compares the X vectors.
+tbmvCorrectnessTest(TestParams *params)   // params: order, uplo, transA, diag, N, K, lda, incx, offA, offBX, seed, numCommandQueues
+{
+    cl_int err;
+    T *A, *blasX, *clblasX;   // blasX: reference result; clblasX: device result
+    cl_mem bufA, bufX, bufXtemp;
+    clMath::BlasBase *base;
+    cl_event *events;
+	size_t lengthX, lengthA;
+
+    base = clMath::BlasBase::getInstance();
+    // Double precision element types are skipped when the device reports no support for them.
+    if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();   // the skipped case is recorded as a success
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+    // One zero-initialised event slot per command queue.
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+    // Element counts: A covers N * lda elements; X covers (N - 1) * |incx| + 1 elements.
+    lengthA = params->N  * params->lda ;
+
+    lengthX = (params->N - 1)*abs(params->incx) + 1;
+    // Every host array is enlarged by its offset (offA / offBX); the generator below writes from that offset on.
+    A 	= new T[ lengthA + params->offA ];
+    blasX  		= new T[ lengthX + params->offBX ];
+	clblasX 	= new T[ lengthX + params->offBX ];
+
+    srand(params->seed);   // seed the C library RNG from the test parameters
+
+    ::std::cerr << "Generating input data... ";
+
+	if((A == NULL) || (blasX == NULL) || (clblasX == NULL))   // NOTE(review): plain new[] throws instead of returning NULL, so this branch looks unreachable - confirm
+	{
+		deleteBuffers<T>(A, blasX, clblasX);
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+	}
+	randomTbmvMatrices( params->N, (A + params->offA), params->lda, (blasX + params->offBX), params->incx );
+
+    // Copy blasX to clblasX
+    memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX));
+    ::std::cerr << "Done" << ::std::endl;
+
+	// Create the device buffers from the complete host arrays; bufX and bufXtemp are both created from blasX
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE);
+    bufXtemp = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xTBMV routine... ";
+    // The reference routine is always called column-major; any other order is mapped onto it below.
+	clblasOrder fOrder;
+	clblasTranspose fTrans;
+    clblasUplo fUplo;
+	fOrder = params->order;
+	fTrans = params->transA;
+    fUplo = params->uplo;
+	size_t  fN = params->N, fK = params->K;
+
+	if (fOrder != clblasColumnMajor)
+    {
+        fOrder = clblasColumnMajor;
+        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;   // NoTrans -> Trans; Trans and ConjTrans -> NoTrans
+        fUplo = (params->uplo == clblasLower)? clblasUpper : clblasLower;
+        // ConjTrans became NoTrans above, so host A is conjugated instead. NOTE(review): bufA was created earlier, so presumably the device copy is unaffected - confirm.
+        if( params->transA == clblasConjTrans )
+            doConjugate( (A + params->offA), 1, lengthA, params->lda );
+   	}
+	// blasX is both input and output of the reference call; the device buffers above were created from blasX before this call.
+	clMath::blas::tbmv(fOrder, fUplo, fTrans, params->diag, fN, fK, A, params->offA, params->lda, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+    // The buffers created above are only checked here, after the reference call.
+    if ((bufA == NULL) || (bufX == NULL)|| (bufXtemp == NULL)) {
+        // Skip the test, the most probable reason is
+        //     matrix too big for a device.
+
+        releaseMemObjects(bufA, bufX, bufXtemp );
+        deleteBuffers<T>(A, blasX, clblasX);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTBMV routine... ";
+    DataType type;   // runtime tag for T; anything not matched by the typeid chain below becomes TYPE_COMPLEX_DOUBLE
+    type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT:( typeid(T) == typeid(cl_double))? TYPE_DOUBLE:
+                                      ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+    // bufXtemp is passed to the wrapper in addition to bufX (presumably as scratch space - confirm).
+    err = (cl_int)clMath::clblas::tbmv(type, params->order, params->uplo, params->transA, params->diag, params->N, params->K,
+                                        bufA, params->offA, params->lda, bufX, params->offBX, params->incx, bufXtemp,
+                                        params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+    // Resources are freed before ASSERT_EQ because the assertion is already known to fail inside the branch.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufXtemp);
+        deleteBuffers<T>(A, blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TBMV() failed";   // records a fatal failure and returns from this function
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufXtemp);
+        deleteBuffers<T>(A, blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Blocking read (CL_TRUE) of the whole X buffer, from byte offset 0, back into clblasX.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0,
+        NULL, NULL);
+	if (err != CL_SUCCESS)
+	{
+		::std::cerr << "TBMV: Reading results failed...." << std::endl;   // NOTE(review): the failure is only logged; the comparison below still runs
+	}
+    // The vectors are compared as lengthX x 1 column-major matrices, starting offBX elements into each host array.
+    releaseMemObjects(bufA, bufX, bufXtemp);
+    compareMatrices<T>(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX),
+                       lengthX);
+    deleteBuffers<T>(A, blasX, clblasX);
+    delete[] events;
+}
+
+// Instantiate the test
+
+// TBMV correctness check with cl_float elements; parameters come from getParams().
+TEST_P(TBMV, stbmv) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    tbmvCorrectnessTest<cl_float>(&testArgs);
+}
+
+// TBMV correctness check with cl_double elements; parameters come from getParams().
+TEST_P(TBMV, dtbmv) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    tbmvCorrectnessTest<cl_double>(&testArgs);
+}
+
+// TBMV correctness check with FloatComplex elements; parameters come from getParams().
+TEST_P(TBMV, ctbmv) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    tbmvCorrectnessTest<FloatComplex>(&testArgs);
+}
+
+// TBMV correctness check with DoubleComplex elements; parameters come from getParams().
+TEST_P(TBMV, ztbmv) {
+    TestParams testArgs;
+    getParams(&testArgs);
+    tbmvCorrectnessTest<DoubleComplex>(&testArgs);
+}
diff --git a/src/tests/correctness/corr-tbsv.cpp b/src/tests/correctness/corr-tbsv.cpp
new file mode 100644
index 0000000..17c59f3
--- /dev/null
+++ b/src/tests/correctness/corr-tbsv.cpp
@@ -0,0 +1,242 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <tbsv.h>
+#include "trsv-delta.h"
+
+/*
+ * Release the OpenCL buffers used by the TBSV test.
+ * A NULL handle (a buffer that was never created) is skipped.
+ */
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX)
+{
+    const cl_mem buffers[] = { objA, objX };
+    const size_t count = sizeof(buffers) / sizeof(buffers[0]);
+
+    for (size_t i = 0; i < count; i++)
+        if (buffers[i] != NULL) clReleaseMemObject(buffers[i]);
+}
+
+/*
+ * Free the host-side arrays allocated by the TBSV correctness test.
+ *
+ *   A      - matrix storage
+ *   blasX  - vector processed by the reference BLAS routine
+ *   backX  - vector read back from the clBLAS run
+ *   deltaX - delta values used for the result comparison
+ *
+ * Each pointer must be NULL or come from new[]; delete[] on a NULL pointer
+ * is a no-op, so no NULL checks are needed.
+ */
+template <typename T>
+static void
+deleteBuffers(T *A, T *blasX, T *backX, cl_double *deltaX)
+{
+    delete[] A;
+    delete[] blasX;
+    delete[] backX;
+    delete[] deltaX;
+}
+
+/*
+ * xTBSV correctness test: runs the reference BLAS routine and clBLAS on the
+ * same random input and compares the result vectors using tbsvDelta() deltas.
+ * Skipped (SUCCEED) if T needs double precision the device lacks or a device
+ * buffer cannot be created; any later OpenCL error fails the test.
+ */
+template <typename T>
+void
+tbsvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *blasX, *clblasX;
+    cl_mem bufA, bufX;
+    cl_double *deltaX;
+    clMath::BlasBase *base;
+    cl_event *events;
+    size_t lengthX, lengthA;
+
+    base = clMath::BlasBase::getInstance();
+
+    if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    lengthA = params->N * params->lda;
+    lengthX = (params->N - 1) * abs(params->incx) + 1;
+
+    A       = new T[lengthA + params->offA];
+    blasX   = new T[lengthX + params->offBX];
+    clblasX = new T[lengthX + params->offBX];
+    deltaX  = new cl_double[lengthX + params->offBX];
+
+    srand(params->seed);
+    ::std::cerr << "Generating input data... ";
+
+    if ((A == NULL) || (blasX == NULL) || (clblasX == NULL) || (deltaX == NULL)) {
+        deleteBuffers<T>(A, blasX, clblasX, deltaX);
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped!!!!!!!!!!!!" << ::std::endl;
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+    memset(deltaX, 0, (lengthX + params->offBX) * sizeof(cl_double));
+    memset(blasX, 0, (lengthX + params->offBX) * sizeof(T));
+    memset(clblasX, 0, (lengthX + params->offBX) * sizeof(T));
+
+    randomTbsvMatrices<T>(params->order, params->uplo, params->diag, params->N, params->K,
+                          (A + params->offA), params->lda, (blasX + params->offBX), params->incx);
+
+    // Generate delta X for result comparison
+    tbsvDelta<T>(params->order, params->uplo, params->transA, params->diag, params->N, params->K,
+                 (A + params->offA), params->lda, (blasX + params->offBX), params->incx, (deltaX + params->offBX));
+
+    memcpy(clblasX, blasX, (lengthX + params->offBX) * sizeof(*blasX));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offA) * sizeof(*A), 0, CL_MEM_READ_WRITE);
+    bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX) * sizeof(*blasX), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xTBSV routine... ";
+
+    // The reference routine is always called column-major; remap row-major cases.
+    clblasOrder fOrder = params->order;
+    clblasTranspose fTrans = params->transA;
+    clblasUplo fUplo = params->uplo;
+    size_t fN = params->N, fK = params->K;
+
+    if (fOrder != clblasColumnMajor) {
+        fOrder = clblasColumnMajor;
+        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+        fUplo = (params->uplo == clblasLower)? clblasUpper : clblasLower;
+
+        if (params->transA == clblasConjTrans)
+            doConjugate((A + params->offA), params->N, params->lda, params->lda);
+    }
+
+    clMath::blas::tbsv(fOrder, fUplo, fTrans, params->diag, fN, fK, A, params->offA, params->lda, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL)) {
+        // Skip the test, the most probable reason is
+        //     matrix too big for a device.
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, clblasX, deltaX);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTBSV routine... ";
+    DataType type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT:( typeid(T) == typeid(cl_double))? TYPE_DOUBLE:
+                                      ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    err = (cl_int)clMath::clblas::tbsv(type, params->order, params->uplo, params->transA, params->diag, params->N, params->K,
+                                       bufA, params->offA, params->lda, bufX, params->offBX, params->incx,
+                                       params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+
+    // ASSERT_EQ returns from this function on failure, so clean up first.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, clblasX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TBSV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, clblasX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+                              (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0,
+                              NULL, NULL);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, clblasX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "TBSV: Reading results failed....";
+    }
+
+    releaseMemObjects(bufA, bufX);
+    compareMatrices<T>(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX),
+                       lengthX, (deltaX + params->offBX) );
+    deleteBuffers<T>(A, blasX, clblasX, deltaX);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(TBSV, stbsv) {
+    // TBSV correctness check instantiated for cl_float elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tbsvCorrectnessTest<cl_float>(&testParams);
+}
+
+TEST_P(TBSV, dtbsv) {
+    // TBSV correctness check instantiated for cl_double elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tbsvCorrectnessTest<cl_double>(&testParams);
+}
+
+TEST_P(TBSV, ctbsv) {
+    // TBSV correctness check instantiated for FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tbsvCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(TBSV, ztbsv) {
+    // TBSV correctness check instantiated for DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tbsvCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-tpmv.cpp b/src/tests/correctness/corr-tpmv.cpp
new file mode 100644
index 0000000..041154e
--- /dev/null
+++ b/src/tests/correctness/corr-tpmv.cpp
@@ -0,0 +1,252 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <tpmv.h>
+
+/*
+ * Release the three OpenCL buffers used by the TPMV test: the matrix,
+ * the X vector and the temporary X buffer.
+ * A NULL handle (a buffer that was never created) is skipped.
+ */
+static void
+releaseMemObjects(cl_mem objAP, cl_mem objX,  cl_mem objXTemp)
+{
+    const cl_mem buffers[] = { objAP, objX, objXTemp };
+    const size_t count = sizeof(buffers) / sizeof(buffers[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (buffers[i] != NULL)
+            clReleaseMemObject(buffers[i]);
+    }
+}
+
+/*
+ * Free the host-side arrays allocated by the TPMV correctness test.
+ *   AP      - matrix storage
+ *   blasX   - vector processed by the reference BLAS routine
+ *   clblasX - vector read back from the clBLAS run
+ *
+ * Each pointer must be NULL or come from new[]; delete[] on a NULL pointer
+ * is a no-op, so no NULL checks are needed.
+ */
+template <typename T> static void
+deleteBuffers(T *AP, T *blasX, T *clblasX)
+{
+    delete[] AP;
+    delete[] blasX;
+    delete[] clblasX;
+}
+
+/*
+ * xTPMV correctness test: runs the reference BLAS routine and clBLAS on the
+ * same random packed matrix and vector, then compares the result vectors.
+ *
+ * Skipped (SUCCEED) if T needs double precision the device lacks or a device
+ * buffer cannot be created. Any later OpenCL error, including a failed
+ * read-back of the result, fails the test.
+ */
+template <typename T>
+void
+tpmvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *AP, *blasX, *clblasX;
+    cl_mem bufAP, bufX, bufXTemp;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Packed storage holds one triangle of the N x N matrix.
+    size_t lengthAP = (params->N * (params->N + 1)) / 2;
+    size_t lengthX = (1 + ((params->N - 1) * abs(params->incx)));
+
+    AP      = new T[lengthAP + params->offa];
+    blasX   = new T[lengthX + params->offBX];
+    clblasX = new T[lengthX + params->offBX];
+
+    if ((AP == NULL) || (blasX == NULL) || (clblasX == NULL)) {
+        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(AP, blasX, clblasX);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Set data in A and X using populate() routine
+    int creationFlags = 0;
+    creationFlags =  creationFlags | RANDOM_INIT | PACKED_MATRIX;
+
+    // Default is Column-Major
+    creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
+    creationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY);
+    BlasRoutineID BlasFn = CLBLAS_TRMV;
+
+    // Populate A and blasX
+    populate( AP + params->offa, params-> N, params-> N, 0, BlasFn, creationFlags);
+    populate( blasX , (lengthX + params->offBX), 1, (lengthX + params->offBX), BlasFn);
+
+    // Copy blasX to clblasX
+    memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers
+    bufAP = base->createEnqueueBuffer(AP, (lengthAP + params->offa)* sizeof(*AP), 0, CL_MEM_READ_ONLY);
+    bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_WRITE_ONLY);
+    bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_ONLY);
+
+    ::std::cerr << "Calling reference xTPMV routine... ";
+
+    // The reference routine is always called column-major; remap row-major cases.
+    clblasOrder order = params->order;
+    clblasUplo fUplo = params->uplo;
+    clblasTranspose fTrans = params->transA;
+
+    if (order != clblasColumnMajor) {
+        order = clblasColumnMajor;
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+
+        if( params->transA == clblasConjTrans )
+            doConjugate( (AP +params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 );
+    }
+
+    ::clMath::blas::tpmv( order, fUplo, fTrans, params->diag, params->N, AP, params->offa, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufAP == NULL) || (bufX == NULL) || (bufXTemp == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufAP, bufX, bufXTemp);
+        deleteBuffers<T>(AP, blasX, clblasX);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTPMV routine... ";
+
+    DataType type;
+    type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE;
+
+    // bufXTemp is the extra buffer this routine takes in addition to X.
+    err = (cl_int)::clMath::clblas::tpmv( type, params->order, params->uplo, params->transA, params->diag, params->N, bufAP,
+                        params->offa, bufX, params->offBX, params->incx, bufXTemp, params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+
+    // ASSERT_EQ returns from this function on failure, so clean up first.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufXTemp);
+        deleteBuffers<T>(AP, blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TPMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufXTemp);
+        deleteBuffers<T>(AP, blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufAP, bufX, bufXTemp);
+        deleteBuffers<T>(AP, blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "TPMV: Reading results failed....";
+    }
+
+    releaseMemObjects(bufAP, bufX, bufXTemp);
+
+    compareMatrices<T>(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX),
+                       lengthX);
+    deleteBuffers<T>(AP, blasX, clblasX);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(TPMV, stpmv) {
+    // TPMV correctness check instantiated for cl_float elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tpmvCorrectnessTest<cl_float>(&testParams);
+}
+
+TEST_P(TPMV, dtpmv) {
+    // TPMV correctness check instantiated for cl_double elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tpmvCorrectnessTest<cl_double>(&testParams);
+}
+
+TEST_P(TPMV, ctpmv) {
+    // TPMV correctness check instantiated for FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tpmvCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(TPMV, ztpmv) {
+    // TPMV correctness check instantiated for DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tpmvCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-tpsv.cpp b/src/tests/correctness/corr-tpsv.cpp
new file mode 100644
index 0000000..931d821
--- /dev/null
+++ b/src/tests/correctness/corr-tpsv.cpp
@@ -0,0 +1,252 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <tpsv.h>
+#include <cltypes.h>
+
+#include "trsv-delta.h"
+
+/* Release the test's OpenCL buffers; NULL (never created) handles are skipped. */
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX)
+{
+    const cl_mem buffers[] = { objA, objX };
+    for (size_t i = 0; i < sizeof(buffers) / sizeof(buffers[0]); i++)
+        if (buffers[i] != NULL) clReleaseMemObject(buffers[i]);
+}
+
+/*
+ * Free the host-side arrays allocated by the TPSV correctness test.
+ *
+ *   A      - matrix storage
+ *   blasX  - vector processed by the reference BLAS routine
+ *   backX  - vector read back from the clBLAS run
+ *   deltaX - delta values used for the result comparison
+ *
+ * Each pointer must be NULL or come from new[]; delete[] on a NULL pointer
+ * is a no-op, so no NULL checks are needed.
+ */
+template <typename T>
+static void
+deleteBuffers(T *A, T *blasX, T *backX, cl_double *deltaX)
+{
+    delete[] A;
+    delete[] blasX;
+    delete[] backX;
+    delete[] deltaX;
+}
+
+/*
+ * xTPSV correctness test: runs the reference BLAS routine and clBLAS on the
+ * same random packed matrix and vector, then compares the result vectors
+ * using the per-element deltas computed by trsvDelta().
+ *
+ * Skipped (SUCCEED) if T needs double precision the device lacks or a device
+ * buffer cannot be created. Any later OpenCL error, including a failed
+ * read-back of the result, fails the test.
+ */
+template <typename T>
+void
+tpsvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *blasX, *backX;
+    cl_double *deltaX;
+    cl_mem bufA, bufX;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    // Packed storage holds one triangle of the N x N matrix.
+    size_t lengthA = (params->N * (params->N + 1)) / 2;
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+    // All X-sized host arrays and the X buffer include the leading offBX elements.
+    size_t totalX = lengthX + params->offBX;
+
+    A      = new T[lengthA + params->offa];
+    blasX  = new T[totalX];
+    backX  = new T[totalX];
+    deltaX = new cl_double[totalX];
+
+    if ((A==NULL) || (blasX == NULL) || (backX == NULL) || (deltaX == NULL)) {
+        ::std::cerr << "Unable to allocate matrices in Host memory" << std::endl;
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        SUCCEED();
+        return;
+    }
+
+    // Zero the whole arrays, offset included, so that no element of the
+    // X-sized arrays that is uploaded or compared later is uninitialized.
+    memset(deltaX, 0, totalX * sizeof(cl_double));
+    memset(blasX, 0, totalX * sizeof(T));
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    //custom generation function in blas-random.h
+    randomTrsvMatrices<T>( params->order, params->uplo, params->diag, params->N, (A + params->offa), 0, (blasX + params->offBX), params->incx);
+
+    // Generate delta X for result comparison
+    trsvDelta<T>( params->order, params->uplo, params->transA, params->diag, params->N, (A + params->offa), 0, (blasX + params->offBX), params->incx, (deltaX + params->offBX) );
+
+    // Copy blasX to backX
+    memcpy(backX, blasX, totalX * sizeof(T));
+
+    // Allocate buffers. X is both input and output of the routine, hence READ_WRITE.
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(T), 0, CL_MEM_READ_ONLY);
+    bufX = base->createEnqueueBuffer(backX, totalX * sizeof(T), 0, CL_MEM_READ_WRITE);
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xTPSV routine... ";
+
+    // The reference routine is always called column-major; remap row-major cases.
+    clblasOrder order = params->order;
+    clblasUplo fUplo = params->uplo;
+    clblasTranspose fTrans = params->transA;
+
+    if (order != clblasColumnMajor) {
+        order = clblasColumnMajor;
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+
+        if( params->transA == clblasConjTrans )
+            doConjugate((A + params->offa), 1, lengthA, 1);
+    }
+    ::clMath::blas::tpsv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTPSV routine... ";
+
+    DataType type;
+    type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    err = (cl_int)::clMath::clblas::tpsv(type, params->order, params->uplo, params->transA, params->diag, params->N, bufA,
+                        params->offa, bufX, params->offBX, params->incx, params->numCommandQueues, base->commandQueues(),
+                        0, NULL, events);
+
+    // ASSERT_EQ returns from this function on failure, so every error path
+    // releases the device buffers and host arrays before asserting.
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TPSV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Read back the whole buffer: the result starts offBX elements in.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        totalX * sizeof(*backX), backX, 0,
+        NULL, NULL);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "TPSV: Reading results failed....";
+    }
+
+    releaseMemObjects(bufA, bufX);
+
+    // Compare only the vector itself, skipping the leading offset.
+    compareMatrices<T>( clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (backX + params->offBX),
+                       lengthX, (deltaX + params->offBX) );
+    deleteBuffers<T>(A, blasX, backX, deltaX);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(TPSV, stpsv) {
+    // TPSV correctness check instantiated for cl_float elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tpsvCorrectnessTest<cl_float>(&testParams);
+}
+
+TEST_P(TPSV, dtpsv) {
+    // TPSV correctness check instantiated for cl_double elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tpsvCorrectnessTest<cl_double>(&testParams);
+}
+
+TEST_P(TPSV, ctpsv) {
+    // TPSV correctness check instantiated for FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tpsvCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(TPSV, ztpsv) {
+    // TPSV correctness check instantiated for DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    tpsvCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-trmm.cpp b/src/tests/correctness/corr-trmm.cpp
new file mode 100644
index 0000000..c92e0e3
--- /dev/null
+++ b/src/tests/correctness/corr-trmm.cpp
@@ -0,0 +1,215 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <trmm.h>
+
+#include "tcase-filter.h"
+
+/* Release both OpenCL buffers; a NULL handle (never created) is skipped. */
+static void releaseMemObjects(cl_mem A, cl_mem B)
+{
+    if (A != NULL) clReleaseMemObject(A);
+    if (B != NULL) clReleaseMemObject(B);
+}
+
+// Frees the three host-side arrays used by the TRMM correctness test.
+template <typename T>
+static void deleteBuffers(T *A, T *blasB, T *clblasB)
+{
+    T *owned[] = { A, blasB, clblasB };
+    for (size_t i = 0; i < 3; i++) delete[] owned[i];
+}
+
+/*
+ * xTRMM correctness test: compares the B matrix computed by clBLAS with the
+ * one computed by the reference BLAS routine from the same random input.
+ * Every failed OpenCL call, including the final read-back, fails the test.
+ */
+template <typename T>
+void
+trmmCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *blasB, *clblasB;
+    T alpha;
+    cl_mem bufA, bufB;
+    clMath::BlasBase *base;
+    cl_event *events;
+    bool useAlpha, isComplex;
+
+    base = clMath::BlasBase::getInstance();
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    isComplex = ((typeid(T) == typeid(FloatComplex)) ||
+                 (typeid(T) == typeid(DoubleComplex)));
+    if (canCaseBeSkipped(params, isComplex)) {
+        std::cerr << ">> Test is skipped because it has no importance for this "
+                     "level of coverage" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    useAlpha = base->useAlpha();
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    A = new T[params->rowsA * params->columnsA];
+    blasB = new T[params->rowsB * params->columnsB];
+    clblasB = new T[params->rowsB * params->columnsB];
+    alpha = ZERO<T>();
+
+    srand(params->seed);
+    if (useAlpha) {
+        alpha = convertMultiplier<T>(params->alpha);
+    }
+
+    ::std::cerr << "Generating input data... ";
+    randomTrmmMatrices<T>(params->order, params->side, params->uplo,
+        params->diag, params->M, params->N, useAlpha,
+        &alpha, A, params->lda, blasB, params->ldb);
+    memcpy(clblasB, blasB, params->rowsB * params->columnsB * sizeof(*blasB));
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xTRMM routine... ";
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::trmm(clblasColumnMajor, params->side, params->uplo,
+            params->transA, params->diag, params->M, params->N, alpha,
+            A, params->lda, blasB, params->ldb);
+    } else {
+        // Reference is called column-major only: reorder inputs, run, reorder the result back.
+        T *reorderedA = new T[params->rowsA * params->columnsA];
+        T *reorderedB = new T[params->rowsB * params->columnsB];
+
+        reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA);
+        reorderMatrix<T>(clblasRowMajor, params->rowsB, params->columnsB, blasB, reorderedB);
+        ::clMath::blas::trmm(clblasColumnMajor, params->side, params->uplo,
+            params->transA, params->diag, params->M, params->N, alpha,
+            reorderedA, params->rowsA, reorderedB, params->rowsB);
+        reorderMatrix<T>(clblasColumnMajor, params->rowsB, params->columnsB, reorderedB, blasB);
+
+        delete[] reorderedB;
+        delete[] reorderedA;
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A),
+                                     params->offA * sizeof(*A), CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(clblasB, params->rowsB * params->columnsB * sizeof(*clblasB),
+                                     params->offBX * sizeof(*clblasB), CL_MEM_READ_WRITE);
+    if ((bufA == NULL) || (bufB == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, blasB, clblasB);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTRMM routine... ";
+    err = (cl_int)::clMath::clblas::trmm(params->order, params->side,
+        params->uplo, params->transA, params->diag, params->M, params->N,
+        alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb,
+        params->numCommandQueues, base->commandQueues(), 0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, blasB, clblasB);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRMM() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, blasB, clblasB);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE,
+                              params->offBX * sizeof(*clblasB),
+                              params->rowsB * params->columnsB * sizeof(*clblasB),
+                              clblasB, 0, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, blasB, clblasB);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "TRMM: Reading results failed....";
+    }
+
+    releaseMemObjects(bufA, bufB);
+    compareMatrices<T>(params->order, params->M, params->N, blasB, clblasB, params->ldb);
+    deleteBuffers<T>(A, blasB, clblasB);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(TRMM, strmm) {
+    // TRMM correctness check instantiated for cl_float elements.
+    TestParams testParams;
+    getParams(&testParams);
+    trmmCorrectnessTest<cl_float>(&testParams);
+}
+
+TEST_P(TRMM, dtrmm) {
+    // TRMM correctness check instantiated for cl_double elements.
+    TestParams testParams;
+    getParams(&testParams);
+    trmmCorrectnessTest<cl_double>(&testParams);
+}
+
+TEST_P(TRMM, ctrmm) {
+    // TRMM correctness check instantiated for FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    trmmCorrectnessTest<FloatComplex>(&testParams);
+}
+
+TEST_P(TRMM, ztrmm) {
+    // TRMM correctness check instantiated for DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    trmmCorrectnessTest<DoubleComplex>(&testParams);
+}
diff --git a/src/tests/correctness/corr-trmv.cpp b/src/tests/correctness/corr-trmv.cpp
new file mode 100644
index 0000000..95089fc
--- /dev/null
+++ b/src/tests/correctness/corr-trmv.cpp
@@ -0,0 +1,258 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <trmv.h>
+
+// Release the device buffers used by the TRMV test; NULL handles are skipped.
+static void
+releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objXTemp)
+{
+    cl_mem handles[] = { objA, objX, objXTemp };
+    const size_t count = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t idx = 0; idx < count; idx++) {
+        // a NULL handle is not passed to OpenCL at all
+        if (handles[idx] == NULL) {
+            continue;
+        }
+        // release order: A, X, then the scratch buffer
+        clReleaseMemObject(handles[idx]);
+    }
+}
+
+// Free the host-side arrays allocated by the TRMV correctness test.
+//
+//   A       - host copy of the matrix
+//   blasX   - vector processed by the reference BLAS
+//   clblasX - vector processed by clBLAS
+//
+// Null pointers are accepted: delete[] on a null pointer does nothing,
+// so no explicit NULL checks are needed here.
+template <typename T>
+static void
+deleteBuffers(T *A, T *blasX, T *clblasX)
+{
+    delete[] A;
+    delete[] blasX;
+    delete[] clblasX;
+}
+
+// Correctness test for xTRMV.
+//
+// Fills A and X with generated data, runs the reference
+// ::clMath::blas::trmv() on the host and ::clMath::clblas::trmv() on the
+// device, then compares the two result vectors with compareMatrices().
+// The case is reported as skipped (SUCCEED) when double precision is not
+// supported by the device or when a device buffer cannot be created.
+// params - description of the test case (sizes, offsets, flags, seed).
+template <typename T>
+void
+trmvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *blasX, *clblasX;
+    cl_mem bufA, bufX, bufXTemp;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	printf("number of command queues : %d\n\n", params->numCommandQueues);
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t lengthA = params->N * params->lda;
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+
+    A 		= new T[lengthA + params->offa ];
+    blasX 	= new T[lengthX + params->offBX ];
+    clblasX 	= new T[lengthX + params->offBX ];
+
+	if((A == NULL) || (blasX == NULL) || (clblasX == NULL))
+	{
+		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
+        deleteBuffers<T>(A, blasX, clblasX);
+		delete[] events;
+		SUCCEED();
+        return;
+	}
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+    // Set data in A and X using populate() routine
+    int creationFlags = 0;
+    creationFlags =  creationFlags | RANDOM_INIT;
+
+    // Default is Column-Major
+    creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
+    creationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY);
+	BlasRoutineID BlasFn = CLBLAS_TRMV;
+
+    // Populate A and blasX
+    populate( A + params->offa, params-> N, params-> N, params-> lda, BlasFn, creationFlags);
+    populate( blasX , (lengthX + params->offBX), 1, (lengthX + params->offBX), BlasFn);
+
+    // Copy blasX to clblasX
+    memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX));
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Allocate buffers. TRMV updates X in place and writes to the scratch
+    // buffer, so both must be readable and writable by the kernel.
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(*A), 0, CL_MEM_READ_ONLY);
+    bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_READ_WRITE);
+    bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_WRITE);
+
+    ::std::cerr << "Calling reference xTRMV routine... ";
+
+	clblasOrder order;
+    clblasUplo fUplo;
+    clblasTranspose fTrans;
+
+	order = params->order;
+    fUplo = params->uplo;
+    fTrans = params->transA;
+
+	if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+
+        if( params->transA == clblasConjTrans )
+            doConjugate( (A + params->offa), params->N, params->N, params->lda );
+    }
+
+	::clMath::blas::trmv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, params->lda, blasX, params->offBX, params->incx);
+    ::std::cerr << "Done" << ::std::endl;
+
+    if ((bufA == NULL) || (bufX == NULL) || (bufXTemp == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufX, bufXTemp);
+        deleteBuffers<T>(A, blasX, clblasX);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTRMV routine... ";
+
+    DataType type;
+    type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE;
+
+    // Should use bufXTemp as well
+    err = (cl_int)::clMath::clblas::trmv( type, params->order, params->uplo, params->transA, params->diag, params->N, bufA,
+    					params->offa, params->lda, bufX, params->offBX, params->incx, bufXTemp, params->numCommandQueues, base->commandQueues(),
+    					0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufXTemp);
+        deleteBuffers<T>(A, blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRMV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX, bufXTemp);
+        deleteBuffers<T>(A, blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Read back the whole X buffer, including the offBX prefix.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0,
+        NULL, NULL);
+    releaseMemObjects(bufA, bufX, bufXTemp);
+    if (err != CL_SUCCESS) {
+        // Without the device result a comparison would be meaningless.
+        ::std::cerr << "TRMV: Reading results failed...." << std::endl;
+        deleteBuffers<T>(A, blasX, clblasX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+    }
+
+    // X is compared as a single column of lengthX elements, skipping offBX.
+    compareMatrices<T>(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX),
+                       lengthX);
+    deleteBuffers<T>(A, blasX, clblasX);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(TRMV, strmv) {
+    TestParams testCase;
+    // single-precision real variant
+    getParams(&testCase);
+    trmvCorrectnessTest<cl_float>(&testCase);
+}
+
+TEST_P(TRMV, dtrmv) {
+    TestParams testCase;
+    // double-precision real variant
+    getParams(&testCase);
+    trmvCorrectnessTest<cl_double>(&testCase);
+}
+
+TEST_P(TRMV, ctrmv) {
+    TestParams testCase;
+    // single-precision complex variant
+    getParams(&testCase);
+    trmvCorrectnessTest<FloatComplex>(&testCase);
+}
+
+TEST_P(TRMV, ztrmv) {
+    TestParams testCase;
+    // double-precision complex variant
+    getParams(&testCase);
+    trmvCorrectnessTest<DoubleComplex>(&testCase);
+}
diff --git a/src/tests/correctness/corr-trsm.cpp b/src/tests/correctness/corr-trsm.cpp
new file mode 100644
index 0000000..e53331c
--- /dev/null
+++ b/src/tests/correctness/corr-trsm.cpp
@@ -0,0 +1,454 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <trsm.h>
+
+#include "trsm-delta.h"
+#include "tcase-filter.h"
+
+// Release both TRSM device buffers; NULL handles (failed creation) are skipped.
+static void releaseMemObjects(cl_mem A, cl_mem B)
+{
+    if (A != NULL) { clReleaseMemObject(A); }
+    if (B != NULL) { clReleaseMemObject(B); }
+}
+
+template <typename T> static void   // free all host arrays; NULLs are fine
+deleteBuffers(T *A, T *B, T *blasB, T *clblasB, cl_double *delta)
+{
+    T *matrices[] = { A, B, blasB, clblasB };
+    for (size_t k = 0; k < 4; k++) {
+        delete[] matrices[k];
+    }
+    delete[] delta;
+}
+
+// Correctness test for xTRSM: solves the same generated system with the
+// reference BLAS (host) and with clBLAS (device) and compares both results
+// using the per-element tolerances computed by trsmDelta(). Cases are skipped
+// for unsupported double precision, filtered cases and buffer failures.
+template <typename T>
+void trsmCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *B, *blasB, *clblasB;
+    T alpha;
+    cl_mem bufA, bufB;
+    cl_double *delta;
+    clMath::BlasBase *base;
+    bool useAlpha, isComplex;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    isComplex = ((typeid(T) == typeid(FloatComplex)) ||
+                 (typeid(T) == typeid(DoubleComplex)));
+    if (canCaseBeSkipped(params, isComplex)) {
+        std::cerr << ">> Test is skipped because it has no importance for this "
+                     "level of coverage" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    useAlpha = base->useAlpha();
+    alpha = ZERO<T>();
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    A = new T[params->rowsA * params->columnsA];
+    B = new T[params->rowsB * params->columnsB];
+    blasB = new T[params->rowsB * params->columnsB];
+    clblasB = new T[params->rowsB * params->columnsB];
+    delta = new cl_double[params->rowsB * params->columnsB];
+
+    srand(params->seed);
+    if (useAlpha) {
+        alpha = convertMultiplier<T>(params->alpha);
+    }
+
+    ::std::cerr << "Generating input data... ";
+    randomTrsmMatrices<T>(params->order, params->side, params->uplo,
+        params->diag, params->M, params->N, useAlpha,
+        &alpha, A, params->lda, B, params->ldb);
+
+    memcpy(blasB, B, params->rowsB * params->columnsB * sizeof(*B));
+    memcpy(clblasB, B, params->rowsB * params->columnsB * sizeof(*B));
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xTRSM routine... ";
+    if (params->order == clblasColumnMajor) {
+        ::clMath::blas::trsm(clblasColumnMajor, params->side, params->uplo,
+            params->transA, params->diag, params->M, params->N, alpha, A,
+            params->lda, blasB, params->ldb);
+    }
+    else {
+        T *reorderedA = new T[params->rowsA * params->columnsA];
+        T *reorderedB = new T[params->rowsB * params->columnsB];
+
+        reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA,
+                         A, reorderedA);
+        reorderMatrix<T>(clblasRowMajor, params->rowsB, params->columnsB,
+                         blasB, reorderedB);
+
+        ::clMath::blas::trsm(clblasColumnMajor, params->side, params->uplo,
+            params->transA, params->diag, params->M, params->N, alpha,
+            reorderedA, params->rowsA, reorderedB, params->rowsB);
+
+        reorderMatrix<T>(clblasColumnMajor, params->rowsB, params->columnsB,
+            reorderedB, blasB);
+
+        delete[] reorderedB;
+        delete[] reorderedA;
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
+                                     sizeof(*A), params->offA * sizeof(*A),
+                                     CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(clblasB, params->rowsB * params->columnsB *
+                                     sizeof(*clblasB),
+                                     params->offBX * sizeof(*clblasB),
+                                     CL_MEM_READ_WRITE);
+    if ((bufA == NULL) || (bufB == NULL)) {
+        // Skip the test: most probably the matrix is too big for a device.
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, B, blasB, clblasB, delta);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTRSM routine... ";
+    err = (cl_int)::clMath::clblas::trsm(params->order, params->side,
+        params->uplo, params->transA, params->diag, params->M, params->N,
+        alpha, bufA, params->offA, params->lda, bufB, params->offBX,
+        params->ldb, params->numCommandQueues, base->commandQueues(),
+        0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, B, blasB, clblasB, delta);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRSM() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, B, blasB, clblasB, delta);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE,
+                        params->offBX * sizeof(*clblasB),
+                        params->rowsB * params->columnsB * sizeof(*clblasB),
+                        clblasB, 0, NULL, NULL);
+    releaseMemObjects(bufA, bufB);
+    if (err != CL_SUCCESS) {
+        deleteBuffers<T>(A, B, blasB, clblasB, delta);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+    }
+
+    trsmDelta<T>(params->order, params->side, params->uplo, params->transA,
+        params->diag, params->M, params->N, A, params->lda, B, params->ldb,
+        alpha, delta);
+    compareMatrices<T>(params->order, params->M, params->N, blasB, clblasB,
+                       params->ldb, delta);
+    deleteBuffers<T>(A, B, blasB, clblasB, delta);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(TRSM, strsm) {
+    TestParams testCase;
+    // single-precision real variant
+    getParams(&testCase);
+    trsmCorrectnessTest<cl_float>(&testCase);
+}
+
+TEST_P(TRSM, dtrsm) {
+    TestParams testCase;
+    // double-precision real variant
+    getParams(&testCase);
+    trsmCorrectnessTest<cl_double>(&testCase);
+}
+
+TEST_P(TRSM, ctrsm) {
+    TestParams testCase;
+    // single-precision complex variant
+    getParams(&testCase);
+    trsmCorrectnessTest<FloatComplex>(&testCase);
+}
+
+TEST_P(TRSM, ztrsm) {
+    TestParams testCase;
+    // double-precision complex variant
+    getParams(&testCase);
+    trsmCorrectnessTest<DoubleComplex>(&testCase);
+}
+
+
+
+// ====================================
+// Adding some tests to catch bugs in the scenario where lda != M
+
+
+int arithsum(int i)	// sum of the integers 1..i; i below 2 is returned unchanged
+{
+	int total = i;
+	while (--i > 0)
+		total += i;
+	return total;
+}
+
+// Set element A[i*ld + j] of the TRSM test matrix: j+1 when j == i, 1 when
+// j < i and 0 when j > i. Complex variants store it in s[0] and zero s[1].
+template <typename T>
+void AssignA(T *A, size_t i, size_t j, size_t ld)
+{
+	A[i*ld + j] = (j == i) ? (j+1) : ((j < i) ? 1.0 : 0);
+}
+
+template <>
+void AssignA(FloatComplex *A, size_t i, size_t j, size_t ld)
+{
+	A[i*ld + j].s[0] = (j == i) ? (j+1) : ((j < i) ? 1.0 : 0);
+	A[i*ld + j].s[1] = 0;
+}
+
+template <>
+void AssignA(DoubleComplex *A, size_t i, size_t j, size_t ld)
+{
+	A[i*ld + j].s[0] = (j == i) ? (j+1) : ((j < i) ? 1.0 : 0);
+	A[i*ld + j].s[1] = 0;
+}
+
+// Set element B[i*ld + j] of the TRSM right-hand side. The stored value is
+// arithsum(M) - arithsum(j+1) + (j+1)*(j+1); it does not depend on i.
+template <typename T>
+void AssignB(T *B, size_t i, size_t j, size_t ld, size_t M)
+{
+	B[i*ld + j] = arithsum(M) - arithsum(j+1) + (j+1)*(j+1);
+}
+
+template <>
+void AssignB(FloatComplex *B, size_t i, size_t j, size_t ld, size_t M)
+{
+	B[i*ld + j].s[0] = arithsum(M) - arithsum(j+1) + (j+1)*(j+1);
+	B[i*ld + j].s[1] = 0;
+}
+
+template <>
+void AssignB(DoubleComplex *B, size_t i, size_t j, size_t ld, size_t M)
+{
+	B[i*ld + j].s[0] = arithsum(M) - arithsum(j+1) + (j+1)*(j+1);
+	B[i*ld + j].s[1] = 0;
+}
+
+template <typename T>	// scalar case: x and y must agree within d
+void local_assert(T x, T y, T d)
+{
+	ASSERT_NEAR(x, y, d);
+}
+// Complex variants check component s[0] first, then s[1], each with its own tolerance.
+template <>
+void local_assert<FloatComplex>(FloatComplex x, FloatComplex y, FloatComplex d)
+{
+	ASSERT_NEAR(x.s[0], y.s[0], d.s[0]);	// a fatal failure here returns before s[1] is checked
+	ASSERT_NEAR(x.s[1], y.s[1], d.s[1]);
+}
+// Same check for the double precision complex type.
+template <>
+void local_assert<DoubleComplex>(DoubleComplex x, DoubleComplex y, DoubleComplex d)
+{
+	ASSERT_NEAR(x.s[0], y.s[0], d.s[0]);	// a fatal failure here returns before s[1] is checked
+	ASSERT_NEAR(x.s[1], y.s[1], d.s[1]);
+}
+
+
+// Extra xTRSM check for leading dimensions larger than the matrix size:
+// solves a small left/upper/non-unit column-major system with the reference
+// BLAS and with clBLAS and requires all N*ldb elements to agree within delta.
+template <typename T>
+void Extratest(size_t M, size_t N, size_t lda, size_t ldb, T alpha, T delta)
+{
+	T *A, *B, *blasB, *clblasB;
+	cl_mem bufA, bufB;
+	clMath::BlasBase *base;
+    cl_event *events;
+	cl_int err;
+
+	base = clMath::BlasBase::getInstance();
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+	clblasOrder order = clblasColumnMajor;
+	clblasSide side = clblasLeft;
+	clblasUplo uplo = clblasUpper;
+	clblasTranspose trans = clblasNoTrans;
+	clblasDiag diag = clblasNonUnit;
+
+	A = new T[M * lda];
+    B = new T[N * ldb];
+    blasB = new T[N * ldb];
+    clblasB = new T[N * ldb];
+
+	memset(A, 0, M*lda*sizeof(T));
+	memset(B, 0, N*ldb*sizeof(T));
+
+	for (size_t i = 0; i < M; i++) {		// down each column
+		for (size_t j = 0; j < M; j++) {	// down each row
+			AssignA<T>(A, i, j, lda);
+		}
+	}
+
+	for (size_t i = 0; i < N; i++) {		// down each column
+		for (size_t j = 0; j < M; j++) {	// down each row
+			AssignB<T>(B, i, j, ldb, M);
+		}
+	}
+
+    memcpy(blasB, B, N*ldb*sizeof(T));
+    memcpy(clblasB, B, N*ldb*sizeof(T));
+
+	::std::cerr << "Calling reference xTRSM routine... ";
+	::clMath::blas::trsm(order, side, uplo, trans, diag, M, N, alpha, A, lda, blasB, ldb);
+	::std::cerr << "Done" << ::std::endl;
+
+    bufA = base->createEnqueueBuffer(A, M*lda*sizeof(T), 0, CL_MEM_READ_ONLY);
+    bufB = base->createEnqueueBuffer(clblasB, N*ldb*sizeof(T), 0, CL_MEM_READ_WRITE);
+
+    events = new cl_event[1];
+    memset(events, 0, sizeof(cl_event));
+
+    if ((bufA == NULL) || (bufB == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, B, blasB, clblasB, NULL);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTRSM routine... ";
+    err = (cl_int)::clMath::clblas::trsm(order, side, uplo, trans, diag, M, N, alpha, bufA, 0, lda, bufB, 0, ldb,
+				1, base->commandQueues(), 0, NULL, events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, B, blasB, clblasB, NULL);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRSM() failed";
+    }
+
+    err = waitForSuccessfulFinish(1, base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufB);
+        deleteBuffers<T>(A, B, blasB, clblasB, NULL);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE,
+                        0, N*ldb*sizeof(T), clblasB, 0, NULL, NULL);
+    releaseMemObjects(bufA, bufB);
+    if (err != CL_SUCCESS) {
+        deleteBuffers<T>(A, B, blasB, clblasB, NULL);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+    }
+
+	// Validate the answer, padding rows up to ldb included
+	for (size_t i = 0; i < N; i++) {		// down each column
+		for (size_t j = 0; j < ldb; j++) {	// down each row
+			local_assert(blasB[i*ldb + j], clblasB[i*ldb + j], delta);
+		}
+	}
+
+	deleteBuffers<T>(A, B, blasB, clblasB, NULL);
+    delete[] events;
+}
+
+#define ETST_TOLERENCE 1E-5
+
+// Every case below uses M = 5, N = 2 and lda = ldb = 32, so both leading
+// dimensions are larger than M.
+TEST(TRSM_extratest, strsm) {
+	Extratest<float>(5, 2, 32, 32, 1.0f, ETST_TOLERENCE);
+}
+
+TEST(TRSM_extratest, dtrsm) {
+	Extratest<double>(5, 2, 32, 32, 1.0, ETST_TOLERENCE);
+}
+
+TEST(TRSM_extratest, ctrsm) {
+	// the same tolerance is applied to both components
+	const FloatComplex tolerance = floatComplex(ETST_TOLERENCE, ETST_TOLERENCE);
+	const FloatComplex unit = floatComplex(1.0f, 0);
+	Extratest<FloatComplex>(5, 2, 32, 32, unit, tolerance);
+}
+
+TEST(TRSM_extratest, ztrsm) {
+	// the same tolerance is applied to both components
+	const DoubleComplex tolerance = doubleComplex(ETST_TOLERENCE, ETST_TOLERENCE);
+	const DoubleComplex unit = doubleComplex(1.0, 0);
+	Extratest<DoubleComplex>(5, 2, 32, 32, unit, tolerance);
+}
\ No newline at end of file
diff --git a/src/tests/correctness/corr-trsv.cpp b/src/tests/correctness/corr-trsv.cpp
new file mode 100644
index 0000000..b0ed0ca
--- /dev/null
+++ b/src/tests/correctness/corr-trsv.cpp
@@ -0,0 +1,252 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <trsv.h>
+#include <cltypes.h>
+
+#include "trsv-delta.h"
+
+// Release the TRSV device buffers; a NULL handle is silently skipped.
+static void releaseMemObjects(cl_mem objA, cl_mem objX)
+{
+    cl_mem handles[2] = { objA, objX };
+    for (int k = 0; k < 2; k++) {
+        if (handles[k] != NULL) { clReleaseMemObject(handles[k]); }
+    }
+}
+
+// Free the host-side arrays allocated by the TRSV correctness test.
+//
+//   A      - host copy of the matrix
+//   blasX  - vector handed to the reference BLAS
+//   backX  - vector exchanged with the device
+//   deltaX - per-element tolerances used for the comparison
+//
+// Null pointers are accepted because delete[] ignores them, so the
+// arrays can be released unconditionally and without any explicit
+// NULL checks.
+template <typename T>
+static void
+deleteBuffers(T *A, T *blasX, T *backX, cl_double *deltaX)
+{
+    // the releases are independent of each other
+    delete[] A;
+    delete[] blasX;
+    delete[] backX;
+    delete[] deltaX;
+}
+
+// Correctness test for xTRSV.
+//
+// Generates A and X, computes per-element tolerances with trsvDelta(), runs
+// the reference ::clMath::blas::trsv() on the host and
+// ::clMath::clblas::trsv() on the device and compares both solutions.
+// The case is reported as skipped (SUCCEED) when double precision is not
+// supported by the device or when a device buffer cannot be created.
+template <typename T>
+void
+trsvCorrectnessTest(TestParams *params)
+{
+    cl_int err;
+    T *A, *blasX, *backX;
+	cl_double *deltaX;
+    cl_mem bufA, bufX;
+    clMath::BlasBase *base;
+    cl_event *events;
+
+    base = clMath::BlasBase::getInstance();
+
+    if ((typeid(T) == typeid(cl_double) ||
+         typeid(T) == typeid(DoubleComplex)) &&
+        !base->isDevSupportDoublePrecision()) {
+
+        std::cerr << ">> WARNING: The target device doesn't support native "
+                     "double precision floating point arithmetic" <<
+                     std::endl << ">> Test skipped" << std::endl;
+        SUCCEED();
+        return;
+    }
+
+    events = new cl_event[params->numCommandQueues];
+    memset(events, 0, params->numCommandQueues * sizeof(cl_event));
+
+    size_t lengthA = params->N * params->lda;
+    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
+
+    A 		= new T[lengthA + params->offa];
+    blasX 	= new T[lengthX + params->offBX];
+    backX 	= new T[lengthX + params->offBX];
+	deltaX	= new cl_double[lengthX + params->offBX];
+
+	if ((A==NULL) || (blasX == NULL) || (backX == NULL) || (deltaX == NULL))
+	{
+		::std::cerr << "Unable to allocate matrices in Host memory" << std::endl;
+		deleteBuffers<T>(A, blasX, backX, deltaX);
+		delete[] events;
+		SUCCEED();
+		return;
+	}
+	// Clear the whole arrays, offBX prefix included, since all of it is copied later.
+	memset( deltaX, 0, (lengthX + params->offBX)*sizeof(cl_double) );
+	memset( blasX, 0, (lengthX + params->offBX)*sizeof(T) );
+
+    srand(params->seed);
+
+    ::std::cerr << "Generating input data... ";
+
+	//custom generation function in blas-random.h
+	randomTrsvMatrices<T>( params->order, params->uplo, params->diag, params->N, (A + params->offa), params->lda, (blasX + params->offBX), params->incx);
+
+	// Generate delta X for result comparison
+	trsvDelta<T>( params->order, params->uplo, params->transA, params->diag, params->N, (A + params->offa), params->lda, (blasX + params->offBX), params->incx, (deltaX + params->offBX) );
+
+    // Copy blasX to backX
+    memcpy(backX, blasX, (lengthX + params->offBX) * sizeof(T));
+	// Allocate buffers. TRSV overwrites X with the solution in place, so the
+	// kernel needs both read and write access to it.
+    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(T), 0, CL_MEM_READ_ONLY);
+    bufX = base->createEnqueueBuffer(backX, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_WRITE);
+    ::std::cerr << "Done" << ::std::endl;
+
+    ::std::cerr << "Calling reference xTRSV routine... ";
+
+    clblasOrder order;
+    clblasUplo fUplo;
+    clblasTranspose fTrans;
+
+    order = params->order;
+    fUplo = params->uplo;
+    fTrans = params->transA;
+
+    if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;
+        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+
+        if( params->transA == clblasConjTrans )
+            doConjugate((A + params->offa), params->N, params->N, params->lda );
+    }
+	::clMath::blas::trsv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, params->lda, blasX, params->offBX, params->incx);
+	::std::cerr << "Done" << ::std::endl;
+
+	/*
+	printf("\n\n acml result X\n");
+	printf("\nblasX\n");
+	printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, blasX);*/
+
+    if ((bufA == NULL) || (bufX == NULL)) {
+        /* Skip the test, the most probable reason is
+         *     matrix too big for a device.
+         */
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+            << ::std::endl
+            << ">> Can't execute the test, because data is not transfered to GPU."
+            << ::std::endl
+            << ">> Test skipped." << ::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    ::std::cerr << "Calling clblas xTRSV routine... ";
+
+    DataType type;
+    type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    // bufX carries the right-hand side in and the solution out
+    err = (cl_int)::clMath::clblas::trsv(type, params->order, params->uplo, params->transA, params->diag, params->N, bufA,
+    					params->offa, params->lda, bufX, params->offBX, params->incx, params->numCommandQueues, base->commandQueues(),
+    					0, NULL, events);
+
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRSV() failed";
+    }
+
+    err = waitForSuccessfulFinish(params->numCommandQueues,
+        base->commandQueues(), events);
+    if (err != CL_SUCCESS) {
+        releaseMemObjects(bufA, bufX);
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
+    }
+    ::std::cerr << "Done" << ::std::endl;
+
+    // Read back the whole X buffer, including the offBX prefix.
+    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
+        (lengthX + params->offBX) * sizeof(*backX), backX, 0,
+        NULL, NULL);
+    releaseMemObjects(bufA, bufX);
+    if (err != CL_SUCCESS) {
+        deleteBuffers<T>(A, blasX, backX, deltaX);
+        delete[] events;
+        ASSERT_EQ(CL_SUCCESS, err) << "clEnqueueReadBuffer() failed";
+    }
+
+    // Results and tolerances start at offBX in every array, so compare from there.
+    compareMatrices<T>( clblasColumnMajor, lengthX , 1, (blasX + params->offBX),
+                       (backX + params->offBX), lengthX, (deltaX + params->offBX) );
+    deleteBuffers<T>(A, blasX, backX, deltaX);
+    delete[] events;
+}
+
+// Instantiate the test
+
+TEST_P(TRSV, strsv) {
+    TestParams testCase;
+    // single-precision real variant
+    getParams(&testCase);
+    trsvCorrectnessTest<cl_float>(&testCase);
+}
+
+TEST_P(TRSV, dtrsv) {
+    TestParams testCase;
+    // double-precision real variant
+    getParams(&testCase);
+    trsvCorrectnessTest<cl_double>(&testCase);
+}
+
+TEST_P(TRSV, ctrsv) {
+    TestParams testCase;
+    // single-precision complex variant
+    getParams(&testCase);
+    trsvCorrectnessTest<FloatComplex>(&testCase);
+}
+
+TEST_P(TRSV, ztrsv) {
+    TestParams testCase;
+    // double-precision complex variant
+    getParams(&testCase);
+    trsvCorrectnessTest<DoubleComplex>(&testCase);
+}
diff --git a/src/tests/correctness/delta.h b/src/tests/correctness/delta.h
new file mode 100644
index 0000000..c0f4b23
--- /dev/null
+++ b/src/tests/correctness/delta.h
@@ -0,0 +1,36 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef DELTA_H_
+#define DELTA_H_
+
+#include <blas-math.h>
+#include <common.h>
+
+// Type-dependent constants: 2^-20 for the float types, 2^-50 for the double types
+template <class T>
+static cl_double DELTA_0();
+template<>
+__template_static cl_double DELTA_0<cl_float>()       { return ldexp(1.0, -20); }
+template<>
+__template_static cl_double DELTA_0<cl_double>()      { return ldexp(1.0, -50); }
+template<>
+__template_static cl_double DELTA_0<FloatComplex>()   { return ldexp(1.0, -20); }
+template<>
+__template_static cl_double DELTA_0<DoubleComplex>()  { return ldexp(1.0, -50); }
+
+#endif      // DELTA_H_
+
diff --git a/src/tests/correctness/tcase-filter.cpp b/src/tests/correctness/tcase-filter.cpp
new file mode 100644
index 0000000..a389ffd
--- /dev/null
+++ b/src/tests/correctness/tcase-filter.cpp
@@ -0,0 +1,217 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include "tcase-filter.h"
+
+#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
+
+// Return orig unless it is zero, in which case alt is returned instead.
+static __inline size_t selectSize(size_t orig, size_t alt)
+{
+    return (orig != 0) ? orig : alt;
+}
+
+/*
+ * Return the first non-zero value among size1, size2 and size3,
+ * checked in that order. When size1 and size2 are both zero the
+ * result is size3, whatever its value (possibly zero).
+ */
+static size_t
+nonZeroSize(size_t size1, size_t size2, size_t size3)
+{
+    if (size1 != 0) {
+        return size1;
+    }
+    if (size2 != 0) {
+        return size2;
+    }
+
+    return size3;
+}
+
+/*
+ * Count how many of the three pairs (size1, size2), (size2, size3) and
+ * (size1, size3) hold equal values; the result is 3 only when all match.
+ */
+static int
+sizeEquCount(size_t size1, size_t size2, size_t size3)
+{
+    return static_cast<int>(size1 == size2) +
+           static_cast<int>(size2 == size3) +
+           static_cast<int>(size1 == size3);
+}
+
+// True when size equals at least one of alt1, alt2 or alt3.
+static __inline bool isEquToAny(size_t size, size_t alt1, size_t alt2, size_t alt3)
+{
+    return !((size != alt1) && (size != alt2) && (size != alt3));
+}
+
+// True only for non-complex data with transA or transB set to clblasConjTrans.
+static __inline bool
+isRealConjugation(const TestParams *params, bool isComplex)
+{
+    return isComplex ? false : ((params->transA == clblasConjTrans) ||
+                                (params->transB == clblasConjTrans));
+}
+
+#endif                          /* SHORT_TESTS || MEDIUM_TESTS */
+
+#if defined(SHORT_TESTS)
+
+bool
+canCaseBeSkipped(const TestParams *params, bool isComplex)
+{
+    // skip conjugated transposition cases when the data is real
+    if (isRealConjugation(params, isComplex)) {
+        return true;
+    }
+
+    /*
+     * Problem dimensions: a zero M, N or K is replaced with the first
+     * non-zero one of them. Only cases where all three resulting
+     * dimensions are equal to each other stay enabled.
+     */
+    const size_t dim = nonZeroSize(params->M, params->N, params->K);
+    const size_t m = selectSize(params->M, dim);
+    const size_t n = selectSize(params->N, dim);
+    const size_t k = selectSize(params->K, dim);
+
+    if (sizeEquCount(m, n, k) < 3) {
+        return true;
+    }
+
+    /*
+     * Leading dimensions get the same zero substitution. BigLDA cases
+     * are filtered out: lda, ldb and ldc must all be equal, and lda must
+     * match one of the problem dimensions.
+     */
+    const size_t ld = nonZeroSize(params->lda, params->ldb, params->ldc);
+    const size_t lda = selectSize(params->lda, ld);
+    const size_t ldb = selectSize(params->ldb, ld);
+    const size_t ldc = selectSize(params->ldc, ld);
+
+    const bool sameLd = !(sizeEquCount(lda, ldb, ldc) < 3);
+    if (!sameLd || !isEquToAny(lda, m, n, k)) {
+        return true;
+    }
+
+    return false;
+}
+
+#elif defined(MEDIUM_TESTS)     /* SHORT_TESTS */
+
+#include <algorithm>
+
+#include <stdio.h>
+
+/*
+ * Predict the vector length for leading dimension ld: 8 or 1 when ld % 8 is
+ * 0 or 1, otherwise the distance from ld % 8 to the nearest power of 2.
+ */
+static unsigned int
+prognozedVecLen(size_t ld)
+{
+    size_t u = static_cast<size_t>(1) << (sizeof(size_t) * 8 - 1);
+    size_t vecLen;
+
+    // typically vecLen will not exceed 8, so only ld modulo 8 matters
+    ld %= 8;
+    if (ld == 0) {
+        return 8;
+    }
+    else if (ld == 1) {
+        return 1;
+    }
+
+    // find the highest non zero bit: the largest power of 2 not exceeding ld
+    for (; (u != 0) && !(u & ld); u >>= 1);
+
+    /*
+     * Minimum of the distances to the lower (u) and upper (u << 1) power of
+     * 2 bounds. The upper bound must be used here: u - ld is unsigned, so a
+     * u smaller than ld would wrap around and the minimum would ignore it.
+     */
+    vecLen = ld - u;
+    u <<= 1;
+    vecLen = ::std::min(vecLen, u - ld);
+
+    return static_cast<unsigned int>(vecLen);
+}
+
+bool
+canCaseBeSkipped(const TestParams *params, bool isComplex)
+{
+    // skip conjugated transposition cases when the data is real
+    if (isRealConjugation(params, isComplex)) {
+        return true;
+    }
+
+    // set of cases for extended versions is really tiny, so enable them all
+    if (params->offA || params->offBX || params->offCY) {
+        return false;
+    }
+
+    /*
+     * Problem dimensions: a zero M, N or K is replaced with the first
+     * non-zero one of them.
+     */
+    const size_t dim = nonZeroSize(params->M, params->N, params->K);
+    const size_t m = selectSize(params->M, dim);
+    const size_t n = selectSize(params->N, dim);
+    const size_t k = selectSize(params->K, dim);
+
+    // leading dimensions get the same zero substitution
+    const size_t ld = nonZeroSize(params->lda, params->ldb, params->ldc);
+    const size_t lda = selectSize(params->lda, ld);
+    const size_t ldb = selectSize(params->ldb, ld);
+    const size_t ldc = selectSize(params->ldc, ld);
+
+    /*
+     * A leading dimension matching none of the problem dimensions makes
+     * this a BigLDA case; those are enabled only when the problem
+     * dimensions all are equal to each other.
+     */
+    const bool isBigLd = !isEquToAny(lda, m, n, k) ||
+                         !isEquToAny(ldb, m, n, k) ||
+                         !isEquToAny(ldc, m, n, k);
+    if (isBigLd) {
+        return (sizeEquCount(m, n, k) < 3);
+    }
+
+    // enable only cases at which buffers will have the same vectorization
+    const unsigned int vecLenA = prognozedVecLen(lda);
+    const unsigned int vecLenB = prognozedVecLen(ldb);
+    const unsigned int vecLenC = prognozedVecLen(ldc);
+    if ((vecLenA == vecLenB) && (vecLenA == vecLenC)) {
+        return false;
+    }
+
+    return true;
+}
+
+#else                           /* MEDIUM_TESTS */
+
+// Built when neither SHORT_TESTS nor MEDIUM_TESTS is defined: no case is skipped.
+bool canCaseBeSkipped(const TestParams *params, bool isComplex)
+{
+    static_cast<void>(params);      // deliberately unused
+    static_cast<void>(isComplex);   // deliberately unused
+    return false;
+}
+
+#endif                          /* !SHORT_TESTS && !MEDIUM_TESTS */
+
diff --git a/src/tests/correctness/tcase-filter.h b/src/tests/correctness/tcase-filter.h
new file mode 100644
index 0000000..4ba1bdd
--- /dev/null
+++ b/src/tests/correctness/tcase-filter.h
@@ -0,0 +1,30 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Filter for skipping test cases when run time is more important than
+ * coverage
+ */
+
+#ifndef TCASEFILTER_H_
+#define TCASEFILTER_H_
+
+#include <cmdline.h>
+
+bool canCaseBeSkipped(const TestParams *params, bool isComplex);
+
+#endif /* TCASEFILTER_H_ */
diff --git a/src/tests/correctness/test-correctness.cpp b/src/tests/correctness/test-correctness.cpp
new file mode 100644
index 0000000..950382e
--- /dev/null
+++ b/src/tests/correctness/test-correctness.cpp
@@ -0,0 +1,3406 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#define DO_GEMM
+#define DO_TRMM
+#define DO_TRSM
+#define DO_SYR2K
+#define DO_SYRK
+#define DO_GEMV
+#define DO_SYMV
+#define DO_SYMM
+#define DO_TRMV
+#define DO_TPMV
+#define DO_TRSV
+#define DO_SYR
+#define DO_SPR
+#define DO_GER
+#define DO_GERC
+#define DO_SYR2
+#define DO_HER
+#define DO_HER2
+#define DO_HEMM
+#define DO_HEMV
+#define DO_HPMV
+#define DO_SPMV
+#define DO_SBMV
+#define DO_HERK
+#define DO_TPSV
+#define DO_HPR
+#define DO_SPR2
+#define DO_HPR2
+#define DO_GBMV
+#define DO_HBMV
+#define DO_TBMV
+#define DO_TBSV
+#define DO_HER2K
+#define DO_SWAP
+#define DO_COPY
+#define DO_SCAL
+#define DO_AXPY
+#define DO_DOT
+#define DO_DOTC
+#define DO_ROTG
+#define DO_ROTM
+#define DO_ROT
+#define DO_ROTMG
+#define DO_NRM2
+#define DO_ASUM
+#define DO_iAMAX
+
+//#define DO_SPL - Only used for special case testing (for devel purposes)
+//#define DO_GEMM_2 - This needs to remain commented.
+
+#include <gtest/gtest.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <gemm.h>
+#include <gemm-2.h>
+#include <trmm.h>
+#include <trsm.h>
+#include <gemv.h>
+#include <symv.h>
+#include <syr2k.h>
+#include <syrk.h>
+#include <trsv.h>
+#include <trmv.h>
+#include <tpmv.h>
+#include <symm.h>
+#include <syr.h>
+#include <sbmv.h>
+#include <spr.h>
+#include <ger.h>
+#include <gerc.h>
+#include <syr2.h>
+#include <her.h>
+#include <her2.h>
+#include <hemv.h>
+#include <hpmv.h>
+#include <spmv.h>
+#include <hemm.h>
+#include <herk.h>
+#include <tpsv.h>
+#include <hpr.h>
+#include <spr2.h>
+#include <hpr2.h>
+#include <gbmv.h>
+#include <hbmv.h>
+#include <tbmv.h>
+#include <tbsv.h>
+#include <her2k.h>
+#include <swap.h>
+#include <scal.h>
+#include <copy.h>
+#include <axpy.h>
+#include <dot.h>
+#include <asum.h>
+#include <dotc.h>
+#include <rotg.h>
+#include <rotm.h>
+#include <rot.h>
+#include <rotmg.h>
+#include <nrm2.h>
+#include <iamax.h>
+
+using ::testing::TestWithParam;
+using ::testing::Values;
+using ::testing::ValuesIn;
+using ::testing::Combine;
+
+TestParams globalTestParams;
+
+// Different ranges of test parameters
+
+static const clblasOrder orderSet[] =
+    { clblasColumnMajor, clblasRowMajor };
+static const clblasTranspose transSet[] =
+    { clblasNoTrans, clblasTrans, clblasConjTrans };
+static const clblasSide sideSet[] =
+    { clblasLeft, clblasRight };
+static const clblasUplo uploSet[] =
+    { clblasUpper, clblasLower };
+static const clblasDiag diagSet[] =
+    { clblasUnit, clblasNonUnit };
+
+const size_t ZERO_VAL[1] = { 0 };
+const int ONE_VAL[1] = { 1 };
+const int verySmallRange[] =
+{1, 3, 5, 10, 11, 15, 16, 23, 21, 32, 33, 45, 40, 63, 333, 1024, 1025, 4096, 4223};
+const int completeRange[] =
+{1, 3, 5, 10, 11, 15, 16, 23, 21, 32, 33, 45, 40, 63, 333, 1024, 1025, 4096, 4223};
+#if defined SHORT_TESTS
+const int smallRange[] =
+    { 63, 128 };
+
+const int numQueues[] =
+    { 2 };
+#elif defined MEDIUM_TESTS  /* SHORT_TESTS */
+const int smallRange[] =
+    { 15, 64, 133 };
+const int numQueues[] =
+    { 3, 4 };
+#else                       /* MEDIUM_TESTS */
+const int smallRange[] =
+    { 15, 16, 33, 40, 62, 64, 128, 129, 256, 258 };
+    //{ 15, 16, 32, 33, 63, 64, 128, 129, 256, 257 };
+	//{ 3, 4, 15, 16, 32, 33, 63, 64, 128, 129, 256, 257, 333, 566, 787, 1024, 1025, 1113, 1111, 999, 883, 633, 17 };
+
+const int numQueues[] =
+    { 2, 3, 4, 5, 6, 7 };
+#endif                      /* !SHORT_TESTS && !MEDIUM_TESTS */
+
+#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
+
+enum {
+    BIG_LDA = 500,
+    BIG_LDB = 600,
+    BIG_LDC = 700
+};
+
+const int incs[] =
+    { 33, -33 };
+
+#else                       /* SHORT_TESTS || MEDIUM_TESTS */
+
+enum {
+    BIG_LDA = 501,
+    BIG_LDB = 602,
+    BIG_LDC = 703
+};
+
+const int incs[] =
+    { 1, -1, 33, -33 };
+
+#endif                      /* !SHORT_TESTS && !MEDIUM_TESTS */
+
+#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
+const size_t offs[] =
+    { 63, 258 };
+#else                       /* !SHORT_TESTS && !MEDIUM_TESTS */
+const size_t offs[] =
+    {0, 63, 128, 258 };
+#endif
+
+const int ldaRange[] = {0, 3192, 4097 };
+const int offsetRange[] = { 0, 100 };
+const double realAlphaRange[] = {(double)50, (double)100, (double)999999};
+const cl_float2 complexAlphaRange[] = {floatComplex(0,1), floatComplex(3,4)};
+const cl_float2 complexAlpha = floatComplex(2,3);
+
+const ComplexLong alphaBetaRange[] = {{50,50}, {20,20}};
+const ComplexLong alphaBeta = {10,10};
+const ComplexLong sflagRange[] = {{-1,0}, {0,0}, {1,0}, {-2,0}};
+
+
+
+#ifdef DO_SPL
+
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeHER2_SPL, HER2, Combine(
+        Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange),
+    Values(1) ) );
+#endif
+
+
+
+#ifdef DO_HEMV
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_HEMV, HEMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0HEMV, HEMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(order_HEMV, HEMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), ValuesIn(offs), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(uplo_HEMV, HEMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), ValuesIn(offs), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(alpha_beta_HEMV, HEMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), Values((size_t)0), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_0HEMV, HEMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(1500, 5101), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#else
+INSTANTIATE_TEST_CASE_P(ALL_HEMV, HEMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(alphaBetaRange),
+    ValuesIn(alphaBetaRange), ValuesIn(offs), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)),
+    Values(1)));
+
+#endif      // Correctness
+
+#endif
+
+#ifdef DO_SWAP
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(SmallRange, SWAPXY, Combine(
+        Values(100,50), Values(0), Values(1), Values(0), Values(1), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_SWAP, SWAPXY, Combine(
+        Values(64,128,256,512), Values(0,3), Values(1,-1), Values(0,3), Values(1,-1), Values(1)));
+
+#else
+INSTANTIATE_TEST_CASE_P(ALL_SWAP, SWAPXY, Combine(
+        ValuesIn(completeRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), Values(1)));
+
+#endif
+#endif
+
+#ifdef DO_AXPY
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Small_AXPY, AXPY, Combine(
+        Values(100,50), ValuesIn(alphaBetaRange), Values(0), Values(1), Values(0), Values(1), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_AXPY, AXPY, Combine(
+        Values(64,128,256,512), ValuesIn(alphaBetaRange), Values(0,3), Values(1,-1), Values(0,3), Values(1,-1), Values(1)));
+
+#else
+INSTANTIATE_TEST_CASE_P(ALL_AXPY, AXPY, Combine(
+        ValuesIn(completeRange), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), Values(1)));
+
+#endif
+#endif
+
+#ifdef DO_ROTG
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Small_ROTG, ROTG, Combine(
+        Values(1, 5), Values(1, 6), Values(2, 8), Values(3, 7), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_ROTG, ROTG, Combine(
+        Values(64,128,256,512), Values(64, 128, 256, 512), Values(0,3), Values(0,3), Values(1)));
+
+#else
+INSTANTIATE_TEST_CASE_P(ALL_ROTG, ROTG, Combine(
+        ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1)));
+
+#endif
+#endif
+
+#ifdef DO_ROTM
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Small_ROTM, ROTM, Combine(
+        Values(1, 5, 10, 20), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, -1), Values(1, 6), ValuesIn(sflagRange), Values(1)));
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_ROTM, ROTM, Combine(
+        Values(64,128,256,512), Values(0,3), Values(1, -3, 3, 1), Values(0,3), Values(1, -3, 3, 1), Values(0, 3), ValuesIn(sflagRange), Values(1)));
+#else
+INSTANTIATE_TEST_CASE_P(ALL_ROTM, ROTM, Combine(
+        ValuesIn(completeRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs),
+        ValuesIn(offsetRange), ValuesIn(sflagRange), Values(1)));
+#endif
+#endif
+
+#ifdef DO_ROT
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Small_ROT, ROT, Combine(
+        Values(1, 5, 10, 20), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, 2), Values(1)));
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_ROT, ROT, Combine(
+        Values(64,128,256,512), Values(0,3), Values(1, -3, 3, 1), Values(0,3), Values(1, -3, 3, 1), Values(0, 3), Values(0, 4), Values(1)));
+#else
+INSTANTIATE_TEST_CASE_P(ALL_ROT, ROT, Combine(
+        ValuesIn(completeRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs),
+        ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+#endif
+#endif
+
+#ifdef DO_ROTMG
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Small_ROTMG, ROTMG, Combine(
+        Values(1, 6), Values(1, 6), Values(1, 6), Values(1, 6), Values(1, 6), ValuesIn(sflagRange), Values(1)));
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_ROTMG, ROTMG, Combine(
+        Values(1, 3, 15), Values(0, 3, 15), Values(0, 3, 15), Values(0, 3, 15), Values(0, 3, 15), ValuesIn(sflagRange), Values(1)));
+#else
+INSTANTIATE_TEST_CASE_P(ALL_ROTMG, ROTMG, Combine(
+        ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange),
+        ValuesIn(offsetRange), ValuesIn(sflagRange), Values(1)));
+#endif
+#endif
+
+//NRM2
+
+#ifdef DO_NRM2
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_NRM2, NRM2, Combine(
+    ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1)) );
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_NRM2, NRM2, Combine(
+    Values(61), Values(4, -11), Values(0), Values(1), Values(1)) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_NRM2, NRM2, Combine(
+    ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_NRM2, NRM2, Combine(
+    Values(4900), Values(1), Values(4), Values(1), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_NRM2, NRM2, Combine(
+    ValuesIn(completeRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+#endif      // Correctness
+#endif
+
+#ifdef DO_ASUM
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_ASUM, ASUM, Combine(
+    ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1)) );
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_ASUM, ASUM, Combine(
+    Values(61), Values(4, -11), Values(0), Values(1), Values(1)) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_ASUM, ASUM, Combine(
+    ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_ASUM, ASUM, Combine(
+    Values(4900), Values(1), Values(4), Values(1), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_ASUM, ASUM, Combine(
+    ValuesIn(completeRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+#endif      // Correctness
+#endif
+
+#ifdef DO_iAMAX
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_iAMAX, iAMAX, Combine(
+    ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1)) );
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_iAMAX, iAMAX, Combine(
+    Values(61), Values(4, -1), Values(0), Values(1), Values(1)) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_iAMAX, iAMAX, Combine(
+    ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_iAMAX, iAMAX, Combine(
+    Values(4900), Values(1), Values(4), Values(1), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_iAMAX, iAMAX, Combine(
+    ValuesIn(completeRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+#endif      // Correctness
+#endif
+
+#ifdef DO_HPMV
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_HPMV, HPMV, Combine(
+	Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+	Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0HPMV, HPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), Values(alphaBeta),
+	Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(order_HPMV, HPMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), ValuesIn(offs), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(uplo_HPMV, HPMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), ValuesIn(offs), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(alpha_beta_HPMV, HPMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), Values((size_t)0), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_0HPMV, HPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(1500, 5101), Values(alphaBeta),
+	Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#else
+INSTANTIATE_TEST_CASE_P(ALL_HPMV, HPMV, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(alphaBetaRange),
+	ValuesIn(alphaBetaRange), ValuesIn(offs), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)),
+    Values(1)));
+
+#endif      // Correctness
+
+#endif
+
+#ifdef DO_SYMM
+
+#if defined(SHORT_TESTS)
+/*INSTANTIATE_TEST_CASE_P(Short_SYMM, SYMM, Combine(
+    Values(clblasRowMajor), Values(clblasLeft),Values(clblasLower), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1)));*/
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0SYMM, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), Values(15),Values(15), Values(complexAlpha),
+    Values(complexAlpha), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+/*INSTANTIATE_TEST_CASE_P(order_SYMM, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet),Values(clblasLower), ValuesIn(smallRange),ValuesIn(smallRange) ,ValuesIn(complexAlphaRange),
+    ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 9, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(uplo_SYMM, SYMM, Combine(
+    Values(clblasRowMajor), Values(clblasLeft),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 9, 0)), Values(1)));*/
+INSTANTIATE_TEST_CASE_P(alpha_beta_SYMM, SYMM, Combine(
+    Values(clblasRowMajor), Values(clblasLeft),Values(clblasLower), Values(64),Values(133), Values(complexAlpha),
+    Values(complexAlpha), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 3, 7, 11)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_0SYMM, SYMM, Combine(
+    ValuesIn(orderSet), Values(clblasLeft),Values(clblasLower), Values(1100),Values(4000), Values(complexAlpha),
+    Values(complexAlpha), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 0, 0, 0)), Values(1)));
+
+#else
+INSTANTIATE_TEST_CASE_P(ALL_SYMM_FriendlyOffsets, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 64, 32, 128)),
+    Values(1)));
+INSTANTIATE_TEST_CASE_P(ALL_SYMM_UnfriendlyOffsets, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 6, 3, 12)),
+    Values(1)));
+
+#endif      // Correctness
+#endif
+
+
+#ifdef DO_HEMM
+
+#if defined(SHORT_TESTS)
+/*INSTANTIATE_TEST_CASE_P(Short_HEMM, HEMM, Combine(
+    Values(clblasRowMajor), Values(clblasLeft),Values(clblasLower), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1)));*/
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0HEMM, HEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), Values(15),Values(15), Values(complexAlpha),
+    Values(complexAlpha), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+/*INSTANTIATE_TEST_CASE_P(order_HEMM, HEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet),Values(clblasLower), ValuesIn(smallRange),ValuesIn(smallRange) ,ValuesIn(complexAlphaRange),
+    ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 9, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(uplo_HEMM, HEMM, Combine(
+    Values(clblasRowMajor), Values(clblasLeft),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 9, 0)), Values(1)));*/
+INSTANTIATE_TEST_CASE_P(alpha_beta_HEMM, HEMM, Combine(
+    Values(clblasRowMajor), Values(clblasLeft),Values(clblasLower), Values(64),Values(133), Values(complexAlpha),
+    Values(complexAlpha), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 0, 0, 9)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_0HEMM, HEMM, Combine(
+    ValuesIn(orderSet), Values(clblasLeft),Values(clblasLower), Values(1010),Values( 4000), Values(complexAlpha),
+    Values(complexAlpha), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 0, 1, 0)), Values(1)));
+
+#else
+INSTANTIATE_TEST_CASE_P(ALL_HEMM, HEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, (size_t)512, (size_t)511, 9, 0, 0)), Values(1)));
+
+#endif      // Correctness
+#endif
+
+
+#ifdef DO_SPMV
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_SPMV, SPMV, Combine(
+	Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+	Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0SPMV, SPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), Values(alphaBeta),
+	Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(order_SPMV, SPMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), ValuesIn(offs), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(uplo_SPMV, SPMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), ValuesIn(offs), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(alpha_beta_SPMV, SPMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta),
+    Values(alphaBeta), Values((size_t)0), Values((size_t)0), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_0SPMV, SPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(1500, 5101), Values(alphaBeta),
+	Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#else
+INSTANTIATE_TEST_CASE_P(ALL_SPMV, SPMV, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(alphaBetaRange),
+	ValuesIn(alphaBetaRange), ValuesIn(offs), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)),
+    Values(1)));
+
+#endif  // Correctness
+
+#endif
+
+
+#ifdef DO_GEMM_2
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA_OFF_NX, GEMM2, Combine(
+    Values(clblasColumnMajor), Values(clblasNoTrans), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 1, 3, 10)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA_OFF_TN, GEMM2, Combine(
+    Values(clblasColumnMajor), Values(clblasTrans), Values(clblasNoTrans ),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, (size_t)501, (size_t)502, 3, 2, 1)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA_OFF_HN, GEMM2, Combine(
+    Values(clblasColumnMajor), Values(clblasConjTrans), Values(clblasNoTrans ),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, (size_t)501, (size_t)502, 3, 2, 1)), Values(1)));
+
+#if !defined(SHORT_TESTS) && !defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_NX, GEMM2, Combine(
+    Values(clblasColumnMajor), Values(clblasNoTrans), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_TN, GEMM2, Combine(
+    Values(clblasColumnMajor), Values(clblasTrans), Values(clblasNoTrans ),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_HN, GEMM2, Combine(
+    Values(clblasColumnMajor), Values(clblasConjTrans), Values(clblasNoTrans ),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+#endif
+
+#endif //DO_GEMM_2
+
+#ifdef DO_GEMM
+// xGEMM tests. Every Combine() below lists, in order: storage order, two transpose options, three size values, a clMath::ExtraTestSizes and Values(1).
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, GEMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, GEMM, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+// smallRange is known to contain no value greater than 257,
+// so lda is set to 500.
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, GEMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, GEMM, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+
+// Cases for the extended versions with offsets (non-zero trailing ExtraTestSizes arguments)
+
+#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, GEMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(67), Values(138), Values(220),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 600, 700)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, GEMM, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(67), Values(138), Values(220),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 600, 700)), Values(1)));
+
+#else                               /* SHORT_TESTS || MEDIUM_TESTS */
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, GEMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(67), Values(135), Values(228),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, GEMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(64), Values(64), Values(64),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, GEMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(128), Values(64), Values(77),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 502)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_3, GEMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(112), Values(86), Values(68),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 502)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, GEMM, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(67), Values(135), Values(228),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, GEMM, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(64), Values(64), Values(64),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, GEMM, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(128), Values(64), Values(77),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 502)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_3, GEMM, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet),
+    Values(112), Values(86), Values(68),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 502)), Values(1)));
+
+#endif                              /* !SHORT_TESTS && !MEDIUM_TESTS */
+
+// Big matrices (skipped for SHORT_TESTS; only SelectedBig_0 is kept for MEDIUM_TESTS)
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasNoTrans), Values(clblasNoTrans),
+        Values(2801), Values(2903), Values(3005),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_1, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasNoTrans), Values(clblasNoTrans),
+        Values(4777), Values(4333), Values(5000),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasTrans), Values(clblasNoTrans),
+        Values(5777), Values(5333), Values(3000),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_3, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasTrans), Values(clblasConjTrans),
+        Values(6777), Values(3333), Values(3000),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif // !MEDIUM_TESTS
+#endif // !SHORT_TESTS
+
+// Small matrices and Custom cases (SelectedSmall_0 is always built)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasNoTrans), Values(clblasNoTrans),
+        Values(1), Values(1), Values(1),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasNoTrans), Values(clblasNoTrans),
+        Values(2), Values(1), Values(3),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasTrans), Values(clblasNoTrans),
+        Values(3), Values(2), Values(1),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_3, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasTrans), Values(clblasConjTrans),
+        Values(4), Values(3), Values(2),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_4, GEMM, Combine(
+        ValuesIn(orderSet),
+        Values(clblasConjTrans), Values(clblasNoTrans),
+        Values(17), Values(13), Values(1),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+
+        // Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, GEMM, Combine(
+        ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(transSet),
+        Values(32), Values(32), Values(32),
+        Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+
+#endif // DO_GEMM
+
+
+#ifdef DO_TRMM
+// xTRMM tests. Every Combine() below lists, in order: storage order, side, uplo, transpose, diag, two size values, a clMath::ExtraTestSizes and Values(1).
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, TRMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, TRMM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+// smallRange is known to contain no value greater than 257,
+// so lda is set to 500.
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, TRMM,  Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 0, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, TRMM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 0, 0, 0, 0)), Values(1)));
+
+#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, TRMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(115), Values(158),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 502, 606, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, TRMM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(115), Values(158),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 502, 606, 0)), Values(1)));
+
+#else                        /* SHORT_TESTS || MEDIUM_TESTS */
+
+// Cases for the extended versions with offsets (non-zero 4th/5th ExtraTestSizes arguments)
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, TRMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(115), Values(113),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, TRMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(128), Values(66),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, TRMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(53), Values(67),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, TRMM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(115), Values(113),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, TRMM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(128), Values(66),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, TRMM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(53), Values(67),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 0)), Values(1)));
+
+#endif                        /* !SHORT_TESTS && !MEDIUM_TESTS */
+
+// Big matrices (skipped for SHORT_TESTS; only SelectedBig_0 is kept for MEDIUM_TESTS)
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasRight), Values(clblasUpper), Values(clblasTrans),
+    Values(clblasNonUnit),
+    Values(2801), Values(2903),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_1, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasRight), Values(clblasUpper), Values(clblasTrans),
+    Values(clblasNonUnit),
+    Values(4567), Values(4321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans),
+    Values(clblasNonUnit),
+    Values(5567), Values(5321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_3, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasLower), Values(clblasTrans),
+    Values(clblasUnit),
+    Values(6567), Values(3321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif // !MEDIUM_TESTS
+#endif // !SHORT_TESTS
+
+// Small matrices and Custom tests (SelectedSmall_0 is always built)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasRight), Values(clblasUpper), Values(clblasTrans),
+    Values(clblasNonUnit),
+    Values(1), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasRight), Values(clblasUpper), Values(clblasTrans),
+    Values(clblasNonUnit),
+    Values(2), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans),
+    Values(clblasNonUnit),
+    Values(3), Values(2),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_3, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasLower), Values(clblasTrans),
+    Values(clblasUnit),
+    Values(4), Values(3),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_4, TRMM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans),
+    Values(clblasUnit),
+    Values(17), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TRMM,  Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(32), Values(32),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+#endif // DO_TRMM
+
+#ifdef DO_TRSM
+// xTRSM tests. Every Combine() below lists, in order: storage order, side, uplo, transpose, diag, two size values, a clMath::ExtraTestSizes and Values(1).
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, TRSM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, TRSM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+// smallRange is known to contain no value greater than 257,
+// so lda is set to 500.
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, TRSM,  Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 0, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, TRSM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 0, 0, 0, 0)), Values(1)));
+
+#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, TRSM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(115), Values(158),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 502, 606, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, TRSM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(115), Values(158),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 502, 606, 0)), Values(1)));
+
+#else                               /* SHORT_TESTS || MEDIUM_TESTS */
+
+// Cases for the extended versions with offsets (non-zero 4th/5th ExtraTestSizes arguments)
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, TRSM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(115), Values(113),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, TRSM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(128), Values(66),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, TRSM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(53), Values(67),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, TRSM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(115), Values(113),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, TRSM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(128), Values(66),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, TRSM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(53), Values(67),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 0)), Values(1)));
+
+#endif                              /* !SHORT_TESTS && !MEDIUM_TESTS */
+
+// Big matrices (skipped for SHORT_TESTS; only SelectedBig_0 is kept for MEDIUM_TESTS)
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasRight), Values(clblasUpper), Values(clblasTrans),
+    Values(clblasNonUnit),
+    Values(2801), Values(2903),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_1, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasRight), Values(clblasUpper), Values(clblasTrans),
+    Values(clblasNonUnit),
+    Values(4567), Values(4321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans),
+    Values(clblasNonUnit),
+    Values(5567), Values(5321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_3, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasLower), Values(clblasTrans),
+    Values(clblasUnit),
+    Values(6567), Values(3321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif // !MEDIUM_TESTS
+#endif // !SHORT_TESTS
+
+// Small matrices and Custom tests (SelectedSmall_0 is always built)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasRight), Values(clblasUpper), Values(clblasTrans),
+    Values(clblasNonUnit),
+    Values(1), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasRight), Values(clblasUpper), Values(clblasTrans),
+    Values(clblasNonUnit),
+    Values(2), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans),
+    Values(clblasNonUnit),
+    Values(3), Values(2),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_3, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasLower), Values(clblasTrans),
+    Values(clblasUnit),
+    Values(4), Values(3),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_4, TRSM, Combine(
+    ValuesIn(orderSet),
+    Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans),
+    Values(clblasUnit),
+    Values(17), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TRSM,  Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(32), Values(32),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+#endif // DO_TRSM
+
+#ifdef DO_GEMV
+// xGEMV tests. Every Combine() below lists, in order: storage order, transpose, two size values, ExtraTestSizes value(s) and Values(1).
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, GEMV, Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, GEMV, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+// smallRange is known to contain no value greater than 257, so lda is set to 500.
+// NOTE(review): arguments 2-3 are 1, 1 (or drawn from incs) everywhere else in this section; 501, 502 here may be carried over from the GEMM cases - confirm.
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, GEMV,  Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, GEMV, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SmallRange_VariousInc, GEMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    ValuesIn(clMath::makeContainerETS(ZERO_VAL, incs, incs,
+                                   ZERO_VAL, ZERO_VAL, ZERO_VAL)),
+    Values(1)));
+
+// Cases for the extended version with offsets (offs is passed as the 4th makeContainerETS argument)
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx, GEMV,  Combine(
+    Values(clblasColumnMajor), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    ValuesIn(clMath::makeContainerETS(ZERO_VAL, ONE_VAL, ONE_VAL, offs,
+                                   ZERO_VAL, ZERO_VAL)),
+           Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx, GEMV,  Combine(
+    Values(clblasRowMajor), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    ValuesIn(clMath::makeContainerETS(ZERO_VAL, ONE_VAL, ONE_VAL, offs,
+                                   ZERO_VAL, ZERO_VAL)),
+           Values(1)));
+
+// Big matrices (skipped for SHORT_TESTS; only SelectedBig_0 is kept for MEDIUM_TESTS)
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasTrans),
+    Values(2800), Values(2800),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_1, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasTrans),
+    Values(4567), Values(4321),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasNoTrans),
+    Values(5567), Values(5321),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_3, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasTrans),
+    Values(6567), Values(3321),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#endif // !MEDIUM_TESTS
+#endif // !SHORT_TESTS
+
+// Small matrices and Custom tests (SelectedSmall_0 is always built)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasTrans),
+    Values(1), Values(1),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasTrans),
+    Values(2), Values(1),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasNoTrans),
+    Values(3), Values(2),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_3, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasTrans),
+    Values(4), Values(3),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_4, GEMV, Combine(
+    ValuesIn(orderSet), Values(clblasNoTrans),
+    Values(17), Values(1),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, GEMV,  Combine(
+    ValuesIn(orderSet), ValuesIn(transSet),
+    Values(32), Values(32),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+#endif // DO_GEMV
+
+#ifdef DO_SYMV
+// xSYMV tests. Every Combine() below lists, in order: storage order, uplo, one size value, ExtraTestSizes value(s) and Values(1).
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, SYMV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, SYMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+// smallRange is known to contain no value greater than 257, so lda is set to 500.
+// NOTE(review): arguments 2-3 are 1, 1 (or drawn from incs) everywhere else in this section; 501, 502 here may be carried over from the GEMM cases - confirm.
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, SYMV,  Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, SYMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SmallRange_VariousInc, SYMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(smallRange),
+    ValuesIn(clMath::makeContainerETS(ZERO_VAL, incs, incs,
+                                   ZERO_VAL, ZERO_VAL, ZERO_VAL)),
+    Values(1)));
+
+// Cases for the extended versions with offsets (offs is passed as the 4th makeContainerETS argument)
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx, SYMV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    ValuesIn(smallRange),
+    ValuesIn(clMath::makeContainerETS(ZERO_VAL, ONE_VAL, ONE_VAL, offs,
+                                   ZERO_VAL, ZERO_VAL)),
+             Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx, SYMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    ValuesIn(smallRange),
+    ValuesIn(clMath::makeContainerETS(ZERO_VAL, ONE_VAL, ONE_VAL, offs,
+                                   ZERO_VAL, ZERO_VAL)),
+             Values(1)));
+
+// Big matrices (skipped for SHORT_TESTS; only SelectedBig_0 is kept for MEDIUM_TESTS)
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    Values(2801),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#if !defined MEDIUM_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_1, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    Values(4567),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower),
+    Values(5567),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_3, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    Values(6567),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#endif // !MEDIUM_TESTS
+#endif // !SHORT_TESTS
+
+// Small matrices and Custom tests (SelectedSmall_0 is always built)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    Values(1),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    Values(2),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower),
+    Values(3),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_3, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    Values(4),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_4, SYMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower),
+    Values(5),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SYMV,  Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(32),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+#endif // DO_SYMV
+
+#ifdef DO_SYR2K
+// xSYR2K tests
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, SYR2K, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, SYR2K, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+// smallRange is known to contain no value greater than 257,
+// so lda is set to 500.
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, SYR2K,  Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, SYR2K, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1)));
+
+// cases for the extended versions with the offsets
+#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, SYR2K, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(254), Values(353),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 602, 704)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, SYR2K, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(254), Values(353),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 602, 704)), Values(1)));
+
+#else                               /* SHORT_TESTS || MEDIUM_TESTS */
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, SYR2K, Combine(  // Full tier (#else of SHORT_TESTS || MEDIUM_TESTS): SYR2K "Ex" cases with non-zero trailing ExtraTestSizes fields
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),  // fixed order, crossed with every entry of uploSet and transSet
+    Values(255), Values(253),  // two fixed sizes; presumably N and K -- TODO confirm against the SYR2K fixture
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));  // Ex_0: only the 4th ExtraTestSizes field is set
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, SYR2K, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(128), Values(64),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1)));  // Ex_1: only the 5th field is set
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, SYR2K, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(75), Values(200),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 502)), Values(1)));  // Ex_2: only the 6th field is set
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_3, SYR2K, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(111), Values(256),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 502)), Values(1)));  // Ex_3: 4th, 5th and 6th fields set together
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, SYR2K, Combine(  // row-major counterparts: same sizes and ExtraTestSizes as the column-major Ex_0..Ex_3 above
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(255), Values(253),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, SYR2K, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(128), Values(64),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, SYR2K, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(75), Values(200),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 502)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_3, SYR2K, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(111), Values(256),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 502)), Values(1)));
+
+#endif                              /* !SHORT_TESTS && !MEDIUM_TESTS */
+
+// Big matrices: SelectedBig_0 is built unless SHORT_TESTS is defined; SelectedBig_1..3 additionally require MEDIUM_TESTS to be undefined.
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans),  // every entry of orderSet, one fixed uplo/trans pair
+    Values(2801), Values(2903),  // two fixed sizes; presumably N and K -- TODO confirm against the SYR2K fixture
+    Values(clMath::ExtraTestSizes()), Values(1)));  // default-constructed ExtraTestSizes
+
+#if !defined(MEDIUM_TESTS)  // the three largest cases are full-tier only
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_1, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans),
+    Values(4567), Values(4321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans),
+    Values(5567), Values(5321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_3, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans),
+    Values(6567), Values(3321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif // !MEDIUM_TESTS
+#endif // !SHORT_TESTS
+
+// Small matrices and Custom tests: SelectedSmall_0 is built in every tier; SelectedSmall_1 needs !SHORT_TESTS; SelectedSmall_2..4 and Custom also need !MEDIUM_TESTS.
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans),
+    Values(1), Values(1),  // smallest case: both sizes are 1 (presumably N and K -- TODO confirm)
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans),
+    Values(2), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)  // remaining small cases and the Custom case are full-tier only
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans),
+    Values(3), Values(2),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_3, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans),
+    Values(4), Values(3),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_4, SYR2K, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans),
+    Values(17), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SYR2K,  Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),  // full cross product of order, uplo and trans
+    Values(32), Values(32),  // placeholder sizes written here; how the command line overrides them is not visible in this file section
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+#endif // DO_SYR2K
+
+#ifdef DO_HERK  // HERK parameter sets; compiled only when DO_HERK is defined
+/* Layout of the HERK test parameter tuple; every Combine() below passes one generator per field, in this order:
+ ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        clblasTranspose, // transA
+        int,                // N
+        int,                // K
+        ComplexLong,        // alpha
+        ComplexLong,        // beta
+        ExtraTestSizes,     // offa, offc, lda, ldc. NOTE(review): the constructor calls below pass six values -- confirm how they map onto these names
+        int                 // numCommandQueues
+*/
+#if !defined(SHORT_TESTS)  // SPL_HERK is built in both the MEDIUM_TESTS and the full configuration
+INSTANTIATE_TEST_CASE_P(SPL_HERK, HERK, Combine(
+    Values(clblasColumnMajor, clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    Values(513), Values(513), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),  // N = K = 513; alpha and beta both drawn from alphaBetaRange
+    Values(clMath::ExtraTestSizes()), Values(1)));
+#endif
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_HERK, HERK, Combine(
+    Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans),  // single order/uplo/trans combination
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_HERK, HERK, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasConjTrans),
+    Values(14), Values(15), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),  // fixed N = 14, K = 15
+    Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1)));  // only the 4th ExtraTestSizes field is non-zero
+
+#elif defined(MEDIUM_TESTS)  // medium tier: vary one of order / uplo / trans per instantiation
+INSTANTIATE_TEST_CASE_P(Order_HERK, HERK, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans),  // varies order only
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,0,10,0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(Uplo_HERK, HERK, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans),  // varies uplo only
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(Trans_HERK, HERK, Combine(
+    Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans, clblasConjTrans),  // varies transA only
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,0,10,0)), Values(1)));
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_HERK, HERK, Combine(  // full cross product over order, uplo, transA, N, K, alpha and beta
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,9,10,0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_HERK, HERK, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    Values(2510, 4300), Values(1500,4600), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),  // N in {2510, 4300} crossed with K in {1500, 4600}
+    Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1)));
+
+#endif      // Correctness
+
+#endif // DO_HERK
+
+
+#ifdef DO_HER2K  // HER2K parameter sets; compiled only when DO_HER2K is defined
+
+#if !defined(SHORT_TESTS)  // SPL_HER2K is built in both the MEDIUM_TESTS and the full configuration
+INSTANTIATE_TEST_CASE_P(SPL_HER2K, HER2K, Combine(  // nine generators per Combine(); presumably the HERK tuple layout (order, uplo, trans, N, K, alpha, beta, ExtraTestSizes, queues) -- TODO confirm
+    Values(clblasColumnMajor, clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    Values(513), Values(513), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+#endif
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_HER2K, HER2K, Combine(
+    Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans),  // single order/uplo/trans combination
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_HER2K, HER2K, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasConjTrans),
+    Values(14), Values(15), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1)));  // only the 4th ExtraTestSizes field is non-zero
+
+#elif defined(MEDIUM_TESTS)  // medium tier: vary one of order / uplo / trans per instantiation
+INSTANTIATE_TEST_CASE_P(Order_HER2K, HER2K, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans),  // varies order only
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,0,10,0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(Uplo_HER2K, HER2K, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans),  // varies uplo only
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(Trans_HER2K, HER2K, Combine(
+    Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans, clblasConjTrans),  // varies trans only
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,0,10,0)), Values(1)));
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_HER2K, HER2K, Combine(  // full cross product over every varied field
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes(0,0,0,9,10,0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_HER2K, HER2K, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    Values(2510, 4300), Values(1500,4600), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),  // two large values for each of the two size fields
+    Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1)));
+
+#endif      // Correctness
+
+#endif // DO_HER2K
+
+
+#ifdef DO_SYRK  // SYRK parameter sets; compiled only when DO_SYRK is defined
+// xSYRK tests: each Combine() passes order, uplo, trans, two sizes (presumably N and K -- TODO confirm), ExtraTestSizes and a final Values(1)
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, SYRK, Combine(  // built in every tier
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, SYRK, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+// We know that smallRange does not contain values greater than 257,
+// so lda is set to 500.
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, SYRK,  Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 0, 501, 0, 0, 0)), Values(1)));  // 1st field is the lda of 500 mentioned above; the 3rd (501) is presumably ldc -- TODO confirm
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, SYRK, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(500, 0, 501, 0, 0, 0)), Values(1)));
+
+// cases for the extended versions with the offsets (the non-zero values sit in the 4th-6th ExtraTestSizes fields)
+#if defined(SHORT_TESTS) || defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, SYRK, Combine(  // reduced tiers: one combined case per order
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(252), Values(353),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 702)), Values(1)));  // 4th and 6th fields set, 5th left at 0
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, SYRK, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(252), Values(353),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 702)), Values(1)));
+
+#else                               /* SHORT_TESTS || MEDIUM_TESTS */
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, SYRK, Combine(  // full tier: 4th field alone, 6th field alone, then both
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(255), Values(253),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));  // only the 4th field is set
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, SYRK, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(128), Values(64),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 501)), Values(1)));  // only the 6th field is set
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, SYRK, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(75), Values(200),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 501)), Values(1)));  // 4th and 6th fields set together
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, SYRK, Combine(  // row-major counterparts of Ex_0..Ex_2 above
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(255), Values(253),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, SYRK, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(128), Values(64),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 501)), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, SYRK, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(75), Values(200),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 501)), Values(1)));
+
+#endif                              /* !SHORT_TESTS && !MEDIUM_TESTS */
+
+// Big matrices: SelectedBig_0 needs !SHORT_TESTS; SelectedBig_1..3 also need !MEDIUM_TESTS
+#if !defined(SHORT_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans),
+    Values(2801), Values(2903),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_1, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans),
+    Values(4567), Values(4321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans),
+    Values(5567), Values(5321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedBig_3, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans),
+    Values(6567), Values(3321),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif // !MEDIUM_TESTS
+#endif // !SHORT_TESTS
+
+// Small matrices and Custom tests: SelectedSmall_0 is built in every tier; the rest are gated as for the big matrices
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans),
+    Values(1), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined SHORT_TESTS
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans),
+    Values(2), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#if !defined(MEDIUM_TESTS)
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans),
+    Values(3), Values(2),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_3, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans),
+    Values(4), Values(3),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+INSTANTIATE_TEST_CASE_P(SelectedSmall_4, SYRK, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans),
+    Values(17), Values(1),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SYRK,  Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),  // full cross product of order, uplo and trans
+    Values(32), Values(32),
+    Values(clMath::ExtraTestSizes()), Values(1)));
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+
+#endif // DO_SYRK
+
+
+#ifdef DO_TRMV  // TRMV parameter sets; compiled only when DO_TRMV is defined
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(ShortTRMV, TRMV, Combine(  // ten generators: order, uplo, trans, diag, a smallRange size, then five ints (presumably lda, incx, two offsets, queue count -- TODO confirm against the fixture)
+    Values(clblasRowMajor), Values(clblasLower),
+    Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0),
+    Values(1), Values(0), Values(0), Values(1)));
+
+#elif defined(MEDIUM_TESTS)  // medium tier: vary one of order / uplo / trans / diag per instantiation
+INSTANTIATE_TEST_CASE_P(Order_TRMV, TRMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower),  // varies order only
+    Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0),
+    Values(1), Values(0,9), Values(0), Values(1)));
+INSTANTIATE_TEST_CASE_P(Uplo_TRMV, TRMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),  // varies uplo only
+    Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0),
+    Values(1), Values(0), Values(0,10), Values(1)));
+INSTANTIATE_TEST_CASE_P(Trans_TRMV, TRMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower),
+    ValuesIn(transSet), Values(clblasUnit),ValuesIn(smallRange),Values(0),  // varies trans only
+    Values(1), Values(0,9), Values(0), Values(1)));
+INSTANTIATE_TEST_CASE_P(Diag_TRMV, TRMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower),
+    Values(clblasNoTrans), ValuesIn(diagSet), ValuesIn(smallRange),Values(0),  // varies diag only
+    Values(1), Values(0), Values(0,10), Values(1)));
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(All_TRMV, TRMV, Combine(  // full cross product; 6th generator adds 4097 next to 0 and the 7th draws from incs
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),Values(0,4097),
+    ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+
+#endif      // Correctness
+
+#endif
+
+#ifdef DO_TPMV  // TPMV parameter sets; compiled only when DO_TPMV is defined
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(ShortTPMV, TPMV, Combine(  // same ten-generator shape and values as the TRMV tables; field meaning beyond order/uplo/trans/diag is not visible here -- TODO confirm
+    Values(clblasRowMajor), Values(clblasLower),
+    Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0),
+    Values(1), Values(0), Values(0), Values(1)));
+
+#elif defined(MEDIUM_TESTS)  // medium tier: vary one of order / uplo / trans / diag per instantiation
+INSTANTIATE_TEST_CASE_P(Order_TPMV, TPMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower),  // varies order only
+    Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0),
+    Values(1), Values(0,9), Values(0), Values(1)));
+INSTANTIATE_TEST_CASE_P(Uplo_TPMV, TPMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),  // varies uplo only
+    Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0),
+    Values(1), Values(0), Values(0,10), Values(1)));
+INSTANTIATE_TEST_CASE_P(Trans_TPMV, TPMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower),
+    ValuesIn(transSet), Values(clblasUnit),ValuesIn(smallRange),Values(0),  // varies trans only
+    Values(1), Values(0,9), Values(0), Values(1)));
+INSTANTIATE_TEST_CASE_P(Diag_TPMV, TPMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower),
+    Values(clblasNoTrans), ValuesIn(diagSet), ValuesIn(smallRange),Values(0),  // varies diag only
+    Values(1), Values(0), Values(0,10), Values(1)));
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(All_TPMV, TPMV, Combine(  // full cross product over every set and range
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),Values(0,4097),
+    ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+
+#endif      // Correctness
+
+#endif
+
+#ifdef DO_TRSV  // TRSV parameter sets; compiled only when DO_TRSV is defined
+
+#ifdef SHORT_TESTS  // the SHORT and MEDIUM groups are independent #ifdefs, so both compile if both macros are defined
+
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeTRSV, TRSV, Combine(  // ten generators: order, uplo, trans, diag, a smallRange size, then five ints
+    Values(clblasRowMajor), Values(clblasUpper),
+    Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),
+    Values(0), Values(1),  Values(0), Values(0), Values(1)));  // presumably lda, incx, two offsets, queue count -- TODO confirm against the fixture
+
+#endif
+
+#ifdef MEDIUM_TESTS
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeTRSV, TRSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    Values(clblasTrans), Values(clblasNonUnit), ValuesIn(smallRange),
+    Values(0), Values(1),  Values(0), Values(0), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SmallRange_VariousIncTRSV, TRSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    Values(clblasNoTrans, clblasConjTrans), Values(clblasUnit), ValuesIn(smallRange),
+    Values(0), ValuesIn(incs),  Values(0), Values(0), Values(1)));  // 7th generator iterates over incs instead of the fixed 1
+
+#endif
+
+#if !defined SHORT_TESTS && !defined MEDIUM_TESTS  // full tier
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeTRSV, TRSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),
+    Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeTRSV, TRSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),
+    Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDATRSV, TRSV,  Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),
+    Values(500), Values(1),  Values(0,10), Values(0,9), Values(1)));  // differs from ColumnMajor_SmallRangeTRSV only in the 6th generator (500 instead of 0); going by the name this is lda -- TODO confirm
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDATRSV, TRSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),
+    Values(500), Values(1),  Values(0,10), Values(0,9), Values(1)));
+INSTANTIATE_TEST_CASE_P(SmallRange_VariousIncTRSV, TRSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));  // 7th generator iterates over incs
+
+#endif
+
+#endif
+
+#ifdef DO_TPSV  // TPSV parameter sets; compiled only when DO_TPSV is defined
+
+#ifdef SHORT_TESTS  // the SHORT and MEDIUM groups are independent #ifdefs, so both compile if both macros are defined
+
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeTPSV, TPSV, Combine(  // same ten-generator shape as the TRSV tables; meaning of the trailing ints is not visible here -- TODO confirm
+    Values(clblasRowMajor), Values(clblasUpper),
+    Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),
+    Values(0), Values(1),  Values(0), Values(0), Values(1)));
+
+#endif
+
+#ifdef MEDIUM_TESTS
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeTPSV, TPSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    Values(clblasTrans), Values(clblasNonUnit), ValuesIn(smallRange),
+    Values(0), Values(1),  Values(0), Values(0), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SmallRange_VariousIncTPSV, TPSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    Values(clblasNoTrans, clblasConjTrans), Values(clblasUnit), ValuesIn(smallRange),
+    Values(0), ValuesIn(incs),  Values(0), Values(0), Values(1)));  // 7th generator iterates over incs instead of the fixed 1
+
+#endif
+
+#if !defined SHORT_TESTS && !defined MEDIUM_TESTS  // full tier
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeTPSV, TPSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),
+    Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeTPSV, TPSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),
+    Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+INSTANTIATE_TEST_CASE_P(SmallRange_VariousIncTPSV, TPSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));  // 7th generator iterates over incs
+#endif
+
+#endif
+
+/*#ifdef DO_SYMM
+
+
+	  order = ::std::tr1::get<0>(GetParam());
+      side = ::std::tr1::get<1>(GetParam());
+      uplo = ::std::tr1::get<2>(GetParam());
+      M = ::std::tr1::get<3>(GetParam());
+      N = ::std::tr1::get<4>(GetParam());
+      lda = ::std::tr1::get<5>(GetParam());
+      ldb = ::std::tr1::get<6>(GetParam());
+  	  ldc = ::std::tr1::get<7>(GetParam());
+      offa = ::std::tr1::get<8>(GetParam());
+      numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeSYMM, SYMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(3192), Values(3192), Values(3192), Values(0),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeSYMM, SYMM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(3192), Values(3192), Values(3192), Values(0),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(ColumnMajor_VariousLDASYMM, SYMM, Combine(
+    Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    ValuesIn(ldaRange), ValuesIn(ldaRange), ValuesIn(ldaRange), Values(0),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(RowMajor_VariousLDASYMM, SYMM, Combine(
+    Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    ValuesIn(ldaRange), ValuesIn(ldaRange), ValuesIn(ldaRange), Values(0),
+    Values(1) ) );
+#endif
+*/
+
+#ifdef DO_SYR  // SYR parameter sets; compiled only when DO_SYR is defined
+/* Layout of the SYR test parameter tuple; every Combine() below passes one generator per field, in this order:
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        int,                // N
+        double,             // alpha
+        int,                // offx
+        int,                // incx, should be greater than 0
+        int,                // offa
+        int,                // lda, 0 - undefined
+        int                 // numCommandQueues
+*/
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_SYR, SYR, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );  // offx = 0, incx = 1, offa = 0, lda = 0 (undefined), one queue
+INSTANTIATE_TEST_CASE_P(SelectedSmall_SYR, SYR, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange),  // fixed N = 15 over every order/uplo pair
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_SYR, SYR, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),  // varies order only
+    Values(0,9), Values(1), Values(0,10), Values(0), Values(1) ) );  // offx in {0, 9}, offa in {0, 10}
+INSTANTIATE_TEST_CASE_P(Uplo_SYR, SYR, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),  // varies uplo only
+    Values(0,9), Values(1), Values(0,10), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_SYR, SYR, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(1500), ValuesIn(realAlphaRange),  // single large N = 1500
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+
+#else       // Correctness
+
+INSTANTIATE_TEST_CASE_P(ALL, SYR, Combine(ValuesIn(orderSet), ValuesIn(uploSet),  // full cross product over every set and range
+	ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs),
+	ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) );
+
+#endif
+
+#endif
+
+
+#ifdef DO_SPR  // SPR parameter sets; compiled only when DO_SPR is defined
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_SPR, SPR, Combine(  // nine generators, same shape as the SYR tables; presumably the SYR tuple layout -- TODO confirm against the SPR fixture
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall_SPR, SPR, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange),  // fixed size 15 over every order/uplo pair
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_SPR, SPR, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),  // varies order only
+    Values(0,9), Values(1), Values(0,10), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(Uplo_SPR, SPR, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),  // varies uplo only
+    Values(0,9), Values(1), Values(0,10), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_SPR, SPR, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 5101), ValuesIn(realAlphaRange),  // two large sizes
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(All_SPR, SPR, Combine(  // full cross product over every set and range
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) );
+
+#endif      // Correctness
+
+#endif
+
+#ifdef DO_GER  // GER parameter sets; compiled only when DO_GER is defined
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_GER, GER, Combine(  // ten generators: order, two sizes (presumably M and N -- TODO confirm), six ints, then a final Values(1)
+    Values(clblasRowMajor),ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(0), Values(1), Values(1), Values(0), Values(0), Values(0),  // per ALL_GER below these six slots draw from ldaRange, incs, incs and offsetRange x3
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_GER, GER, Combine(
+    ValuesIn(orderSet), Values(61), Values(32),
+    Values(0), Values(4,-11), Values(-30,1), Values(0), Values(0), Values(0),  // positive and negative values in the two slots that ALL_GER fills from incs
+    Values(1) ) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_GER, GER, Combine(
+    ValuesIn(orderSet), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(0), Values(-10), Values(21), Values(0,9), Values(0), Values(0),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig0_GER, GER, Combine(
+    ValuesIn(orderSet), Values(4900), Values(3999),  // one large size pair
+    Values(0), Values(4), Values(-33), Values(0), Values(0), Values(0),
+    Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_GER, GER, Combine(  // full cross product over every set and range
+    ValuesIn(orderSet), ValuesIn(smallRange), ValuesIn(smallRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(offsetRange),
+    Values(1) ) );
+
+#endif      // Correctness
+
+#endif
+
+
+#ifdef DO_GERC  // GERC parameter sets; compiled only when DO_GERC is defined
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_GERC, GERC, Combine(  // same ten-generator shape as the GER tables; field meaning is not visible here -- TODO confirm against the GERC fixture
+    Values(clblasRowMajor),ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(0), Values(1), Values(1), Values(0), Values(0), Values(0),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_GERC, GERC, Combine(
+    ValuesIn(orderSet), Values(61), Values(32),
+    Values(0), Values(4,-11), Values(-30,1), Values(0), Values(0), Values(0),  // positive and negative values in the two slots that ALL_GERC fills from incs
+    Values(1) ) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_GERC, GERC, Combine(
+    ValuesIn(orderSet), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(0), Values(-10), Values(21), Values(0,9), Values(0), Values(0,19),  // unlike Order_GER, the last offset slot also takes 19
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig0_GERC, GERC, Combine(
+    ValuesIn(orderSet), Values(4900), Values(3999),  // one large size pair
+    Values(0), Values(4), Values(-33), Values(0), Values(0), Values(0),
+    Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_GERC, GERC, Combine(  // full cross product over every set and range
+    ValuesIn(orderSet), ValuesIn(smallRange), ValuesIn(smallRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(offsetRange),
+    Values(1) ) );
+
+#endif      // Correctness
+
+#endif
+
+#ifdef DO_HER  // HER parameter sets; compiled only when DO_HER is defined
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_HER, HER, Combine(  // nine generators: order, uplo, a smallRange size, alpha, four ints, then a final Values(1)
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall_HER, HER, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange),  // fixed size 15 over every order/uplo pair
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_HER, HER, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),  // varies order only
+    Values(0), Values(1), Values(0,10), Values(0,9), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(Uplo_HER, HER, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),  // varies uplo only
+    Values(0), Values(1), Values(0,10), Values(0,9), Values(1) ) );
+
+#else       // Correctness
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeHER, HER, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),  // slots 5-8 draw from ldaRange, incs, offsetRange, offsetRange -- a different slot order from the SYR tables; confirm against the HER fixture
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeHER, HER, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(ColumnMajor_VariousLDAHER, HER, Combine(  // NOTE(review): generator list is identical to ColumnMajor_SmallRangeHER above -- confirm whether the duplication is intended
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(RowMajor_VariousLDAHER, HER, Combine(  // NOTE(review): generator list is identical to RowMajor_SmallRangeHER above
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(1) ) );
+#endif
+
+#endif
+
+#ifdef DO_HPR  // HPR parameter sets; compiled only when DO_HPR is defined
+
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_HPR, HPR, Combine(  // nine generators: order, uplo, a smallRange size, alpha, four ints, then a final Values(1)
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall_HPR, HPR, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange),  // fixed size 15 over every order/uplo pair
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_HPR, HPR, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),  // varies order only
+    Values(0), Values(1), Values(0,10), Values(0,9), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(Uplo_HPR, HPR, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),  // varies uplo only
+    Values(0), Values(1), Values(0,10), Values(0,9), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_HPR, HPR, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 5101), ValuesIn(realAlphaRange),  // two large sizes
+    Values(0), Values(1), Values(0), Values(0), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(All_HPR, HPR, Combine(  // full cross product; slots 5-8 draw from ldaRange, incs, offsetRange, offsetRange (same slot order as the HER tables)
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+#endif      // Correctness
+
+#endif
+
+
+#ifdef DO_HER2  // HER2 parameter sets; compiled only when DO_HER2 is defined
+#if defined(SHORT_TESTS)  // exactly one of the three tiers below (short / medium / full) is compiled
+INSTANTIATE_TEST_CASE_P(Short_HER2, HER2, Combine(  // ten generators: order, uplo, a smallRange size, complex alpha, five ints, then a final Values(1)
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall_HER2, HER2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(complexAlphaRange),  // fixed size 15 over every order/uplo pair
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_HER2, HER2, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange),  // varies order only
+    Values(0,9), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(Uplo_HER2, HER2, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange),  // varies uplo only
+    Values(0,10), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) );
+
+#else       // Correctness
+
+INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeHER2, HER2, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange),  // slots 5-9 draw from offsetRange, incs, offsetRange, offsetRange, ldaRange
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeHER2, HER2, Combine(
+        Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(ColumnMajor_VariousLDAHER2, HER2, Combine(  // NOTE(review): generator list is identical to ColumnMajor_SmallRangeHER2 above -- confirm whether the duplication is intended
+        Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(RowMajor_VariousLDAHER2, HER2, Combine(  // NOTE(review): generator list is identical to RowMajor_SmallRangeHER2 above
+        Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange),
+    Values(1) ) );
+
+#endif
+
+#endif
+
+#ifdef DO_HPR2
+// HPR2 parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep (whose Combine() slots are orderSet, uploSet, smallRange, complexAlphaRange, offsetRange, incs, offsetRange, offsetRange, ldaRange, constant 1).
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_HPR2, HPR2, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall_HPR2, HPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(complexAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_HPR2, HPR2, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    Values(0,9), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(Uplo_HPR2, HPR2, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    Values(0,10), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_HPR2, HPR2, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 5101), ValuesIn(complexAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(All_HPR2, HPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) );
+
+#endif      // Correctness
+
+#endif // DO_HPR2
+
+
+/*INSTANTIATE_TEST_CASE_P(ALL_HEMM_WITH_OFFSETS_ZERO, HEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)),
+    //Values(clMath::ExtraTestSizes(0, 0, 0, 12, 0, 1)),
+    Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(ALL_HEMM, HEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 12, 13, 15)),
+    Values(1) ) );
+
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, HEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    Values(5600), Values(5600),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)),
+    Values(1) ) );
+
+
+*/
+
+
+
+/*
+INSTANTIATE_TEST_CASE_P(SYMM_VERYSMALL, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(verySmallRange), ValuesIn(verySmallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)),
+    Values(1) ) );*/
+
+/*INSTANTIATE_TEST_CASE_P(ALL_SYMM, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 1, 3, 13)),
+    Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(ALL_SYMM_WITH_OFFSETS_ZERO, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)),
+    Values(1) ) );
+
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_0, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    Values(5600), Values(5600),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange),
+    Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)),
+    Values(1) ) );
+*/
+
+
+#ifdef DO_SYR2
+// SYR2 parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep (whose Combine() slots are orderSet, uploSet, smallRange, realAlphaRange, offsetRange, incs, offsetRange, offsetRange, ldaRange, constant 1).
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_SYR2, SYR2, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall_SYR2, SYR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_SYR2, SYR2, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0,9), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(Uplo_SYR2, SYR2, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0,10), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_SYR2, SYR2, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 2800), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+
+#else       // Correctness
+
+INSTANTIATE_TEST_CASE_P(ALL, SYR2, Combine(ValuesIn(orderSet), ValuesIn(uploSet),
+	ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs),
+	ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1)));
+
+#endif // SHORT_TESTS / MEDIUM_TESTS / Correctness
+
+#endif // DO_SYR2
+
+
+#ifdef DO_SPR2
+// SPR2 parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep (whose Combine() slots are orderSet, uploSet, smallRange, realAlphaRange, offsetRange, incs, offsetRange, offsetRange, ldaRange, constant 1).
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_SPR2, SPR2, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall_SPR2, SPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_SPR2, SPR2, Combine(
+    ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0,9), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(Uplo_SPR2, SPR2, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    Values(0,10), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_SPR2, SPR2, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 5101), ValuesIn(realAlphaRange),
+    Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(All_SPR2, SPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) );
+
+#endif      // Correctness
+
+#endif // DO_SPR2
+
+
+#ifdef DO_GBMV
+// GBMV parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep. Combine() slots: order, trans, four size values, one ExtraTestSizes, two alphaBetaRange values, constant 1.
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_GBMV, GBMV, Combine(
+    Values(clblasRowMajor), Values(clblasNoTrans),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_GBMV, GBMV, Combine(
+    ValuesIn(orderSet), Values(clblasConjTrans),
+    Values(14), Values(15), Values(10), Values(8),Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_GBMV, GBMV, Combine(
+    ValuesIn(orderSet), Values(clblasNoTrans),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Trans_GBMV, GBMV, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedVerySmall_GBMV, GBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet),
+    Values(1, 2, 4, 9), Values(3, 6, 11), Values(5), Values(7),Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_GBMV, GBMV, Combine(
+    Values(clblasRowMajor), ValuesIn(transSet), Values(2599), Values(999),
+    Values(2000), Values(565), Values(clMath::ExtraTestSizes(0,(int)30,(int)1,9,0,6)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_GBMV, GBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange),
+    ValuesIn(smallRange), ValuesIn(smallRange),Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig1_GBMV, GBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet), Values(2510, 2300), Values(1500,2400),
+    Values(2509, 2299), Values(1499,2399),Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#endif      // Correctness
+
+#endif // DO_GBMV
+
+
+
+#ifdef DO_SBMV
+// SBMV parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep. Combine() slots: order, uplo, two size values, one ExtraTestSizes, two alphaBetaRange values, constant 1.
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_SBMV, SBMV, Combine(
+    Values(clblasRowMajor), Values(clblasUpper),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_SBMV, SBMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower),
+    Values(14), Values(10), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_SBMV, SBMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+// The prefix uses a single underscore: the macro pastes it into a generated identifier, and identifiers containing "__" are reserved in C++.
+INSTANTIATE_TEST_CASE_P(Uplo_SBMV, SBMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedVerySmall_SBMV, SBMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    Values(7), Values(5),Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_SBMV, SBMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower),
+    Values(2000), Values(565), Values(clMath::ExtraTestSizes(0,(int)30,(int)1,9,0,6)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_SBMV, SBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig1_SBMV, SBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(2510, 2300), Values(1500,1700),
+    Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#endif      // Correctness
+
+#endif // DO_SBMV
+
+// HBMV parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep. Combine() slots: order, uplo, two size values, one ExtraTestSizes, two alphaBetaRange values, constant 1.
+#ifdef DO_HBMV
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_HBMV, HBMV, Combine(
+    Values(clblasRowMajor), Values(clblasUpper),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_HBMV, HBMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower),
+    Values(14), Values(10), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_HBMV, HBMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+// NOTE(review): despite the "Trans_" prefix, the next instantiation has no trans slot and pins uplo to clblasLower; the SBMV equivalent passes ValuesIn(uploSet) -- confirm whether that was intended here too.
+INSTANTIATE_TEST_CASE_P(Trans_HBMV, HBMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower),
+    ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedVerySmall_HBMV, HBMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper),
+    Values(7), Values(5),Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_HBMV, HBMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower),
+    Values(2000), Values(565), Values(clMath::ExtraTestSizes(0,(int)30,(int)1,9,0,6)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_HBMV, HBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange),
+    Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig1_HBMV, HBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(2510, 2300), Values(1500,1700),
+    Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+#endif      // Correctness
+
+#endif // DO_HBMV
+
+
+#ifdef DO_TBMV
+// TBMV parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep. Combine() slots: order, uplo, trans, diag, two size values, one ExtraTestSizes, constant 1.
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_TBMV, TBMV, Combine(
+    Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_TBMV, TBMV, Combine(
+    ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans), Values(clblasUnit),
+    Values(14), Values(13), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Order_TBMV, TBMV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange),Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Uplo_TBMV, TBMV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 10)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Trans_TBMV, TBMV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(transSet), Values(clblasUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Diag_TBMV, TBMV, Combine(
+    Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 8, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedVerySmall_TBMV, TBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    Values(1, 2, 4, 9), Values(3), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1)));
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_TBMV, TBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_TBMV, TBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    Values(2509, 2299), Values(1499,2199), Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)), Values(1)));
+
+#endif      // Correctness
+
+#endif // DO_TBMV
+
+
+#ifdef DO_TBSV
+// TBSV parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep. Combine() slots: order, uplo, trans, diag, two size values, one ExtraTestSizes, constant 1.
+#if defined(SHORT_TESTS)
+/*
+INSTANTIATE_TEST_CASE_P(Short_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_TBSV, TBSV, Combine(
+    ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans), Values(clblasUnit),
+    Values(14), Values(13), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1)));
+*/
+
+INSTANTIATE_TEST_CASE_P(Short_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasUnit),
+    Values(14), Values(13), Values(ExtraTestSizes(0, (int)-2, (int)1, 9, 0, 0)), Values(1)));
+
+#elif defined(MEDIUM_TESTS)
+// NOTE(review): every active MEDIUM_TESTS instantiation pins order/uplo/trans to RowMajor/Lower/NoTrans regardless of its Order_/Uplo_/Trans_ prefix (only diag varies, in Diag_ and SelectedVerySmall_); the broader variants are commented out further down -- the reason is not visible here.
+INSTANTIATE_TEST_CASE_P(Order_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange),Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Uplo_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 10)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Trans_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Diag_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 8, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedVerySmall_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), ValuesIn(diagSet),
+    Values(1, 2, 4, 9), Values(3), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1)));
+/*
+INSTANTIATE_TEST_CASE_P(Order_TBSV, TBSV, Combine(
+    ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange),Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Uplo_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasNonUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 10)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Trans_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasLower), ValuesIn(transSet), Values(clblasUnit),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Diag_TBSV, TBSV, Combine(
+    Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 8, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedVerySmall_TBSV, TBSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    Values(1, 2, 4, 9), Values(3), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1)));
+*/
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_TBSV, TBSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_TBSV, TBSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    Values(2509, 2299), Values(1499,2199), Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)), Values(1)));
+
+#endif      // Correctness
+
+#endif // DO_TBSV
+
+// COPY parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep (whose Combine() slots are completeRange, incs, incs, offsetRange, offsetRange, constant 1).
+
+#ifdef DO_COPY
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_COPY, COPY, Combine(
+    ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1), Values(1)) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_COPY, COPY, Combine(
+    Values(61), Values(4, -11), Values(1), Values(0), Values(1), Values(1) ) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_COPY, COPY, Combine(
+    ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_COPY, COPY, Combine(
+    Values(4900), Values(1), Values(1), Values(4), Values(1), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_COPY, COPY, Combine(
+    ValuesIn(completeRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+#endif      // Correctness
+
+#endif // DO_COPY
+
+// DOT parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep (whose Combine() slots are completeRange, incs, incs, offsetRange, offsetRange, offsetRange, constant 1).
+
+#ifdef DO_DOT
+
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_DOT, DOT, Combine(
+    ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1), Values(1), Values(1)) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_DOT, DOT, Combine(
+    Values(61), Values(4, -11), Values(1), Values(0), Values(1), Values(1) , Values(1)) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_DOT, DOT, Combine(
+    ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1), Values(1), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_DOT, DOT, Combine(
+    Values(4900), Values(1), Values(1), Values(4), Values(1), Values(1), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_DOT, DOT, Combine(
+    ValuesIn(completeRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+#endif      // Correctness
+
+#endif // DO_DOT
+
+#ifdef DO_DOTC
+// DOTC parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep (whose Combine() slots are completeRange, incs, incs, offsetRange, offsetRange, offsetRange, constant 1).
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_DOTC, DOTC, Combine(
+    ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1), Values(1), Values(1)) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_DOTC, DOTC, Combine(
+    Values(61), Values(4, -11), Values(1), Values(0), Values(1), Values(1) , Values(1)) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_DOTC, DOTC, Combine(
+    ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1), Values(1), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_DOTC, DOTC, Combine(
+    Values(4900), Values(1), Values(1), Values(4), Values(1), Values(1), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_DOTC, DOTC, Combine(
+    ValuesIn(completeRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+#endif      // Correctness
+
+#endif // DO_DOTC
+
+
+#ifdef DO_SCAL
+// SCAL parameter sets; exactly one tier is compiled: SHORT_TESTS, else MEDIUM_TESTS, else the Correctness sweep (whose Combine() slots are completeRange, alphaBetaRange, offsetRange, incs, constant 1).
+#if defined(SHORT_TESTS)
+INSTANTIATE_TEST_CASE_P(Short_SCAL, SCAL, Combine(
+    ValuesIn(smallRange), ValuesIn(alphaBetaRange), Values(0), Values(1), Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedSmall0_SCAL, SCAL, Combine(
+    Values(61), ValuesIn(alphaBetaRange), Values(0), Values(4,-11), Values(1) ) );
+
+
+#elif defined(MEDIUM_TESTS)
+INSTANTIATE_TEST_CASE_P(Medium_SCAL, SCAL, Combine(
+    ValuesIn(smallRange), ValuesIn(alphaBetaRange), Values(0), Values(-10), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(SelectedBig0_SCAL, SCAL, Combine(
+    Values(4900), ValuesIn(alphaBetaRange), Values(0), Values(4), Values(1) ) );
+
+#else       // Correctness
+INSTANTIATE_TEST_CASE_P(ALL_SCAL, SCAL, Combine(
+    ValuesIn(completeRange), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incs), Values(1) ) );
+
+#endif      // Correctness
+
+#endif // DO_SCAL
+
+
+// Big matrices
+#if !defined SHORT_TESTS
+
+#ifdef DO_TRMV // TRMV with clblasTrans at size values 2800 and 4567; compiled only when SHORT_TESTS is not defined (enclosing #if)
+INSTANTIATE_TEST_CASE_P(SelectedBig_0TRMV, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasTrans), ValuesIn(diagSet),Values(2800),
+    Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_1TRMV, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasTrans), ValuesIn(diagSet),Values(4567),
+    Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+#endif // DO_TRMV
+
+#ifdef DO_TRSV // single column-major, clblasTrans TRSV case at size value 2800; compiled only when SHORT_TESTS is not defined (enclosing #if)
+INSTANTIATE_TEST_CASE_P(SelectedBig_0TRSV, TRSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    Values(clblasTrans), ValuesIn(diagSet),Values(2800),
+    Values(0), Values(1),  Values(0), Values(0), Values(1)));
+
+#endif // DO_TRSV
+
+#ifdef DO_TPSV // single column-major, clblasTrans TPSV case at size value 2800; compiled only when SHORT_TESTS is not defined (enclosing #if)
+INSTANTIATE_TEST_CASE_P(SelectedBig_0TPSV, TPSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    Values(clblasTrans), ValuesIn(diagSet),Values(2800),
+    Values(0), Values(1),  Values(0), Values(0), Values(1)));
+#endif // DO_TPSV
+
+#ifdef DO_HER // single column-major HER case at size value 2800 with the alpha slot fixed to (double)50; compiled only when SHORT_TESTS is not defined (enclosing #if)
+INSTANTIATE_TEST_CASE_P(SelectedBig_0HER, HER, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), Values(2800), Values((double)50),
+    Values(0), Values(1), Values(0), Values(0),
+    Values(1) ) );
+#endif // DO_HER
+
+
+#ifdef DO_HER2 // single column-major HER2 case at size value 2800 with the alpha slot fixed to floatComplex(0,1); compiled only when SHORT_TESTS is not defined (enclosing #if)
+INSTANTIATE_TEST_CASE_P(SelectedBig_0HER2, HER2, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), Values(2800), Values((cl_float2)floatComplex(0,1)),
+        Values(0), Values(1), Values(0),
+        Values(0), Values(0),Values(1) ) );
+#endif // DO_HER2
+
+
+#if !defined(MEDIUM_TESTS)
+
+#ifdef DO_TRMV // TRMV with clblasNoTrans at size values 5567, 6567 and 7567; compiled only when neither SHORT_TESTS nor MEDIUM_TESTS is defined (enclosing #if blocks)
+INSTANTIATE_TEST_CASE_P(SelectedBig_2TRMV, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(5567),
+    Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_3TRMV, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(6567),
+    Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_4TRMV, TRMV, Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(7567),
+    Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+#endif // DO_TRMV
+
+#ifdef DO_TPMV // TPMV with clblasNoTrans at size values 5567, 6567 and 7567; compiled only when neither SHORT_TESTS nor MEDIUM_TESTS is defined (enclosing #if blocks)
+INSTANTIATE_TEST_CASE_P(SelectedBig_2TPMV, TPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(5567),Values(0),
+    ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_3TPMV, TPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(6567),Values(0),
+    ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_4TPMV, TPMV, Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet),
+   Values(clblasNoTrans), ValuesIn(diagSet), Values(7567),Values(0),
+   ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1)));
+#endif // DO_TPMV
+
+
+#ifdef DO_TRSV // TRSV at size values 4567 (clblasTrans) and 5567, 6567, 7567 (clblasNoTrans); compiled only when neither SHORT_TESTS nor MEDIUM_TESTS is defined (enclosing #if blocks)
+INSTANTIATE_TEST_CASE_P(SelectedBig_1TRSV, TRSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasTrans), ValuesIn(diagSet),Values(4567),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2TRSV, TRSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(5567),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_3TRSV, TRSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(6567),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_4TRSV, TRSV, Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(7567),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));
+#endif // DO_TRSV
+
+#ifdef DO_TPSV // TPSV at size values 4567 (clblasTrans) and 5567, 6567, 7567 (clblasNoTrans); compiled only when neither SHORT_TESTS nor MEDIUM_TESTS is defined (enclosing #if blocks)
+INSTANTIATE_TEST_CASE_P(SelectedBig_1TPSV, TPSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasTrans), ValuesIn(diagSet),Values(4567),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_2TPSV, TPSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(5567),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_3TPSV, TPSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(6567),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(SelectedBig_4TPSV, TPSV, Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(7567),
+    Values(0), ValuesIn(incs),  Values(0,10), Values(0,9), Values(1)));
+#endif // DO_TPSV
+
+
+#ifdef DO_HER // NOTE(review): SelectedBig_1HER and SelectedBig_3HER are identical (both use size value 3192), so that sweep runs twice -- confirm the size intended for one of them
+INSTANTIATE_TEST_CASE_P(SelectedBig_1HER, HER, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(3192), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_2HER, HER, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(2048), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_3HER, HER, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(3192), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_4HER, HER, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(2055), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(1) ) );
+#endif // DO_HER
+
+#ifdef DO_HER2
+INSTANTIATE_TEST_CASE_P(SelectedBig_1HER2, HER2, Combine(
+        ValuesIn(orderSet), ValuesIn(uploSet), Values(3192), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),
+    ValuesIn(offsetRange), ValuesIn(ldaRange),Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_2HER2, HER2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(2048), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+        ValuesIn(ldaRange),Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_3HER2, HER2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(3192), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    ValuesIn(ldaRange),Values(1) ) );
+INSTANTIATE_TEST_CASE_P(SelectedBig_4HER2, HER2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(2055), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),
+    ValuesIn(ldaRange),Values(1) ) );
+#endif
+
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+
+// Small matrices
+// The SelectedSmall_0* cases sit between the !SHORT_TESTS regions that end above and begin again below.
+#ifdef DO_TRMV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0TRMV, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(1),
+    Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1)));
+#endif
+
+#ifdef DO_TPMV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0TPMV, TPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(1),Values(0),
+    Values(1), Values(0, 10), Values(0, 9), Values(1)));
+#endif
+
+#ifdef DO_TRSV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0TRSV, TRSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    Values(clblasNoTrans), Values(clblasNonUnit), Values(1),
+    Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+#endif
+
+#ifdef DO_TPSV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0TPSV, TPSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    Values(clblasNoTrans), Values(clblasNonUnit), Values(1),
+    Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+#endif
+
+// Unlike the triangular cases above, HER and HER2 use Values(4) as their third generator and sweep ValuesIn(incs).
+#ifdef DO_HER
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0HER, HER, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), Values(4), ValuesIn(realAlphaRange),
+    Values(0), ValuesIn(incs), Values(0,9), Values(0,11),
+    Values(1) ) );
+#endif
+
+#ifdef DO_HER2
+INSTANTIATE_TEST_CASE_P(SelectedSmall_0HER2, HER2, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet), Values(4), ValuesIn(complexAlphaRange),
+    Values(0,7), ValuesIn(incs), Values(0,9), Values(0,11),
+    Values(0),Values(1) ) );
+#endif
+
+
+#if !defined SHORT_TESTS
+// SelectedSmall_1*: compiled unless SHORT_TESTS is defined. NOTE(review): there is no SelectedSmall_1TPSV case.
+#ifdef DO_TRMV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1TRMV, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(2),
+    Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1)));
+#endif
+
+#ifdef DO_TPMV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1TPMV, TPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(2),Values(0),
+    Values(1), Values(0, 10), Values(0, 9), Values(1)));
+#endif
+
+#ifdef DO_TRSV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1TRSV, TRSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    Values(clblasNoTrans), Values(clblasUnit), Values(2),
+    Values(0), Values(1),  Values(10), Values(9), Values(1)));
+#endif
+
+
+#ifdef DO_HER
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1HER, HER, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet), Values(12), ValuesIn(realAlphaRange),
+    Values(0), ValuesIn(incs), Values(0), Values(1),
+    Values(1) ) );
+
+#endif
+
+#ifdef DO_HER2
+INSTANTIATE_TEST_CASE_P(SelectedSmall_1HER2, HER2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(12), ValuesIn(complexAlphaRange),
+    Values(0,1), ValuesIn(incs), Values(0),Values(9),
+    Values(0),Values(1) ) );
+#endif
+
+
+#if !defined(MEDIUM_TESTS)
+// SelectedSmall_2*: compiled only when neither SHORT_TESTS nor MEDIUM_TESTS is defined.
+#ifdef DO_TRMV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2TRMV, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(13),
+    Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1)));
+#endif
+
+#ifdef DO_TPMV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2TPMV, TPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(clblasNoTrans), ValuesIn(diagSet), Values(13),Values(0),
+    Values(1), Values(0, 10), Values(0, 9), Values(1)));
+#endif
+
+#ifdef DO_TRSV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2TRSV, TRSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    Values(clblasNoTrans), Values(clblasNonUnit), Values(13),
+    Values(0), Values(1), Values(0,10), Values(0,9), Values(1)));
+#endif
+
+#ifdef DO_TPSV
+INSTANTIATE_TEST_CASE_P(SelectedSmall_2TPSV, TPSV, Combine(
+    Values(clblasRowMajor), ValuesIn(uploSet),
+    Values(clblasTrans), Values(clblasUnit), Values(13),
+    Values(0), Values(1), Values(0,10), Values(0,9), Values(1)));
+#endif
+
+// NOTE(review): the next two instance names (SelectedSmallHER_2HER, SelectedSmallHER2_2HER2) deviate from the SelectedSmall_<n><ROUTINE> pattern used elsewhere.
+#ifdef DO_HER
+INSTANTIATE_TEST_CASE_P(SelectedSmallHER_2HER, HER, Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet), Values(65), ValuesIn(realAlphaRange),
+    Values(0), ValuesIn(incs), Values(0), Values(0),
+    Values(1) ) );
+
+#endif
+
+#ifdef DO_HER2
+INSTANTIATE_TEST_CASE_P(SelectedSmallHER2_2HER2, HER2, Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet), Values(65), ValuesIn(complexAlphaRange),
+   Values(0), ValuesIn(incs), Values(0), Values(0),
+   Values(0), Values(1) ) );
+#endif
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+
+// Custom test - use command line arguments to tweak it
+#if !defined SHORT_TESTS && !defined MEDIUM_TESTS
+#ifdef DO_TRMV
+INSTANTIATE_TEST_CASE_P(Custom, TRMV,  Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet), Values(32),
+    Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1)));
+#endif
+
+#ifdef DO_TRSV
+INSTANTIATE_TEST_CASE_P(Custom, TRSV,  Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet), Values(32),
+    Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+#endif
+
+#ifdef DO_TPSV
+INSTANTIATE_TEST_CASE_P(Custom, TPSV,  Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet), Values(32),
+    Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+#endif
+
+#ifdef DO_GER
+INSTANTIATE_TEST_CASE_P(Custom, GER, Combine(
+    ValuesIn(orderSet),
+    Values(32), Values(32),
+    Values(0), Values(1), Values(1), Values(0, 9), Values(0, 11), Values(0, 10),
+    Values(1) ) );
+#endif
+
+#ifdef DO_GERC
+INSTANTIATE_TEST_CASE_P(Custom, GERC, Combine(
+    ValuesIn(orderSet),
+    Values(32), Values(32),
+    Values(0), Values(1), Values(1), Values(0, 9), Values(0, 11), Values(0, 10),
+    Values(1) ) );
+#endif
+
+#ifdef DO_HER
+INSTANTIATE_TEST_CASE_P(Custom, HER, Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet), Values(32), Values(99.0),
+    Values(0), Values(1), Values(6, 2), Values(0, 5),
+    Values(1) ) );
+
+#endif
+
+#ifdef DO_HER2
+INSTANTIATE_TEST_CASE_P(Custom, HER2, Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet), Values(32), ValuesIn(complexAlphaRange),
+   Values(0), Values(1), Values(0), Values(0),Values(40), Values(1) ) );
+#endif
+
+#endif /* !SHORT_TESTS && !MEDIUM_TESTS */
+// Multiple command queues tests
+// QUEUES_TEST_MATRIX_SIZES is pasted into Values(...): one size for the short and medium tiers, two sizes otherwise.
+#if defined SHORT_TESTS
+#define QUEUES_TEST_MATRIX_SIZES 257
+#elif defined MEDIUM_TESTS
+#define QUEUES_TEST_MATRIX_SIZES 385
+#else
+#define QUEUES_TEST_MATRIX_SIZES 513,1025
+#endif
+
+#if !defined(SHORT_TESTS)
+// Every MultipleQueues instantiation sweeps ValuesIn(numQueues) as its last generator; all of them are skipped in SHORT_TESTS builds.
+#ifdef DO_GEMM
+INSTANTIATE_TEST_CASE_P(MultipleQueues, GEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(transSet),
+    Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(clMath::ExtraTestSizes()), ValuesIn(numQueues)));
+#endif
+
+#if !defined(MEDIUM_TESTS)
+
+
+#ifdef DO_TRMM
+
+INSTANTIATE_TEST_CASE_P(MultipleQueues, TRMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(clMath::ExtraTestSizes()), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_TRSM
+INSTANTIATE_TEST_CASE_P(MultipleQueues, TRSM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(clMath::ExtraTestSizes()), ValuesIn(numQueues)));
+#endif
+
+#endif                      /* !MEDIUM_TESTS */
+
+
+#ifdef DO_GEMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, GEMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_SYMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SYMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_SYR2K
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SYR2K, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(clMath::ExtraTestSizes()), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_SYRK
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SYRK, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(clMath::ExtraTestSizes()), ValuesIn(numQueues)));
+#endif
+
+#if !defined MEDIUM_TESTS
+// The remaining routines are instantiated only in the full tier (neither SHORT_TESTS nor MEDIUM_TESTS defined).
+#ifdef DO_HERK
+INSTANTIATE_TEST_CASE_P(MultipleQueues, HERK, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes()), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_HER2K
+INSTANTIATE_TEST_CASE_P(MultipleQueues, HER2K, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(clMath::ExtraTestSizes()), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_TRMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(0), Values(1), Values(0, 10), Values(0, 9), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_TPMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, TPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(0), Values(1), Values(0, 10), Values(0, 9), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_HEMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, HEMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange),
+    ValuesIn(alphaBetaRange), Values(0, 10), Values(0, 9), Values(0, 8), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_HPMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, HPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange),
+    ValuesIn(alphaBetaRange), Values(0, 10), Values(0, 9), Values(0, 8), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues)));
+#endif
+
+
+#ifdef DO_SPMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange),
+    ValuesIn(alphaBetaRange), Values(0, 10), Values(0, 9), Values(0, 8), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_TRSV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, TRSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    Values(clblasConjTrans), ValuesIn(diagSet),Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(0), Values(1),  Values(0,10), Values(0,9), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_TPSV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, TPSV, Combine(
+    Values(clblasColumnMajor), ValuesIn(uploSet),
+    Values(clblasTrans), ValuesIn(diagSet),Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(0), Values(1),  Values(0,10), Values(0,9), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_SYR
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SYR, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange),
+	ValuesIn(offsetRange), ValuesIn(incs),
+	ValuesIn(offsetRange), ValuesIn(ldaRange), ValuesIn(numQueues) ) );
+
+#endif
+
+#ifdef DO_SPR
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SPR, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange),
+	ValuesIn(offsetRange), ValuesIn(incs),
+	ValuesIn(offsetRange), ValuesIn(ldaRange), ValuesIn(numQueues) ) );
+
+#endif
+
+#ifdef DO_GER
+INSTANTIATE_TEST_CASE_P(MultipleQueues, GER, Combine(
+    ValuesIn(orderSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(0), Values(1), Values(1), Values(0, 9), Values(0, 11), Values(0, 10),
+    ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_GERC
+INSTANTIATE_TEST_CASE_P(MultipleQueues, GERC, Combine(
+    ValuesIn(orderSet),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(0), Values(1), Values(1), Values(0, 9), Values(0, 11), Values(0, 10),
+    ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_HER
+INSTANTIATE_TEST_CASE_P(MultipleQueues, HER, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), Values(1), Values(0), Values(0),
+    ValuesIn(numQueues) ) );
+
+#endif
+
+#ifdef DO_HPR
+INSTANTIATE_TEST_CASE_P(MultipleQueues, HPR, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), Values(1), Values(0), Values(0),
+    ValuesIn(numQueues) ) );
+
+#endif
+
+#ifdef DO_HER2
+INSTANTIATE_TEST_CASE_P(MultipleQueues, HER2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(complexAlphaRange),
+    Values(0), Values(1), Values(0),Values(1), ValuesIn(ldaRange),
+    ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_HPR2
+INSTANTIATE_TEST_CASE_P(MultipleQueues, HPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(complexAlphaRange),
+    Values(0), Values(1), Values(0),Values(1), ValuesIn(ldaRange),
+    ValuesIn(numQueues) ) );
+#endif
+
+// NOTE(review): the DO_SYR2 guard below is empty - no MultipleQueues instantiation is provided for SYR2.
+#ifdef DO_SYR2
+#endif
+
+#ifdef DO_SPR2
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SPR2, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange),
+	ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),
+	ValuesIn(offsetRange), ValuesIn(ldaRange), ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_GBMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, GBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),Values(clMath::ExtraTestSizes(0,(int)1,(int)1,0,0,0)),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_TBMV
+INSTANTIATE_TEST_CASE_P(MultipleQueues, TBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(QUEUES_TEST_MATRIX_SIZES),
+    Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes(0,(int)1,(int)1,0,0,0)), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_SCAL
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SCAL, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_COPY
+INSTANTIATE_TEST_CASE_P(MultipleQueues, COPY, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_SWAP
+INSTANTIATE_TEST_CASE_P(MultipleQueues, SWAPXY, Combine(
+        Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(offsetRange), ValuesIn(incs),
+        ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_DOT
+INSTANTIATE_TEST_CASE_P(MultipleQueues, DOT, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(incs),
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_DOTC
+INSTANTIATE_TEST_CASE_P(MultipleQueues, DOTC, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(incs),
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_AXPY
+INSTANTIATE_TEST_CASE_P(MultipleQueues, AXPY, Combine(
+        Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_ROTG
+INSTANTIATE_TEST_CASE_P(MultipleQueues, ROTG, Combine(
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_ROTM
+INSTANTIATE_TEST_CASE_P(MultipleQueues, ROTM, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs),
+    ValuesIn(offsetRange), ValuesIn(sflagRange), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_ROT
+INSTANTIATE_TEST_CASE_P(MultipleQueues, ROT, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs),
+    ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_ROTMG
+INSTANTIATE_TEST_CASE_P(MultipleQueues, ROTMG, Combine(
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    ValuesIn(offsetRange), ValuesIn(sflagRange), ValuesIn(numQueues)));
+#endif
+
+#ifdef DO_NRM2
+INSTANTIATE_TEST_CASE_P(MultipleQueues, NRM2, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs),
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_ASUM
+INSTANTIATE_TEST_CASE_P(MultipleQueues, ASUM, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs),
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) );
+#endif
+
+#ifdef DO_iAMAX
+INSTANTIATE_TEST_CASE_P(MultipleQueues, iAMAX, Combine(
+    Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs),
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) );
+#endif
+
+#endif /* !MEDIUM_TESTS */
+#endif /* !SHORT_TESTS */
+
+#undef QUEUES_TEST_MATRIX_SIZES
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Entry point of the correctness test suite: handles the help request, sets
+ * up the shared BlasBase instance from command-line and environment options,
+ * runs all registered Google Test cases and returns their result.
+ */
+int
+main(int argc, char *argv[])
+{
+    ::clMath::BlasBase *base;
+    TestParams params;
+    int ret;
+
+    if( (argc > 1) && ( !strcmp(argv[1], "--test-help") || !strcmp(argv[1], "-?") || !strcmp(argv[1], "-h") ) ) {
+        printUsage("test-correctness");
+        ::testing::InitGoogleTest(&argc, argv);   // presumably lets Google Test react to the same help flag
+        return 0;
+    }
+
+    // The library reads AMD_CLBLAS_KCACHE_LIMIT_MB to limit or disable ( 0 ) its in-memory kernel
+    // cache; unless the user already set it, pick a reasonable default for this test program.
+    if( getenv( "AMD_CLBLAS_KCACHE_LIMIT_MB" ) == NULL ) {
+#if !defined( SHORT_TESTS )
+        putenv( (char*)"AMD_CLBLAS_KCACHE_LIMIT_MB=256" );
+#endif
+    }
+
+    ::testing::InitGoogleTest(&argc, argv);
+    ::std::cerr << "Initialize OpenCL and clblas..." << ::std::endl;
+    base = ::clMath::BlasBase::getInstance();
+    if (base == NULL) {
+        ::std::cerr << "Fatal error, OpenCL or clblas initialization failed! "
+                       "Leaving the test." << ::std::endl;
+        return -1;
+    }
+
+    base->setSeed(DEFAULT_SEED);
+    // Defaults are set even without arguments: optFlags is tested after parseEnv() below.
+    params.optFlags = NO_FLAGS;
+    params.devType = CL_DEVICE_TYPE_GPU;
+    params.devName = NULL;
+    if (argc != 1) {
+        if (parseBlasCmdLineArgs(argc, argv, &params) != 0) {
+            printUsage(argv[0]);
+            return 1;
+        }
+        if (params.optFlags & SET_SEED) {
+            base->setSeed(params.seed);
+        }
+        if (params.optFlags & SET_ALPHA) {
+            base->setAlpha(params.alpha);
+        }
+        if (params.optFlags & SET_BETA) {
+            base->setBeta(params.beta);
+        }
+        if (params.optFlags & SET_M) {
+            base->setM(params.M);
+        }
+        if (params.optFlags & SET_N) {
+            base->setN(params.N);
+        }
+        if (params.optFlags & SET_K) {
+            base->setK(params.K);
+        }
+        if (params.optFlags & SET_INCX) {
+            base->setIncX(params.incx);
+        }
+        if (params.optFlags & SET_INCY) {
+            base->setIncY(params.incy);
+        }
+        if (params.optFlags & SET_DEVICE_TYPE) {
+            if (!base->setDeviceType(&params.devType, params.devName)) {
+                ::std::cerr << "Fatal error, OpenCL or clblas "
+                        "initialization failed! Leaving the test." <<
+                        ::std::endl;
+                return -1;
+            }
+        }
+        if (params.optFlags & SET_NUM_COMMAND_QUEUES) {
+            base->setNumCommandQueues(params.numCommandQueues);
+        }
+    }
+
+    parseEnv(&params);
+    if (params.optFlags & SET_USE_IMAGES) {
+        base->setUseImages(params.useImages);
+    }
+
+    /* Use of image based buffers is deprecated
+    if (base->useImages()) {
+        if (base->addScratchImages()) {
+            std::cerr << "FATAL ERROR, CANNOT CREATE SCRATCH IMAGES!" << std::endl;
+        }
+    }
+    */
+
+    base->printEnvInfo();
+    ret = RUN_ALL_TESTS();
+
+    if (base->useImages()) {
+        base->removeScratchImages();
+    }
+
+    /*
+     * Explicitly tell the singleton to release all resources,
+     * before we return from main.
+     */
+    base->release( );
+
+    return ret;
+}
diff --git a/src/tests/correctness/trsm-delta.h b/src/tests/correctness/trsm-delta.h
new file mode 100644
index 0000000..2967344
--- /dev/null
+++ b/src/tests/correctness/trsm-delta.h
@@ -0,0 +1,240 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <blas-math.h>
+#include <common.h>
+
+// Type-dependent constants: DELTA_0<T>() is pow(2, -20) for cl_float/FloatComplex and pow(2, -50) for cl_double/DoubleComplex
+template <class T>
+static cl_double DELTA_0();
+template<>
+__template_static cl_double DELTA_0<cl_float>()       { return pow(2.0, -20); }
+template<>
+__template_static cl_double DELTA_0<cl_double>()      { return pow(2.0, -50); }
+template<>
+__template_static cl_double DELTA_0<FloatComplex>()   { return pow(2.0, -20); }
+template<>
+__template_static cl_double DELTA_0<DoubleComplex>()  { return pow(2.0, -50); }
+
+size_t
+trsmBlockSize(size_t elemSize)
+{
+    /* Block size (in elements) assumed for the TRSM generators: 16 for the
+     * double complex type, 32 for every other type.
+     * TODO: fetch it from the TRSM generator if that ever changes.
+     * NOTE(review): non-static definition in a header; trsvBlockSize() is static.
+     */
+    return (elemSize != sizeof(DoubleComplex)) ? 32 : 16;
+}
+
+/*
+ * Fill delta[i * N + k] (0 <= i < M, 0 <= k < N) with an error estimate for
+ * element (i, k) of a TRSM result: the error accumulated by substitution
+ * ("Gauss' method") plus a term for the blocked clblas algorithm, which is
+ * modelled with blocks of trsmBlockSize(sizeof(T)) rows/columns.
+ * delta must hold M * N values; A (lda) and B (ldb) are read via getElement().
+ */
+template <typename T>
+void
+trsmDelta(
+    clblasOrder order, clblasSide side, clblasUplo uplo,
+    clblasTranspose transA, clblasDiag diag,
+    size_t M, size_t N,
+    T *A, size_t lda, T *B, size_t ldb,
+    T alpha,
+    cl_double *delta)
+{
+    cl_double *deltaCLBLAS, s;
+    int i, k, j, jStart, jEnd, idx;
+    int zinc;
+    size_t z = 0;
+    size_t bsize;
+    bool isUpper;
+    T v;
+    // A transposed lower matrix is handled as an upper one, and vice versa
+    isUpper = ((uplo == clblasUpper) && (transA == clblasNoTrans)) ||
+              ((uplo == clblasLower) && (transA != clblasNoTrans));
+
+    deltaCLBLAS = new cl_double[M * N];
+    bsize = trsmBlockSize(sizeof(T));
+
+    if (side == clblasLeft) {
+        // Calculate delta of TRSM evaluated with the Gauss' method
+
+        for (k = 0; k < (int)N; k++) {
+            if (isUpper) {
+                for (i = (int)M - 1; i >= 0; i--) {
+                    v = getElement<T>(order, clblasNoTrans, i, k, B, ldb);
+                    if (diag == clblasNonUnit) {
+                        v = v / getElement<T>(order, transA, i, i, A, lda);
+                    }
+                    s = module(v) * DELTA_0<T>() * module(alpha);
+                    if (i == (int)(M - 1)) {
+                        delta[i * N + k] = s;
+                    }
+                    else {
+                        delta[i * N + k] = s + delta[(i + 1) * N + k];
+                    }
+                    assert(delta[i* N + k] >= 0);
+                }
+            }
+            else {
+                for (i = 0; i < (int)M; i++) {
+                    v = getElement<T>(order, clblasNoTrans, i, k, B, ldb);
+                    if (diag == clblasNonUnit) {
+                        v = v / getElement<T>(order, transA, i, i, A, lda);
+                    }
+                    s = module(v) * DELTA_0<T>() * module(alpha);
+                    if (i == 0) {
+                        delta[i * N + k] = s;
+                    }
+                    else {
+                        delta[i * N + k] = s + delta[(i - 1) * N + k];
+                    }
+                    assert(delta[i* N + k] >= 0);
+                }
+            }
+        }
+
+        // Calculate clblas TRSM delta
+
+        for (k = 0; k < (int)N; k++) {
+            for (i = 0; i < (int)M; i++) {
+                s = 0.0;
+
+                /*
+                 *  For the upper triangular matrix the solving process proceeds
+                 *  from the bottom to the top, and the bottommost block's
+                 *  delta has the greatest influence. For the lower triangular
+                 *  matrix the situation is the opposite.
+                 */
+                if (isUpper) {
+                    jStart = i / (int)bsize;
+                    // index of the block just after the last matrix block
+                    jEnd = ((int)M + (int)bsize - 1) / (int)bsize;
+                    z = 1;
+                    zinc = 1;
+                }
+                else {
+                    jStart = 0;
+                    jEnd = i / (int)bsize + 1;
+                    z = jEnd - jStart;
+                    zinc = -1;
+                }
+
+                for (j = jStart; j < jEnd; j++) {
+                    idx = j * (int)bsize + i % (int)bsize;
+                    if (idx >= (int)M) {    // row lies beyond the matrix
+                        continue;
+                    }
+                    s += z * delta[idx * N + k];
+                    z += zinc;
+                }
+
+                deltaCLBLAS[i * N + k] = s * bsize;
+                assert(deltaCLBLAS[i* N + k] >= 0);
+            }
+        }
+    }
+    else {
+        // Calculate delta of TRSM evaluated with the Gauss' method
+
+        for (i = 0; i < (int)M; i++) {
+            if (isUpper) {
+                for (k = 0; k < (int)N; k++) {
+                    v = getElement<T>(order, clblasNoTrans, i, k, B, ldb);
+                    if (diag == clblasNonUnit) {
+                        v = v / getElement<T>(order, transA, k, k, A, lda);
+                    }
+                    s = module(v) * DELTA_0<T>() * module(alpha);
+                    if (k == 0) {
+                        delta[i * N + k] = s;
+                    }
+                    else {
+                        delta[i * N + k] = s + delta[i * N + (k - 1)];
+                    }
+                    assert(delta[i* N + k] >= 0);
+                }
+            }
+            else {
+                for (k = (int)N - 1; k >= 0; k--) {
+                    v = getElement<T>(order, clblasNoTrans, i, k, B, ldb);
+                    if (diag == clblasNonUnit) {
+                        v = v / getElement<T>(order, transA, k, k, A, lda);
+                    }
+                    s = module(v) * DELTA_0<T>() * module(alpha);
+                    if (k == (int)(N - 1)) {
+                        delta[i * N + k] = s;
+                    }
+                    else {
+                        delta[i * N + k] = s + delta[i * N + (k + 1)];
+                    }
+                    assert(delta[i* N + k] >= 0);
+                }
+            }
+        }
+
+        // Calculate clblas TRSM delta
+
+        for (i = 0; i < (int)M; i++) {
+            for (k = 0; k < (int)N; k++) {
+                s = 0.0;
+
+                /*
+                 * Approach is the same as for the left side matrix, but delta
+                 * is calculated over the rows rather than the columns.
+                 * Now, since the matrices are swapped, the largest and
+                 * tightest blocks are swapped as well. Therefore, pass
+                 * direction for the upper and lower triangular matrix is also
+                 * swapped.
+                 */
+                if (isUpper) {
+                    jStart = 0;
+                    jEnd = k / (int)bsize + 1;
+                    z = jEnd - jStart;
+                    zinc = -1;
+                }
+                else {
+                    jStart = k / (int)bsize;
+                    jEnd = ((int)N + (int)bsize - 1) / (int)bsize;  // one past the last block of columns
+                    z = 1;
+                    zinc = 1;
+                }
+
+                for (j = jStart; j < jEnd; j++) {
+                    idx = j * (int)bsize + k % (int)bsize;
+                    if (idx >= (int)N) {    // column lies beyond the matrix
+                        continue;
+                    }
+                    s += z * delta[i * N + idx];
+                    z += zinc;
+                }
+
+                deltaCLBLAS[i * N + k] = s * bsize;
+                assert(deltaCLBLAS[i* N + k] >= 0);
+            }
+        }
+    }
+    // Final delta = substitution delta + blocked-algorithm delta
+    for (k = 0; k < (int)N; k++) {
+        for (i = 0; i < (int)M; i++) {
+            delta[i * N + k] += deltaCLBLAS[i * N + k];
+        }
+    }
+
+    delete[] deltaCLBLAS;
+}
diff --git a/src/tests/correctness/trsv-delta.h b/src/tests/correctness/trsv-delta.h
new file mode 100644
index 0000000..872fdba
--- /dev/null
+++ b/src/tests/correctness/trsv-delta.h
@@ -0,0 +1,296 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef TRSV_DELTA_H_
+#define TRSV_DELTA_H_
+
+#include "delta.h"
+
+static size_t
+trsvBlockSize(size_t elemSize)
+{
+    /* TODO: The TRSV generators currently use 16-element blocks for the
+     *       double complex type and 32-element blocks for all other types.
+     *       If that ever changes, the block size has to be fetched from
+     *       the TRSV generator instead of being hard-coded here.
+     */
+    return (elemSize != sizeof(DoubleComplex)) ? 32 : 16;
+}
+
+/*
+ * trsvDelta - error bound for a triangular solve with a vector right-hand side.
+ *
+ * For every element of X the function stores in delta (indexed with the
+ * same stride as X) the sum of two estimates:
+ *
+ *   1. The error of a plain substitution ("Gauss' method"): each element
+ *      contributes module(x_i / a_ii) * DELTA_0<T>(), or
+ *      module(x_i) * DELTA_0<T>() for a unit diagonal, and the
+ *      contributions are accumulated in the order the rows are solved.
+ *   2. A block-wise term: the step 1 values of the elements sharing the
+ *      same offset within their block are summed with integer weights
+ *      and the result is multiplied by the block size returned by
+ *      trsvBlockSize().
+ *
+ * Parameters:
+ *   order, uplo, transA - forwarded to the element accessors for A; uplo
+ *                         and transA also decide the direction of the solve
+ *   diag   - clblasNonUnit enables the division by the diagonal of A
+ *   N      - order of the matrix; the function returns at once when it is 0
+ *   A      - matrix data
+ *   lda    - 0 selects packed access through getElementPacked(); any other
+ *            value is passed to getElement() together with A
+ *   X      - vector of 1 + (N-1)*|incx| elements
+ *   incx   - stride of X and delta; with a non-positive stride element i
+ *            lives at offset (N-1-i)*|incx|
+ *   delta  - output array with at least as many elements as X
+ */
+template <typename T>
+void
+trsvDelta(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    T *A,
+    size_t lda,
+    T *X,
+    int incx,
+    cl_double *delta)
+{
+    cl_double *deltaCLBLAS, s;
+    int i, j, step, jStart, jEnd, idx;
+    int zinc;
+    size_t z = 0;
+    size_t bsize, lenX, incxi;
+    size_t previncxi = 0;
+    bool isUpper;
+    T v;
+
+    // Quick return for an empty problem; without it (N-1) below wraps
+    // around and lenX becomes meaningless.
+    if (N == 0) {
+        return;
+    }
+
+    // True when the solve starts at the last row: an upper matrix used
+    // as is, or a lower one used transposed.
+    isUpper = ((uplo == clblasUpper) && (transA == clblasNoTrans)) ||
+              ((uplo == clblasLower) && (transA != clblasNoTrans));
+    lenX = 1 + (N-1)*abs(incx);
+    deltaCLBLAS = new cl_double[lenX];
+    bsize = trsvBlockSize(sizeof(T));
+
+    // Calculate delta of TRSV evaluated with the Gauss' method
+
+    for (step = 0; step < (int)N; step++) {
+        // rows are visited bottom-up when isUpper, top-down otherwise
+        i = isUpper ? ((int)N - 1 - step) : step;
+
+        incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx);
+        // X is read as element (incxi, 0) of a column-major array
+        v = getElement<T>(clblasColumnMajor, clblasNoTrans, incxi, 0, X, lenX);
+        if (diag == clblasNonUnit) {
+            T tempA;
+            // lda == 0 marks packed storage
+            if (lda > 0) {
+                tempA = getElement<T>(order, transA, i, i, A, lda);
+            }
+            else {
+                tempA = getElementPacked(order, clblasNoTrans, uplo, i, i, A, N);
+            }
+            v = v / tempA;
+        }
+
+        s = module(v) * DELTA_0<T>();
+        // each row inherits the accumulated error of the row solved before it
+        delta[ incxi ] = (step == 0) ? s : (s + delta[ previncxi ]);
+        assert(delta[ incxi ] >= 0);
+        previncxi = incxi;
+    }
+
+    // Calculate clblas TRSV delta
+
+    for (i = 0; i < (int)N; i++) {
+        s = 0.0;
+
+        /*
+         *  For the upper triangular matrix the solving process proceeds
+         *  from the bottom to the top, and the bottommost block's delta
+         *  has the greatest influence. For the lower triangular matrix
+         *  the situation is the opposite.
+         */
+        if (isUpper) {
+            jStart = i / (int)bsize;
+            // index of the block just after the last matrix block
+            jEnd = ((int)N + (int)bsize - 1) / (int)bsize;
+            z = 1;
+            zinc = 1;
+        }
+        else {
+            jStart = 0;
+            jEnd = i / (int)bsize + 1;
+            z = jEnd - jStart;
+            zinc = -1;
+        }
+
+        for (j = jStart; j < jEnd; j++) {
+            idx = j * (int)bsize + i % (int)bsize;
+            // positions past the end of the vector contribute nothing
+            if (idx >= (int)N) {
+                continue;
+            }
+            incxi = (incx > 0) ? (idx*incx) : (N-1-idx)*abs(incx);
+            s += z * delta[ incxi ];
+            z += zinc;
+        }
+
+        // the block-wise term is scaled by the block size
+        incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx);
+        deltaCLBLAS[ incxi ] = s * bsize;
+        assert(deltaCLBLAS[ incxi ] >= 0);
+    }
+
+    // total bound = substitution estimate + block-wise term
+    for (i = 0; i < (int)N; i++) {
+        incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx);
+        delta[ incxi ] += deltaCLBLAS[ incxi ];
+    }
+
+    delete[] deltaCLBLAS;
+}
+
+/*
+ * tbsvDelta - error bound for a triangular banded solve.
+ *
+ * For every element of X the function stores in delta (indexed with the
+ * same stride as X) the sum of two estimates:
+ *
+ *   1. The substitution estimate: module(x_i / a_ii) * DELTA_0<T>(), with
+ *      a_ii read through getElementBanded(), or module(x_i) * DELTA_0<T>()
+ *      for a unit diagonal, accumulated in the order the rows are solved.
+ *   2. A block-wise term: the step 1 values of the elements sharing the
+ *      same offset within their block, summed with integer weights and
+ *      multiplied by the block size returned by trsvBlockSize().
+ *
+ * Parameters:
+ *   uplo, transA - together decide the direction of the solve
+ *   diag   - clblasNonUnit enables the division by the diagonal of A
+ *   N      - order of the matrix; the function returns at once when it is 0
+ *   order, uplo, K, A, lda - passed unchanged to getElementBanded()
+ *   X      - vector of 1 + (N-1)*|incx| elements
+ *   incx   - stride of X and delta; with a non-positive stride element i
+ *            lives at offset (N-1-i)*|incx|
+ *   delta  - output array with at least as many elements as X
+ */
+template <typename T>
+void
+tbsvDelta(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    T *A,
+    size_t lda,
+    T *X,
+    int incx,
+    cl_double *delta)
+{
+    cl_double *deltaCLBLAS, s;
+    int i, j, step, jStart, jEnd, idx;
+    int zinc;
+    size_t z = 0;
+    size_t bsize, lenX, incxi;
+    size_t previncxi = 0;
+    bool isUpper;
+    T v;
+
+    // Quick return for an empty problem; without it (N-1) below wraps
+    // around and lenX becomes meaningless.
+    if (N == 0) {
+        return;
+    }
+
+    isUpper = ((uplo == clblasUpper) && (transA == clblasNoTrans)) ||
+              ((uplo == clblasLower) && (transA != clblasNoTrans));
+    lenX = 1 + (N-1)*abs(incx);
+    deltaCLBLAS = new cl_double[lenX];
+    bsize = trsvBlockSize(sizeof(T));
+
+    // Calculate delta of TRSV evaluated with the Gauss' method
+
+    for (step = 0; step < (int)N; step++) {
+        // rows are visited bottom-up when isUpper, top-down otherwise
+        i = isUpper ? ((int)N - 1 - step) : step;
+
+        incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx);
+        v = getElement<T>(clblasColumnMajor, clblasNoTrans, incxi, 0, X, lenX);
+        if (diag == clblasNonUnit) {
+            v = v / getElementBanded<T>(order, uplo, i, i, K, A, lda);
+        }
+        s = module(v) * DELTA_0<T>();
+        // each row inherits the accumulated error of the row solved before it
+        delta[ incxi ] = (step == 0) ? s : (s + delta[ previncxi ]);
+        assert(delta[ incxi ] >= 0);
+        previncxi = incxi;
+    }
+
+    // Calculate clblas TRSV delta
+
+    for (i = 0; i < (int)N; i++) {
+        s = 0.0;
+        if (isUpper) {
+            jStart = i / (int)bsize;
+            // index of the block just after the last matrix block
+            jEnd = ((int)N + (int)bsize - 1) / (int)bsize;
+            z = 1;
+            zinc = 1;
+        }
+        else {
+            jStart = 0;
+            jEnd = i / (int)bsize + 1;
+            z = jEnd - jStart;
+            zinc = -1;
+        }
+
+        for (j = jStart; j < jEnd; j++) {
+            idx = j * (int)bsize + i % (int)bsize;
+            if (idx >= (int)N) {
+                continue;
+            }
+            incxi = (incx > 0) ? (idx*incx) : (N-1-idx)*abs(incx);
+            s += z * delta[ incxi ];
+            z += zinc;
+        }
+
+        incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx);
+        deltaCLBLAS[ incxi ] = s * bsize;
+        assert(deltaCLBLAS[ incxi ] >= 0);
+    }
+
+    // total bound = substitution estimate + block-wise term
+    for (i = 0; i < (int)N; i++) {
+        incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx);
+        delta[ incxi ] += deltaCLBLAS[ incxi ];
+    }
+
+    delete[] deltaCLBLAS;
+}
+#endif
+
diff --git a/src/tests/functional/BlasBase-func.cpp b/src/tests/functional/BlasBase-func.cpp
new file mode 100644
index 0000000..f745762
--- /dev/null
+++ b/src/tests/functional/BlasBase-func.cpp
@@ -0,0 +1,117 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <BlasBase.h>
+
+namespace clMath {
+
+static size_t
+imageMaxDimension(cl_context context, int widthHeight)
+{
+    // Smallest value of the requested limit over the context's devices;
+    // starts at the largest size_t so the first successful query lowers it.
+    size_t smallest = (size_t)-1;
+    cl_device_id devs[2];
+    size_t bytes;
+
+    // Non-zero widthHeight selects the height limit, zero the width limit.
+    const cl_device_info what = (widthHeight) ? CL_DEVICE_IMAGE2D_MAX_HEIGHT :
+                                                CL_DEVICE_IMAGE2D_MAX_WIDTH;
+
+    cl_int status = clGetContextInfo(context, CL_CONTEXT_DEVICES,
+        sizeof(devs), devs, &bytes);
+    if (status != CL_SUCCESS) {
+        return 0;
+    }
+
+    const size_t count = bytes / sizeof(cl_device_id);
+    for (size_t d = 0; d < count; d++) {
+        size_t limit;
+
+        status = clGetDeviceInfo(devs[d], what, sizeof(limit), &limit, NULL);
+        if (status != CL_SUCCESS) {
+            return 0;   // a failed device query yields 0, as a failed context query does
+        }
+        smallest = std::min(smallest, limit);
+    }
+
+    return smallest;
+}
+
+static size_t
+imageMaxWidth(cl_context context)
+{
+    return imageMaxDimension(context, 0);    // 0 selects CL_DEVICE_IMAGE2D_MAX_WIDTH
+}
+
+static size_t
+imageMaxHeight(cl_context context)
+{
+    return imageMaxDimension(context, 1);    // non-zero selects CL_DEVICE_IMAGE2D_MAX_HEIGHT
+}
+
+clblasStatus
+BlasBase::addScratchImages(void)    // currently a stub: always returns clblasNotImplemented
+{
+    //cl_ulong memSize, allocSize;
+    //size_t width, height;
+    //clblasStatus status;
+    //float scale;
+
+    ///*
+    // * get maximum amount of memory each image can takes, not
+    // * forgetting that it can be up to three matrices residing
+    // * in memory objects
+    // */
+    //allocSize = maxMemAllocSize();
+    //memSize = availGlobalMemSize(0);
+    //if (allocSize > memSize / 5) {
+    //    allocSize = memSize / 5;
+    //    scale = 1.4f;
+    //}
+    //else {
+    //    scale = 1.5f;
+    //}
+
+    //height = static_cast<size_t>(sqrt(static_cast<double>(allocSize) / sizeof(cl_float)));
+    //width  = height / 4;
+    //height = static_cast<size_t>(height / scale);
+    //width  = static_cast<size_t>(width * scale);
+
+    //if (height > imageMaxHeight(context_)) {
+    //    height = imageMaxHeight(context_);
+    //}
+    //if (width > imageMaxWidth(context_)) {
+    //    width = imageMaxWidth(context_);
+    //}
+
+    //imageA_ = clblasAddScratchImage(context_, width, height, &status);
+    //if (imageA_) {
+    //    imageB_ = clblasAddScratchImage(context_, width, height, &status);
+    //}
+
+    //return status;
+	return clblasNotImplemented;    // the scratch-image setup above is disabled (commented out)
+
+}
+
+}   // namespace
diff --git a/src/tests/functional/func-error.cpp b/src/tests/functional/func-error.cpp
new file mode 100644
index 0000000..064e182
--- /dev/null
+++ b/src/tests/functional/func-error.cpp
@@ -0,0 +1,1354 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include "blas-wrapper.h"
+#include "clBLAS-wrapper.h"
+#include "BlasBase.h"
+#include "blas-random.h"
+#include "timer.h"
+#include "func.h"
+
+
+
+template <typename M>
+class ErrorClass    // gtest helper: drives a routine wrapper M so that its run() should fail with a given code
+{
+    M metod;    // the wrapped routine; generateData() and error() operate on it directly
+protected:
+    bool generateData();    // generates and prepares the input data; false when preparation failed
+public:
+    void error(cl_int err_etalon);    // checks that metod.run() returns err_etalon
+//    nano_time_t runRepeat(int rep, cl_int* err);
+};
+
+template <typename M> bool
+ErrorClass<M>::generateData()
+{
+    metod.generateData();
+    if (metod.prepareDataToRun()) {
+        return true;    // preparation succeeded, the caller may run the test
+    }
+    // Preparation failed: explain why the test is skipped and record a gtest success.
+    ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+        << ::std::endl
+        << ">> Can't execute the test, because data is not transfered to GPU."
+        << ::std::endl
+        << ">> Test skipped." << ::std::endl;
+    SUCCEED();
+    return false;
+}
+
+template <typename M> void
+ErrorClass<M>::error(cl_int err_etalon)
+{
+    // Set up a default problem, break the one input that should make
+    // metod.run() fail with err_etalon, and check the returned status.
+    metod.initDefault(1024, 1);
+    cl_command_queue queues = metod.queues[0];
+    if (generateData()) {
+        switch (err_etalon) {
+        case CL_INVALID_EVENT_WAIT_LIST:
+            metod.inEvent = NULL;
+            metod.inEventCount = 1;
+            break;
+        case CL_INVALID_EVENT:
+            metod.outEvent = NULL;
+            metod.inEventCount = 1;
+            break;
+        case CL_INVALID_CONTEXT:
+            clReleaseContext(metod.context);
+            break;
+        case CL_INVALID_COMMAND_QUEUE:
+            metod.queues[0] = NULL;
+            break;
+        case clblasInvalidMatA:
+        case clblasInvalidVecX:
+        case CL_INVALID_MEM_OBJECT:
+            metod.bufA = NULL;
+            metod.bufAP = NULL;
+            metod.bufX = NULL;
+            metod.bufY = NULL;
+            break;
+        case CL_INVALID_DEVICE:
+            break;
+        case clblasInsufficientMemMatA:
+        case clblasInsufficientMemMatB:
+        case clblasInsufficientMemVecX:
+        case CL_INVALID_VALUE:
+            metod.size = 2048;
+            break;
+        default:
+            // Non-fatal failure: FAIL() would return before destroy() is called.
+            ADD_FAILURE() << "Unknown Error cod " << err_etalon;
+            metod.destroy();
+            return;
+        }
+        cl_int err = metod.run();
+        metod.queues[0] = queues;    // put back the queue nulled for CL_INVALID_COMMAND_QUEUE
+        EXPECT_EQ(err, err_etalon) << "clFinish()";    // EXPECT, not ASSERT, so destroy() still runs
+    }
+    metod.destroy();
+}
+
+#ifdef DO_THEIRS
+// GEMM error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueue) {
+    ErrorClass<GemmMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitList) {
+    ErrorClass<GemmMetod<float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObject) {
+    ErrorClass<GemmMetod<float> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValue) {
+    ErrorClass<GemmMetod<float> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevice) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+        ErrorClass<GemmMetod<double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+
+// end of the GEMM error-path tests
+#endif
+
+#ifdef DO_TRMV    // TRMV error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuetrmv) {
+    ErrorClass<TrmvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListtrmv) {
+    ErrorClass<TrmvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecttrmv) {
+    ErrorClass<TrmvMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValuetrmv) {
+    ErrorClass<TrmvMetod<FloatComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicetrmv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+	ErrorClass<TrmvMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_TRSV    // TRSV error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueue_trsv) {
+    ErrorClass<TrsvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitList_trsv) {
+    ErrorClass<TrsvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObject_trsv) {
+    ErrorClass<TrsvMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValue_trsv) {
+    ErrorClass<TrsvMetod<FloatComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevice_trsv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+        ErrorClass<TrsvMetod<double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_TPSV    // TPSV error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueue_tpsv) {
+    ErrorClass<TpsvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitList_tpsv) {
+    ErrorClass<TpsvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObject_tpsv) {
+    ErrorClass<TpsvMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValue_tpsv) {
+    ErrorClass<TpsvMetod<FloatComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevice_tpsv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+        ErrorClass<TpsvMetod<double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_TPMV    // TPMV error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueue_tpmv) {
+    ErrorClass<TpmvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitList_tpmv) {
+    ErrorClass<TpmvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObject_tpmv) {
+    ErrorClass<TpmvMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValue_tpmv) {
+    ErrorClass<TpmvMetod<FloatComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevice_tpmv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+        ErrorClass<TpmvMetod<double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_SYMM    // SYMM error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuesymm) {
+    ErrorClass<SymmMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListsymm) {
+    ErrorClass<SymmMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectsymm) {
+    ErrorClass<SymmMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValuesymm) {
+    ErrorClass<SymmMetod<FloatComplex> > ec;
+    ec.error(clblasInsufficientMemMatB);    // expects clblasInsufficientMemMatB, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicesymm) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<SymmMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_SYR    // SYR error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuesyr) {
+    ErrorClass<SyrMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListsyr) {
+    ErrorClass<SyrMetod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectsyr) {
+    ErrorClass<SyrMetod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValuesyr) {
+    ErrorClass<SyrMetod<cl_float> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicesyr) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<SyrMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_SPR    // SPR error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuespr) {
+    ErrorClass<SprMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListspr) {
+    ErrorClass<SprMetod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectspr) {
+    ErrorClass<SprMetod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValuespr) {
+    ErrorClass<SprMetod<cl_float> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicespr) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<SprMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_SYR2    // SYR2 error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuesyr2) {
+    ErrorClass<Syr2Metod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListsyr2) {
+    ErrorClass<Syr2Metod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectsyr2) {
+    ErrorClass<Syr2Metod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValuesyr2) {
+    ErrorClass<Syr2Metod<cl_float> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicesyr2) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<Syr2Metod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_GER    // GER error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueueger) {
+    ErrorClass<GerMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListger) {
+    ErrorClass<GerMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectger) {
+    ErrorClass<GerMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValueger) {
+    ErrorClass<GerMetod<FloatComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDeviceger) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+        ErrorClass<GerMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_GERC    // GERC error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuegerc) {
+    ErrorClass<GercMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListgerc) {
+    ErrorClass<GercMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectgerc) {
+    ErrorClass<GercMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValuegerc) {
+    ErrorClass<GercMetod<FloatComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicegerc) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+        ErrorClass<GercMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_HER    // HER error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueueher) {
+    ErrorClass<HerMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListher) {
+    ErrorClass<HerMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecther) {
+    ErrorClass<HerMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+
+TEST(ERROR, InvalidValueher) {
+    // NOTE(review): double type used without the isDevSupportDoublePrecision() guard - confirm intended
+    ErrorClass<HerMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDeviceher) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<HerMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_HER2    // HER2 error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueueher2) {
+    ErrorClass<Her2Metod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListher2) {
+    ErrorClass<Her2Metod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecther2) {
+    ErrorClass<Her2Metod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+
+TEST(ERROR, InvalidValueher2) {
+    // NOTE(review): double type used without the isDevSupportDoublePrecision() guard - confirm intended
+    ErrorClass<Her2Metod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDeviceher2) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<Her2Metod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_HEMM    // HEMM error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuehemm) {
+    ErrorClass<HemmMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListhemm) {
+    ErrorClass<HemmMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecthemm) {
+    ErrorClass<HemmMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+
+TEST(ERROR, InvalidValuehemm) {
+    // NOTE(review): double type used without the isDevSupportDoublePrecision() guard - confirm intended
+    ErrorClass<HemmMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemMatB);    // expects clblasInsufficientMemMatB, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicehemm) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<HemmMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_HEMV    // HEMV error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuehemv) {
+    ErrorClass<HemvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListhemv) {
+    ErrorClass<HemvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecthemv) {
+    ErrorClass<HemvMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+
+TEST(ERROR, InvalidValuehemv) {
+    // NOTE(review): double type used without the isDevSupportDoublePrecision() guard - confirm intended
+    ErrorClass<HemvMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicehemv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<HemvMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_HERK    // HERK error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueueherk) {
+    ErrorClass<HerkMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListherk) {
+    ErrorClass<HerkMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectherk) {
+    ErrorClass<HerkMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+
+TEST(ERROR, InvalidValueherk) {
+    ErrorClass<HerkMetod<DoubleComplex> > ec;    // NOTE(review): double type without the double-precision guard - confirm intended
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDeviceherk) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<HerkMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_HPMV    // HPMV error-path tests; each expects run() to return the code passed to error()
+
+TEST(ERROR, InvalidCommandQueuehpmv) {
+    ErrorClass<HpmvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListhpmv) {
+    ErrorClass<HpmvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecthpmv) {
+    ErrorClass<HpmvMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+
+TEST(ERROR, InvalidValuehpmv) {
+    // NOTE(review): double type used without the isDevSupportDoublePrecision() guard - confirm intended
+    ErrorClass<HpmvMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicehpmv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<HpmvMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_SPMV    // SPMV error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuespmv) {
+    ErrorClass<SpmvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListspmv) {
+    ErrorClass<SpmvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectspmv) {
+    ErrorClass<SpmvMetod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+
+TEST(ERROR, InvalidValuespmv) {
+    // NOTE(review): double type used without the isDevSupportDoublePrecision() guard - confirm intended
+    ErrorClass<SpmvMetod<cl_double> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicespmv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<SpmvMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_SPR2    // SPR2 error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuespr2) {
+    ErrorClass<Spr2Metod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListspr2) {
+    ErrorClass<Spr2Metod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectspr2) {
+    ErrorClass<Spr2Metod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+TEST(ERROR, InvalidValuespr2) {
+    ErrorClass<Spr2Metod<cl_float> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicespr2) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<Spr2Metod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_HPR    // HPR error-path tests; each expects run() to return the code passed to error()
+TEST(ERROR, InvalidCommandQueuehpr) {
+    ErrorClass<HprMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListhpr) {
+    ErrorClass<HprMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecthpr) {
+    ErrorClass<HprMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);    // expects clblasInvalidMatA, not CL_INVALID_MEM_OBJECT
+}
+
+TEST(ERROR, InvalidValuehpr) {
+    // NOTE(review): double type used without the isDevSupportDoublePrecision() guard - confirm intended
+    ErrorClass<HprMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);    // expects clblasInsufficientMemMatA, not CL_INVALID_VALUE
+}
+
+TEST(ERROR, InvalidDevicehpr) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {    // only runs when double precision is unsupported
+    ErrorClass<HprMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_HPR2
+TEST(ERROR, InvalidCommandQueuehpr2) {
+    ErrorClass<Hpr2Metod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListhpr2) {
+    ErrorClass<Hpr2Metod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecthpr2) {
+    ErrorClass<Hpr2Metod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);
+}
+
+TEST(ERROR, InvalidValuehpr2) {
+
+    ErrorClass<Hpr2Metod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);
+}
+
+TEST(ERROR, InvalidDevicehpr2) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<Hpr2Metod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_GBMV
+TEST(ERROR, InvalidCommandQueueGBMV) {
+    ErrorClass<GbmvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListGBMV) {
+    ErrorClass<GbmvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectGBMV) {
+    ErrorClass<GbmvMetod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);
+}
+
+TEST(ERROR, InvalidValueGBMV) {
+    ErrorClass<GbmvMetod<cl_double> > ec;
+    ec.error(clblasInsufficientMemMatA);
+}
+
+TEST(ERROR, InvalidDeviceGBMV) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<GbmvMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_SBMV
+TEST(ERROR, InvalidCommandQueuesbmv) {
+    ErrorClass<SbmvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListsbmv) {
+    ErrorClass<SbmvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectsbmv) {
+    ErrorClass<SbmvMetod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);
+}
+
+TEST(ERROR, InvalidValuesbmv) {
+    ErrorClass<SbmvMetod<cl_float> > ec;
+    ec.error(clblasInsufficientMemMatA);
+}
+
+TEST(ERROR, InvalidDevicesbmv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<SbmvMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_HBMV
+TEST(ERROR, InvalidCommandQueuehbmv) {
+    ErrorClass<HbmvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListhbmv) {
+    ErrorClass<HbmvMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecthbmv) {
+    ErrorClass<HbmvMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);
+}
+
+TEST(ERROR, InvalidValuehbmv) {
+    ErrorClass<HbmvMetod<FloatComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);
+}
+
+TEST(ERROR, InvalidDevicehbmv) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<HbmvMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_TBMV
+TEST(ERROR, InvalidCommandQueueTBMV) {
+    ErrorClass<TbmvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListTBMV) {
+    ErrorClass<TbmvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectTBMV) {
+    ErrorClass<TbmvMetod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);
+}
+
+TEST(ERROR, InvalidValueTBMV) {
+
+    ErrorClass<TbmvMetod<cl_double> > ec;
+    ec.error(clblasInsufficientMemMatA);
+}
+
+TEST(ERROR, InvalidDeviceTBMV) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<TbmvMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_TBSV
+TEST(ERROR, InvalidCommandQueueTBSV) {
+    ErrorClass<TbsvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListTBSV) {
+    ErrorClass<TbsvMetod<cl_float> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectTBSV) {
+    ErrorClass<TbsvMetod<cl_float> > ec;
+    ec.error(clblasInvalidMatA);
+}
+
+TEST(ERROR, InvalidValueTBSV) {
+
+    ErrorClass<TbsvMetod<cl_double> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDeviceTBSV) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<TbsvMetod<cl_double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_HER2K
+TEST(ERROR, InvalidCommandQueueher2k) {
+    ErrorClass<Her2kMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListher2k) {
+    ErrorClass<Her2kMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjecther2k) {
+    ErrorClass<Her2kMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidMatA);
+}
+
+TEST(ERROR, InvalidValueher2k) {
+    ErrorClass<Her2kMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemMatA);
+}
+
+TEST(ERROR, InvalidDeviceher2k) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<Her2kMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_SCAL
+TEST(ERROR, InvalidCommandQueuescal) {
+    ErrorClass<ScalMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListscal) {
+    ErrorClass<ScalMetod<double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectscal) {
+    ErrorClass<ScalMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+TEST(ERROR, InvalidValuescal) {
+    ErrorClass<ScalMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDevicescal) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<ScalMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_SSCAL
+TEST(ERROR, InvalidCommandQueuesscal) {
+    ErrorClass<SscalMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListsscal) {
+    ErrorClass<SscalMetod<DoubleComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectsscal) {
+    ErrorClass<SscalMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+TEST(ERROR, InvalidValuesscal) {
+    ErrorClass<SscalMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDevicesscal) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<SscalMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_SWAP
+TEST(ERROR, InvalidCommandQueueswap) {
+    ErrorClass<SwapMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListswap) {
+    ErrorClass<SwapMetod<double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectswap) {
+    ErrorClass<SwapMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+TEST(ERROR, InvalidValueswap) {
+    ErrorClass<SwapMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDeviceswap) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<SwapMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_COPY
+TEST(ERROR, InvalidCommandQueuecopy) {
+    ErrorClass<CopyMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListcopy) {
+    ErrorClass<CopyMetod<double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectcopy) {
+    ErrorClass<CopyMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+TEST(ERROR, InvalidValuecopy) {
+    ErrorClass<CopyMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDevicecopy) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<CopyMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_AXPY
+TEST(ERROR, InvalidCommandQueueaxpy) {
+    ErrorClass<AxpyMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListaxpy) {
+    ErrorClass<AxpyMetod<double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectaxpy) {
+    ErrorClass<AxpyMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+TEST(ERROR, InvalidValueaxpy) {
+    ErrorClass<AxpyMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDeviceaxpy) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<AxpyMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+//DOT
+#ifdef DO_DOT
+TEST(ERROR, InvalidCommandQueuedot) {
+    ErrorClass<DotMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListdot) {
+    ErrorClass<DotMetod<cl_double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectdot) {
+    ErrorClass<DotMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+TEST(ERROR, InvalidValuedot) {
+    ErrorClass<DotMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDevicedot) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<DotMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_ASUM
+TEST(ERROR, InvalidCommandQueueasum) {
+    ErrorClass<AsumMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListasum) {
+    ErrorClass<AsumMetod<cl_double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectasum) {
+    ErrorClass<AsumMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+TEST(ERROR, InvalidValueasum) {
+    ErrorClass<AsumMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDeviceasum) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<AsumMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_iAMAX
+TEST(ERROR, InvalidCommandQueueiamax) {
+    ErrorClass<iAmaxMetod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListiamax) {
+    ErrorClass<iAmaxMetod<cl_double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectiamax) {
+    ErrorClass<iAmaxMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+TEST(ERROR, InvalidValueiamax) {
+    ErrorClass<iAmaxMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDeviceiamax) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<iAmaxMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+//DOTC
+#ifdef DO_DOTC
+TEST(ERROR, InvalidCommandQueuedotc) {
+    ErrorClass<DotcMetod<FloatComplex> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListdotc) {
+    ErrorClass<DotcMetod<DoubleComplex> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectdotc) {
+    ErrorClass<DotcMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+TEST(ERROR, InvalidValuedotc) {
+    ErrorClass<DotcMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDevicedotc) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<DotMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
+#ifdef DO_ROTG
+TEST(ERROR, InvalidCommandQueuerotg) {
+    ErrorClass<RotgMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListrotg) {
+    ErrorClass<RotgMetod<double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectrotg) {
+    ErrorClass<RotgMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+/*  Skipping Invalid value- because rotg doesn't depend on parameter N,
+                            So even passing an invalid N doesn't matter
+TEST(ERROR, InvalidValuerotg) {
+    ErrorClass<RotgMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+*/
+
+TEST(ERROR, InvalidDevicerotg) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<RotgMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_ROTM
+TEST(ERROR, InvalidCommandQueuerotm) {
+    ErrorClass<RotmMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListrotm) {
+    ErrorClass<RotmMetod<double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectrotm) {
+    ErrorClass<RotmMetod<float> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+TEST(ERROR, InvalidValuerotm) {
+    ErrorClass<RotmMetod<double> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDevicerotm) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<RotmMetod<double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_ROT
+TEST(ERROR, InvalidCommandQueuerot) {
+    ErrorClass<RotMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListrot) {
+    ErrorClass<RotMetod<double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectrot) {
+    ErrorClass<RotMetod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+TEST(ERROR, InvalidValuerot) {
+    ErrorClass<RotMetod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDevicerot) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<RotMetod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_ROTMG
+TEST(ERROR, InvalidCommandQueuerotmg) {
+    ErrorClass<RotmgMetod<float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListrotmg) {
+    ErrorClass<RotmgMetod<double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectrotmg) {
+    ErrorClass<RotmgMetod<float> > ec;
+    ec.error(clblasInvalidVecX);
+}
+
+/*  Skipping Invalid value- because rotg doesn't depend on parameter N,
+                            So even passing an invalid N doesn't matter
+TEST(ERROR, InvalidValuerotmg) {
+    ErrorClass<RotmgMetod<double> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+*/
+
+TEST(ERROR, InvalidDevicerotmg) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<RotmgMetod<double> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+#ifdef DO_NRM2
+TEST(ERROR, InvalidCommandQueuenrm2) {
+    ErrorClass<Nrm2Metod<cl_float> > ec;
+    ec.error(CL_INVALID_COMMAND_QUEUE);
+}
+
+TEST(ERROR, InvalidEventWaitListnrm2) {
+    ErrorClass<Nrm2Metod<cl_double> > ec;
+    ec.error(CL_INVALID_EVENT_WAIT_LIST);
+}
+
+TEST(ERROR, InvalidMemObjectnrm2) {
+    ErrorClass<Nrm2Metod<FloatComplex> > ec;
+    ec.error(clblasInvalidVecX);
+}
+TEST(ERROR, InvalidValuenrm2) {
+    ErrorClass<Nrm2Metod<DoubleComplex> > ec;
+    ec.error(clblasInsufficientMemVecX);
+}
+
+TEST(ERROR, InvalidDevicenrm2) {
+    clMath::BlasBase* base = clMath::BlasBase::getInstance();
+    if (!base->isDevSupportDoublePrecision()) {
+    ErrorClass<Nrm2Metod<DoubleComplex> > ec;
+        ec.error(CL_INVALID_DEVICE);
+    }
+}
+#endif
+
+
diff --git a/src/tests/functional/func-event.cpp b/src/tests/functional/func-event.cpp
new file mode 100644
index 0000000..0f611b8
--- /dev/null
+++ b/src/tests/functional/func-event.cpp
@@ -0,0 +1,1609 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#include <stdlib.h>             // srand()
+//#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+//
+//#include "common.h"
+//#include "blas.h"
+#include "blas-wrapper.h"
+#include "clBLAS-wrapper.h"
+#include "BlasBase.h"
+#include "blas-random.h"
+#include "timer.h"
+#include "func.h"
+
+/*
+ * Driver for the event tests of one routine wrapper type M
+ * (instantiated below with types such as GemmMetod<float>).
+ *
+ * runOut() and runIn() are the entry points used by the TEST bodies; each
+ * initialises the wrapper, runs one of the protected checks, and destroys it.
+ */
+template <typename M>
+class EventClass
+{
+public:
+    void runOut();
+    void runIn();
+
+protected:
+    bool generateData();
+    void eventOutCorrectnessTest();
+    void eventInCorrectnessTest();
+
+private:
+    // The routine wrapper under test.
+    M metod;
+};
+/*
+ * Generates the input data and prepares it for a run.
+ *
+ * Returns the result of metod.prepareDataToRun(). When that is false a
+ * "Test skipped" notice is written to stderr and SUCCEED() is recorded, so the
+ * caller can bail out without the test being reported as failed.
+ */
+template <typename T> bool
+EventClass<T>::generateData()
+{
+    metod.generateData();
+    if (metod.prepareDataToRun()) {
+        return true;
+    }
+
+    ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+        << ::std::endl
+        << ">> Can't execute the test, because data is not transfered to GPU."
+        << ::std::endl
+        << ">> Test skipped." << ::std::endl;
+    SUCCEED();
+    return false;
+}
+
+/*
+ * Entry point for the EVENT_OUT tests: initialises the wrapper with
+ * initDefault(2048, 1), runs the output-event check, then destroys the wrapper.
+ */
+template <typename M>
+void EventClass<M>::runOut()
+{
+    metod.initDefault(4 * 512, 1);
+    this->eventOutCorrectnessTest();
+    metod.destroy();
+}
+
+/*
+ * Entry point for the EVENT_IN tests: initialises the wrapper with
+ * initDefault(256, 1), runs the input-event check, then destroys the wrapper.
+ */
+template <typename M>
+void EventClass<M>::runIn()
+{
+    metod.initDefault(256, 1);
+    this->eventInCorrectnessTest();
+    metod.destroy();
+}
+
+
+/*
+ * Checks the event returned by the routine: after the routine has been run and
+ * queue 0 has been finished, the output event must report CL_COMPLETE.
+ *
+ * Does nothing when generateData() reports that the data could not be prepared.
+ * Any OpenCL error raises a fatal gtest failure and returns early.
+ */
+template <typename M> void
+EventClass<M>::eventOutCorrectnessTest()
+{
+    cl_int err;
+
+    if (generateData()) {
+
+        metod.initOutEvent();
+        err = metod.run();
+        // Message now names the call that was actually checked.
+        ASSERT_EQ(err, CL_SUCCESS) << "runClBlasFunction()";
+
+        //logEvent(events);
+        err = clFinish(metod.queues[0]);
+
+        ASSERT_EQ(err, CL_SUCCESS) << "clFinish()";
+
+        // Start from a non-complete status: CL_SUCCESS and CL_COMPLETE are both 0,
+        // so the old initial value could not reveal an unwritten result.
+        cl_int ret = CL_QUEUED;
+        err = clGetEventInfo(*metod.outEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL);
+        ASSERT_EQ(err, CL_SUCCESS) << "clGetEventInfo()";
+        ASSERT_EQ(ret, CL_COMPLETE) << "clGetEventInfo()";
+    }
+}
+
+/*
+ * Checks that the routine can be run with an event wait list.
+ *
+ * Steps:
+ *   1. Run once without a wait list and time it (run + clFinish on every queue).
+ *   2. Create a user event, install it as the single input event and run again.
+ *   3. Flush queue 0, sleep for max(first-run time, minSleepTime), then mark
+ *      the user event CL_COMPLETE.
+ *   4. Finish queue 0 and require the output event to report CL_COMPLETE.
+ *
+ * Does nothing when generateData() reports that the data could not be prepared.
+ * Failures before the user event exists are fatal (ASSERT). Later failures are
+ * non-fatal (EXPECT) so the user event is always signalled and released, and
+ * metod.inEvent never keeps pointing at the local `event`.
+ */
+template <typename T> void
+EventClass<T>::eventInCorrectnessTest()
+{
+    const nano_time_t minSleepTime = 100000000;
+    cl_int err;
+    // Non-complete initial value: CL_SUCCESS and CL_COMPLETE are both 0.
+    cl_int ret = CL_QUEUED;
+    int qmax = metod.qnum;
+
+    if (!generateData()) {
+        return;
+    }
+
+    // NOTE(review): this array is not freed here; presumably metod.destroy()
+    // releases it -- confirm.
+    metod.outEvent = new cl_event[1];
+    metod.outEvent[0] = NULL;
+
+    nano_time_t timeFirst = getCurrentTime();
+    // First run.
+    err = metod.run();
+    ASSERT_EQ(err, CL_SUCCESS) << "runClBlasFunction()";
+    for (int i = 0; i < qmax; ++i) {
+        err = clFinish(metod.queues[i]);
+        // Checked per queue; previously only the last queue's result survived.
+        ASSERT_EQ(err, CL_SUCCESS) << "clFinish()";
+    }
+    timeFirst = getCurrentTime() - timeFirst;
+
+    cl_event event = clCreateUserEvent(metod.context, &err);
+    ASSERT_EQ(err, CL_SUCCESS) << "clCreateUserEvent()";
+
+    metod.inEventCount = 1;
+    metod.inEvent = &event;
+
+    err = metod.run();
+    EXPECT_EQ(err, CL_SUCCESS) << "runClBlasFunction()";
+    const bool enqueued = (err == CL_SUCCESS);
+
+    if (enqueued) {
+        err = clFlush(metod.queues[0]);
+        EXPECT_EQ(err, CL_SUCCESS) << "clFlush()";
+
+        // Give the command time to (incorrectly) start before the event fires.
+        sleepTime((timeFirst < minSleepTime)? minSleepTime : timeFirst);
+    }
+
+    // Always signal the user event so nothing stays blocked on it.
+    err = clSetUserEventStatus(event, CL_COMPLETE);
+    EXPECT_EQ(err, CL_SUCCESS) << "clSetUserEventStatus()";
+
+    if (enqueued) {
+        err = clFinish(metod.queues[0]);
+        EXPECT_EQ(err, CL_SUCCESS) << "clFinish()";
+
+        err = clGetEventInfo(metod.outEvent[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL);
+        EXPECT_EQ(err, CL_SUCCESS) << "clGetEventInfo()";
+        EXPECT_EQ(ret, CL_COMPLETE) << "clGetEventInfo()";
+    }
+
+    // Cleanup runs on every path once the user event exists.
+    clReleaseEvent(event);
+    metod.inEventCount = 0;
+    metod.inEvent = NULL;
+}
+#ifdef DO_THEIRS
+//******************************************************//
+// GEMM output-event tests, one per element type. CHECK_DOUBLE is a macro
+// defined outside this file; presumably it skips the test when the device
+// lacks double-precision support -- confirm.
+TEST(EVENT_OUT, sgemm) {
+    EventClass<GemmMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, cgemm) {
+    EventClass<GemmMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dgemm) {
+    CHECK_DOUBLE;
+    EventClass<GemmMetod<cl_double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, zgemm) {
+    CHECK_DOUBLE;
+    EventClass<GemmMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+//******************************************************//
+// TRMM output-event tests, one per element type.
+TEST(EVENT_OUT, strmm) {
+    EventClass<TrmmMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ctrmm) {
+    EventClass<TrmmMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dtrmm) {
+    CHECK_DOUBLE;
+    EventClass<TrmmMetod<cl_double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ztrmm) {
+    CHECK_DOUBLE;
+    EventClass<TrmmMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+//******************************************************//
+// TRSM output-event tests, one per element type.
+TEST(EVENT_OUT, strsm) {
+    EventClass<TrsmMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ctrsm) {
+    EventClass<TrsmMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dtrsm) {
+    CHECK_DOUBLE;
+    EventClass<TrsmMetod<cl_double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ztrsm) {
+    CHECK_DOUBLE;
+    EventClass<TrsmMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+//******************************************************//
+// GEMV output-event tests. The complex variants are compiled only when
+// _USE_GEMV_COMPLEX is defined.
+TEST(EVENT_OUT, sgemv) {
+    EventClass<GemvMetod<float> > runner;
+    runner.runOut();
+}
+#if defined(_USE_GEMV_COMPLEX)
+TEST(EVENT_OUT, cgemv) {
+    EventClass<GemvMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+#endif
+TEST(EVENT_OUT, dgemv) {
+    CHECK_DOUBLE;
+    EventClass<GemvMetod<cl_double> > runner;
+    runner.runOut();
+}
+#if defined(_USE_GEMV_COMPLEX)
+TEST(EVENT_OUT, zgemv) {
+    CHECK_DOUBLE;
+    EventClass<GemvMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+#endif
+//******************************************************//
+// SYMV output-event tests (float and double).
+TEST(EVENT_OUT, ssymv) {
+    EventClass<SymvMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dsymv) {
+    CHECK_DOUBLE;
+    EventClass<SymvMetod<cl_double> > runner;
+    runner.runOut();
+}
+//******************************************************//
+// SYR2K output-event tests (float and double).
+TEST(EVENT_OUT, ssyr2k) {
+    EventClass<Syr2kMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dsyr2k) {
+    CHECK_DOUBLE;
+    EventClass<Syr2kMetod<cl_double> > runner;
+    runner.runOut();
+}
+//******************************************************//
+//******************************************************//
+// GEMM input-event tests, one per element type.
+TEST(EVENT_IN, sgemm) {
+    EventClass<GemmMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, cgemm) {
+    EventClass<GemmMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, dgemm) {
+    CHECK_DOUBLE;
+    EventClass<GemmMetod<cl_double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, zgemm) {
+    CHECK_DOUBLE;
+    EventClass<GemmMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+//******************************************************//
+// TRMM input-event test, single precision.
+TEST(EVENT_IN, strmm) {
+    EventClass<TrmmMetod<float> > runner;
+    runner.runIn();
+}
+// TRMM input-event test, single-precision complex.
+// Uses TrmmMetod: the previous TrsmMetod was a copy-paste slip that re-ran the
+// ctrsm check and left ctrmm untested.
+TEST(EVENT_IN, ctrmm) {
+    EventClass<TrmmMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+// TRMM input-event tests, double precision.
+// NOTE(review): "dgtrmm" looks like a typo for "dtrmm" (the EVENT_OUT test is
+// named dtrmm); the name is kept so existing test filters still match.
+TEST(EVENT_IN, dgtrmm) {
+    CHECK_DOUBLE;
+    EventClass<TrmmMetod<cl_double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ztrmm) {
+    CHECK_DOUBLE;
+    EventClass<TrmmMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+//******************************************************//
+// TRSM input-event tests, one per element type.
+TEST(EVENT_IN, strsm) {
+    EventClass<TrsmMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ctrsm) {
+    EventClass<TrsmMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, dtrsm) {
+    CHECK_DOUBLE;
+    EventClass<TrsmMetod<cl_double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ztrsm) {
+    CHECK_DOUBLE;
+    EventClass<TrsmMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+//******************************************************//
+// GEMV input-event tests. The complex variants are compiled only when
+// _USE_GEMV_COMPLEX is defined.
+TEST(EVENT_IN, sgemv) {
+    EventClass<GemvMetod<float> > runner;
+    runner.runIn();
+}
+#if defined(_USE_GEMV_COMPLEX)
+TEST(EVENT_IN, cgemv) {
+    EventClass<GemvMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+#endif
+TEST(EVENT_IN, dgemv) {
+    CHECK_DOUBLE;
+    EventClass<GemvMetod<cl_double> > runner;
+    runner.runIn();
+}
+#if defined(_USE_GEMV_COMPLEX)
+TEST(EVENT_IN, zgemv) {
+    CHECK_DOUBLE;
+    EventClass<GemvMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+#endif
+//******************************************************//
+// SYMV input-event tests (float and double).
+TEST(EVENT_IN, ssymv) {
+    EventClass<SymvMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, dsymv) {
+    CHECK_DOUBLE;
+    EventClass<SymvMetod<cl_double> > runner;
+    runner.runIn();
+}
+//******************************************************//
+// SYR2K input-event tests (float and double).
+TEST(EVENT_IN, ssyr2k) {
+    EventClass<Syr2kMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, dsyr2k) {
+    CHECK_DOUBLE;
+    EventClass<Syr2kMetod<cl_double> > runner;
+    runner.runIn();
+}
+#endif
+
+// TRMV event tests: EVENT_OUT uses runOut(), EVENT_IN uses runIn(), one test
+// per element type.
+#ifdef DO_TRMV
+TEST(EVENT_OUT, strmv) {
+    EventClass<TrmvMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dtrmv) {
+    CHECK_DOUBLE;
+    EventClass<TrmvMetod<double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ctrmv) {
+    EventClass<TrmvMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ztrmv) {
+    CHECK_DOUBLE;
+    EventClass<TrmvMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_IN, strmv) {
+    EventClass<TrmvMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, dtrmv) {
+    CHECK_DOUBLE;
+    EventClass<TrmvMetod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ctrmv) {
+    EventClass<TrmvMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ztrmv) {
+    CHECK_DOUBLE;
+    EventClass<TrmvMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+#endif
+
+// TPMV event tests: EVENT_OUT uses runOut(), EVENT_IN uses runIn(), one test
+// per element type.
+#ifdef DO_TPMV
+TEST(EVENT_OUT, stpmv) {
+    EventClass<TpmvMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dtpmv) {
+    CHECK_DOUBLE;
+    EventClass<TpmvMetod<double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ctpmv) {
+    EventClass<TpmvMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ztpmv) {
+    CHECK_DOUBLE;
+    EventClass<TpmvMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_IN, stpmv) {
+    EventClass<TpmvMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, dtpmv) {
+    CHECK_DOUBLE;
+    EventClass<TpmvMetod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ctpmv) {
+    EventClass<TpmvMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ztpmv) {
+    CHECK_DOUBLE;
+    EventClass<TpmvMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+#endif
+
+// TRSV event tests: EVENT_OUT uses runOut(), EVENT_IN uses runIn(), one test
+// per element type.
+#ifdef DO_TRSV
+TEST(EVENT_OUT, strsv) {
+    EventClass<TrsvMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dtrsv) {
+    CHECK_DOUBLE;
+    EventClass<TrsvMetod<double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ctrsv) {
+    EventClass<TrsvMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ztrsv) {
+    CHECK_DOUBLE;
+    EventClass<TrsvMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_IN, strsv) {
+    EventClass<TrsvMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, dtrsv) {
+    CHECK_DOUBLE;
+    EventClass<TrsvMetod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ctrsv) {
+    EventClass<TrsvMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ztrsv) {
+    CHECK_DOUBLE;
+    EventClass<TrsvMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+#endif
+
+// TPSV event tests: EVENT_OUT uses runOut(), EVENT_IN uses runIn(), one test
+// per element type.
+#ifdef DO_TPSV
+TEST(EVENT_OUT, stpsv) {
+    EventClass<TpsvMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, dtpsv) {
+    CHECK_DOUBLE;
+    EventClass<TpsvMetod<double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ctpsv) {
+    EventClass<TpsvMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, ztpsv) {
+    CHECK_DOUBLE;
+    EventClass<TpsvMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_IN, stpsv) {
+    EventClass<TpsvMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, dtpsv) {
+    CHECK_DOUBLE;
+    EventClass<TpsvMetod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ctpsv) {
+    EventClass<TpsvMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, ztpsv) {
+    CHECK_DOUBLE;
+    EventClass<TpsvMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+#endif
+
+
+// SYMM event tests: EVENT_IN uses runIn(), EVENT_OUT uses runOut(), one test
+// per element type.
+#ifdef DO_SYMM
+TEST(EVENT_IN, Ssymm) {
+    EventClass<SymmMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Dsymm) {
+    CHECK_DOUBLE;
+    EventClass<SymmMetod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Csymm) {
+    EventClass<SymmMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Zsymm) {
+    CHECK_DOUBLE;
+    EventClass<SymmMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_OUT, Ssymm) {
+    EventClass<SymmMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Dsymm) {
+    CHECK_DOUBLE;
+    EventClass<SymmMetod<double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Csymm) {
+    EventClass<SymmMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Zsymm) {
+    CHECK_DOUBLE;
+    EventClass<SymmMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+#endif
+
+// SYR event tests (float and double): EVENT_IN uses runIn(), EVENT_OUT uses runOut().
+#ifdef DO_SYR
+TEST(EVENT_IN, Ssyr) {
+    EventClass<SyrMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Dsyr) {
+    CHECK_DOUBLE;
+    EventClass<SyrMetod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_OUT, Ssyr) {
+    EventClass<SyrMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Dsyr) {
+    CHECK_DOUBLE;
+    EventClass<SyrMetod<double> > runner;
+    runner.runOut();
+}
+#endif
+
+// SPR event tests (float and double): EVENT_IN uses runIn(), EVENT_OUT uses runOut().
+#ifdef DO_SPR
+TEST(EVENT_IN, Sspr) {
+    EventClass<SprMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Dspr) {
+    CHECK_DOUBLE;
+    EventClass<SprMetod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_OUT, Sspr) {
+    EventClass<SprMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Dspr) {
+    CHECK_DOUBLE;
+    EventClass<SprMetod<double> > runner;
+    runner.runOut();
+}
+#endif
+
+
+// SYR2 event tests (float and double): EVENT_IN uses runIn(), EVENT_OUT uses runOut().
+#ifdef DO_SYR2
+TEST(EVENT_IN, Ssyr2) {
+    EventClass<Syr2Metod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Dsyr2) {
+    CHECK_DOUBLE;
+    EventClass<Syr2Metod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_OUT, Ssyr2) {
+    EventClass<Syr2Metod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Dsyr2) {
+    CHECK_DOUBLE;
+    EventClass<Syr2Metod<double> > runner;
+    runner.runOut();
+}
+#endif
+
+// GER/GERU event tests: EVENT_IN uses runIn(), EVENT_OUT uses runOut(); the
+// complex tests (Cgeru, Zgeru) also go through GerMetod.
+#ifdef DO_GER
+TEST(EVENT_IN, Sger) {
+    EventClass<GerMetod<float> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Dger) {
+    CHECK_DOUBLE;
+    EventClass<GerMetod<double> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Cgeru) {
+    EventClass<GerMetod<FloatComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_IN, Zgeru) {
+    CHECK_DOUBLE;
+    EventClass<GerMetod<DoubleComplex> > runner;
+    runner.runIn();
+}
+
+TEST(EVENT_OUT, Sger) {
+    EventClass<GerMetod<float> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Dger) {
+    CHECK_DOUBLE;
+    EventClass<GerMetod<double> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Cgeru) {
+    EventClass<GerMetod<FloatComplex> > runner;
+    runner.runOut();
+}
+
+TEST(EVENT_OUT, Zgeru) {
+    CHECK_DOUBLE;
+    EventClass<GerMetod<DoubleComplex> > runner;
+    runner.runOut();
+}
+#endif
+
+
+#ifdef DO_HER
+TEST(EVENT_IN, Cher) {
+    EventClass<HerMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zher) {
+    CHECK_DOUBLE;
+    EventClass<HerMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cher) {
+    EventClass<HerMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zher) {
+    CHECK_DOUBLE;
+    EventClass<HerMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_GERC
+
+TEST(EVENT_IN, Cgerc) {
+    EventClass<GercMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zgerc) {
+	CHECK_DOUBLE;
+    EventClass<GercMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cgerc) {
+    EventClass<GercMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zgerc) {
+    CHECK_DOUBLE;
+    EventClass<GercMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_HER2
+TEST(EVENT_IN, Cher2) {
+    EventClass<Her2Metod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zher2) {
+    CHECK_DOUBLE;
+    EventClass<Her2Metod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cher2) {
+    EventClass<Her2Metod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zher2) {
+    CHECK_DOUBLE;
+    EventClass<Her2Metod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_HEMM
+TEST(EVENT_IN, Chemm) {
+    EventClass<HemmMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zhemm) {
+    CHECK_DOUBLE;
+    EventClass<HemmMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Chemm) {
+    EventClass<HemmMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zhemm) {
+    CHECK_DOUBLE;
+    EventClass<HemmMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+
+#ifdef DO_HEMV
+TEST(EVENT_IN, Chemv) {
+    EventClass<HemvMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zhemv) {
+    CHECK_DOUBLE;
+    EventClass<HemvMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Chemv) {
+    EventClass<HemvMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zhemv) {
+    CHECK_DOUBLE;
+    EventClass<HemvMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_HERK
+TEST(EVENT_IN, Cherk) {
+    EventClass<HerkMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zherk) {
+    CHECK_DOUBLE;
+    EventClass<HerkMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cherk) {
+    EventClass<HerkMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zherk) {
+    CHECK_DOUBLE;
+    EventClass<HerkMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+
+#ifdef DO_HPMV
+TEST(EVENT_IN, Chpmv) {
+    EventClass<HpmvMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zhpmv) {
+    CHECK_DOUBLE;
+    EventClass<HpmvMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Chpmv) {
+    EventClass<HpmvMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zhpmv) {
+    CHECK_DOUBLE;
+    EventClass<HpmvMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+
+#ifdef DO_SPMV
+TEST(EVENT_IN, Sspmv) {
+    EventClass<SpmvMetod<cl_float> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Dspmv) {
+    CHECK_DOUBLE;
+    EventClass<SpmvMetod<cl_double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Sspmv) {
+    EventClass<SpmvMetod<cl_float> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Dspmv) {
+    CHECK_DOUBLE;
+    EventClass<SpmvMetod<cl_double> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_SPR2
+TEST(EVENT_IN, Sspr2) {
+    EventClass<Spr2Metod<float> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Dspr2) {
+    CHECK_DOUBLE;
+    EventClass<Spr2Metod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Sspr2) {
+    EventClass<Spr2Metod<float> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Dspr2) {
+    CHECK_DOUBLE;
+    EventClass<Spr2Metod<double> > ec;
+    ec.runOut();
+}
+#endif
+
+
+#ifdef DO_HPR
+TEST(EVENT_IN, Chpr) {
+    EventClass<HprMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zhpr) {
+    CHECK_DOUBLE;
+    EventClass<HprMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Chpr) {
+    EventClass<HprMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zhpr) {
+    CHECK_DOUBLE;
+    EventClass<HprMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_HPR2
+TEST(EVENT_IN, Chpr2) {
+    EventClass<Hpr2Metod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zhpr2) {
+    CHECK_DOUBLE;
+    EventClass<Hpr2Metod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Chpr2) {
+    EventClass<Hpr2Metod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zhpr2) {
+    CHECK_DOUBLE;
+    EventClass<Hpr2Metod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_GBMV
+TEST(EVENT_IN, CGBMV) {
+    EventClass<GbmvMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, ZGBMV) {
+    CHECK_DOUBLE;
+    EventClass<GbmvMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, CGBMV) {
+    EventClass<GbmvMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, ZGBMV) {
+    CHECK_DOUBLE;
+    EventClass<GbmvMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_SBMV
+TEST(EVENT_IN, Ssbmv) {
+    EventClass<SbmvMetod<float> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Dsbmv) {
+    CHECK_DOUBLE;
+    EventClass<SbmvMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Ssbmv) {
+    EventClass<SbmvMetod<float> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Dsbmv) {
+    CHECK_DOUBLE;
+    EventClass<SbmvMetod<double> > ec;
+    ec.runOut();
+}
+#endif
+
+//DOT
+
+#ifdef DO_DOT
+TEST(EVENT_IN, Sdot) {
+    EventClass<DotMetod<float> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Ddot) {
+    CHECK_DOUBLE;
+    EventClass<DotMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Sdot) {
+    EventClass<DotMetod<float> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Ddot) {
+    CHECK_DOUBLE;
+    EventClass<DotMetod<double> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_IN, Cdotu) {
+    EventClass<DotMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zdotu) {
+    CHECK_DOUBLE;
+    EventClass<DotMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cdotu) {
+    EventClass<DotMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zdotu) {
+    CHECK_DOUBLE;
+    EventClass<DotMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+//ASUM
+
+#ifdef DO_ASUM
+TEST(EVENT_IN, Sasum) {
+    EventClass<AsumMetod<float> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Dasum) {
+    CHECK_DOUBLE;
+    EventClass<AsumMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Sasum) {
+    EventClass<AsumMetod<float> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Dasum) {
+    CHECK_DOUBLE;
+    EventClass<AsumMetod<double> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_IN, Scasum) {
+    EventClass<AsumMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Dzasum) {
+    CHECK_DOUBLE;
+    EventClass<AsumMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Scasum) {
+    EventClass<AsumMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Dzasum) {
+    CHECK_DOUBLE;
+    EventClass<AsumMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+//iAMAX
+
+#ifdef DO_iAMAX
+TEST(EVENT_IN, iSamax) {
+    EventClass<iAmaxMetod<float> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, iDamax) {
+    CHECK_DOUBLE;
+    EventClass<iAmaxMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, iSamax) {
+    EventClass<iAmaxMetod<float> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, iDamax) {
+    CHECK_DOUBLE;
+    EventClass<iAmaxMetod<double> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_IN, iCamax) {
+    EventClass<iAmaxMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, iZamax) {
+    CHECK_DOUBLE;
+    EventClass<iAmaxMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, iCamax) {
+    EventClass<iAmaxMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, iZamax) {
+    CHECK_DOUBLE;
+    EventClass<iAmaxMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+//DOTC
+#ifdef DO_DOTC
+TEST(EVENT_IN, Cdotc) {
+    EventClass<DotcMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zdotc) {
+    CHECK_DOUBLE;
+    EventClass<DotcMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cdotc) {
+    EventClass<DotcMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zdotc) {
+    CHECK_DOUBLE;
+    EventClass<DotcMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+
+#ifdef DO_HBMV
+TEST(EVENT_IN, Chbmv) {
+    EventClass<HbmvMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Zhbmv) {
+    CHECK_DOUBLE;
+    EventClass<HbmvMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Chbmv) {
+    EventClass<HbmvMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Zhbmv) {
+    CHECK_DOUBLE;
+    EventClass<HbmvMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+
+#ifdef DO_TBMV
+TEST(EVENT_IN, CTBMV) {
+    EventClass<TbmvMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, ZTBMV) {
+    CHECK_DOUBLE;
+    EventClass<TbmvMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, CTBMV) {
+    EventClass<TbmvMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, ZTBMV) {
+    CHECK_DOUBLE;
+    EventClass<TbmvMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_TBSV
+TEST(EVENT_IN, CTBSV) {
+    EventClass<TbsvMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, ZTBSV) {
+    CHECK_DOUBLE;
+    EventClass<TbsvMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, CTBSV) {
+    EventClass<TbsvMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, ZTBSV) {
+    CHECK_DOUBLE;
+    EventClass<TbsvMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_HER2K
+TEST(EVENT_IN, Cher2k) {
+    EventClass<Her2kMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zher2k) {
+    CHECK_DOUBLE;
+    EventClass<Her2kMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cher2k) {
+    EventClass<Her2kMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zher2k) {
+    CHECK_DOUBLE;
+    EventClass<Her2kMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+
+#endif
+
+
+#ifdef DO_SCAL
+TEST(EVENT_IN, Sscal) {
+    EventClass<ScalMetod<float> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Dscal) {
+    CHECK_DOUBLE;
+    EventClass<ScalMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Sscal) {
+    EventClass<ScalMetod<float> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Dscal) {
+    CHECK_DOUBLE;
+    EventClass<ScalMetod<double> > ec;
+    ec.runOut();
+}
+TEST(EVENT_IN, Cscal) {
+    EventClass<ScalMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zscal) {
+    CHECK_DOUBLE;
+    EventClass<ScalMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cscal) {
+    EventClass<ScalMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zscal) {
+    CHECK_DOUBLE;
+    EventClass<ScalMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_SSCAL
+TEST(EVENT_IN, Csscal) {
+    EventClass<SscalMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zdscal) {
+    CHECK_DOUBLE;
+    EventClass<SscalMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Csscal) {
+    EventClass<SscalMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zdscal) {
+    CHECK_DOUBLE;
+    EventClass<SscalMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_SWAP
+TEST(EVENT_IN, Sswap) {
+    EventClass<SwapMetod<float> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Dswap) {
+    CHECK_DOUBLE;
+    EventClass<SwapMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Sswap) {
+    EventClass<SwapMetod<float> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Dswap) {
+    CHECK_DOUBLE;
+    EventClass<SwapMetod<double> > ec;
+    ec.runOut();
+}
+TEST(EVENT_IN, Cswap) {
+    EventClass<SwapMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zswap) {
+    CHECK_DOUBLE;
+    EventClass<SwapMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Cswap) {
+    EventClass<SwapMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zswap) {
+    CHECK_DOUBLE;
+    EventClass<SwapMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+
+//copy
+
+#ifdef DO_COPY
+TEST(EVENT_IN, Scopy) {
+    EventClass<CopyMetod<float> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Dcopy) {
+    CHECK_DOUBLE;
+    EventClass<CopyMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Scopy) {
+    EventClass<CopyMetod<float> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Dcopy) {
+    CHECK_DOUBLE;
+    EventClass<CopyMetod<double> > ec;
+    ec.runOut();
+}
+TEST(EVENT_IN, Ccopy) {
+    EventClass<CopyMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zcopy) {
+    CHECK_DOUBLE;
+    EventClass<CopyMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Ccopy) {
+    EventClass<CopyMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zcopy) {
+    CHECK_DOUBLE;
+    EventClass<CopyMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_AXPY
+TEST(EVENT_IN, Saxpy) {
+    EventClass<AxpyMetod<float> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Daxpy) {
+    CHECK_DOUBLE;
+    EventClass<AxpyMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Saxpy) {
+    EventClass<AxpyMetod<float> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Daxpy) {
+    CHECK_DOUBLE;
+    EventClass<AxpyMetod<double> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_IN, Caxpy) {
+    EventClass<AxpyMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zaxpy) {
+    CHECK_DOUBLE;
+    EventClass<AxpyMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Caxpy) {
+    EventClass<AxpyMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zaxpy) {
+    CHECK_DOUBLE;
+    EventClass<AxpyMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+
+
+#ifdef DO_ROTG
+TEST(EVENT_IN, Srotg) {
+    EventClass<RotgMetod<float> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Drotg) {
+    CHECK_DOUBLE;
+    EventClass<RotgMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Srotg) {
+    EventClass<RotgMetod<float> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Drotg) {
+    CHECK_DOUBLE;
+    EventClass<RotgMetod<double> > ec;
+    ec.runOut();
+}
+TEST(EVENT_IN, Crotg) {
+    EventClass<RotgMetod<FloatComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Zrotg) {
+    CHECK_DOUBLE;
+    EventClass<RotgMetod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Crotg) {
+    EventClass<RotgMetod<FloatComplex> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Zrotg) {
+    CHECK_DOUBLE;
+    EventClass<RotgMetod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_ROTM
+TEST(EVENT_IN, Srotm) {
+    EventClass<RotmMetod<float> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Drotm) {
+    CHECK_DOUBLE;
+    EventClass<RotmMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Srotm) {
+    EventClass<RotmMetod<float> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Drotm) {
+    CHECK_DOUBLE;
+    EventClass<RotmMetod<double> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_ROT
+TEST(EVENT_IN, Srot) {
+    EventClass<RotMetod<float> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Drot) {
+    CHECK_DOUBLE;
+    EventClass<RotMetod<double> > ec;
+    ec.runIn();
+}
+// The output-event tests instantiate the same real-valued wrappers as the input-event tests, so they share the Srot/Drot names.
+TEST(EVENT_OUT, Srot) {
+    EventClass<RotMetod<float> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Drot) {
+    CHECK_DOUBLE;
+    EventClass<RotMetod<double> > ec;
+    ec.runOut();
+}
+#endif  // DO_ROT
+
+#ifdef DO_ROTMG
+TEST(EVENT_IN, Srotmg) {
+    EventClass<RotmgMetod<float> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_IN, Drotmg) {
+    CHECK_DOUBLE;
+    EventClass<RotmgMetod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Srotmg) {
+    EventClass<RotmgMetod<float> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_OUT, Drotmg) {
+    CHECK_DOUBLE;
+    EventClass<RotmgMetod<double> > ec;
+    ec.runOut();
+}
+#endif
+
+#ifdef DO_NRM2
+TEST(EVENT_IN, Snrm2) {
+    EventClass<Nrm2Metod<float> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Dnrm2) {
+    CHECK_DOUBLE;
+    EventClass<Nrm2Metod<double> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Snrm2) {
+    EventClass<Nrm2Metod<float> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Dnrm2) {
+    CHECK_DOUBLE;
+    EventClass<Nrm2Metod<double> > ec;
+    ec.runOut();
+}
+
+TEST(EVENT_IN, Scnrm2) {
+    EventClass<Nrm2Metod<FloatComplex> > ec;
+    ec.runIn();
+}
+TEST(EVENT_IN, Dznrm2) {
+    CHECK_DOUBLE;
+    EventClass<Nrm2Metod<DoubleComplex> > ec;
+    ec.runIn();
+}
+
+TEST(EVENT_OUT, Scnrm2) {
+    EventClass<Nrm2Metod<FloatComplex> > ec;
+    ec.runOut();
+}
+TEST(EVENT_OUT, Dznrm2) {
+    CHECK_DOUBLE;
+    EventClass<Nrm2Metod<DoubleComplex> > ec;
+    ec.runOut();
+}
+#endif
diff --git a/src/tests/functional/func-images.cpp b/src/tests/functional/func-images.cpp
new file mode 100644
index 0000000..e98b2ae
--- /dev/null
+++ b/src/tests/functional/func-images.cpp
@@ -0,0 +1,268 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#include <stdlib.h>             // srand()
+//#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+//
+//#include "common.h"
+//#include "blas.h"
+#include "blas-wrapper.h"
+#include "clBLAS-wrapper.h"
+#include "BlasBase.h"
+#include "blas-random.h"
+#include "timer.h"
+#include "func.h"
+
+#include <stdio.h>
+
+template <typename M>      // M is the method wrapper under test, e.g. GemmMetod<float>
+class ImagesClass          // compares buffer and image timings of M, see images()
+{
+    enum                   // selectors accepted by setImplementation()
+    {
+        I_DEFAULT = -1,    // environment value becomes empty; images are switched off
+        I_BUFERS,          // 0: written to the environment; images are switched off
+        I_IMAGES,          // 1: written to the environment; images are switched on
+        I_CASHES           // 2: not referenced by the member functions defined in this file
+    };
+
+    M metod;               // wrapper that generates the data and runs the timed operation
+protected:
+    bool generateData();               // false when the data could not be prepared for the run
+    void setImplementation(int i);     // i is one of the selectors above
+public:
+    void images();                                 // test entry point
+    nano_time_t runRepeat(int rep, cl_int* err);   // minimum single-run time over rep runs; 0 on error, code in *err
+};
+template <typename T> void
+ImagesClass<T>::setImplementation(int i)
+{
+    // Apply selector `i` (an I_* value): update the BlasBase image state and
+    // publish `i` through the environment variable named by metod.env.
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const bool wantImages = (i == I_IMAGES);
+    char envText[100];
+
+    // Every selector but I_IMAGES turns images off (scratch images first).
+    if (!wantImages) {
+        if (blasBase->useImages()) {
+            blasBase->removeScratchImages();
+        }
+        blasBase->setUseImages(false);
+    }
+
+#if WIN32
+    // _putenv() takes one "NAME=value" string; I_DEFAULT gives "NAME=".
+    if (i != I_DEFAULT) {
+        sprintf (envText, "%s=%i", metod.env, i);
+    }
+    else {
+        sprintf (envText, "%s=", metod.env);
+    }
+    _putenv(envText);
+#else
+    // setenv() takes name and value separately; I_DEFAULT gives "".
+    envText[0] = '\0';
+    if (i != I_DEFAULT) {
+        sprintf (envText, "%i", i);
+    }
+    setenv(metod.env, envText, 1);
+#endif
+
+    // Image mode: failing to add scratch images is printed, not failed.
+    if (wantImages) {
+        blasBase->setUseImages(true);
+        if (blasBase->useImages() && blasBase->addScratchImages()) {
+            std::cerr << ">> FATAL ERROR, CANNOT CREATE SCRATCH IMAGES!"
+                      << std::endl
+                      << ">> Test skipped." << ::std::endl;
+            SUCCEED();
+        }
+    }
+}
+
+template <typename T> bool
+ImagesClass<T>::generateData()
+{
+    // Generate the data and prepare it for the run; a failure is only printed (the caller receives false).
+    metod.generateData();
+    if (metod.prepareDataToRun()) {
+        return true;
+    }
+    ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
+        << ::std::endl
+        << ">> Can't execute the test, because data is not transfered to GPU."
+        << ::std::endl
+        << ">> Test skipped." << ::std::endl;
+    SUCCEED();
+    return false;
+}
+
+template <typename M> nano_time_t
+ImagesClass<M>::runRepeat(int rep, cl_int* err)
+{
+    // Returns the shortest of `rep` timed runs, or 0 on the first CL error (the code is left in *err).
+    nano_time_t best = getCurrentTime();    // the start timestamp seeds the minimum
+    for (int pass = 0; pass < rep; ++pass) {
+        const nano_time_t started = getCurrentTime();
+        *err = metod.run();
+        if (*err == CL_SUCCESS) {
+            *err = clFinish(metod.queues[0]);   // the timed span includes waiting on queue 0
+        }
+        if (*err != CL_SUCCESS) {
+            return 0;
+        }
+        const nano_time_t elapsed = getCurrentTime() - started;
+        if (elapsed < best) { best = elapsed; }
+    }
+    return best;
+}
+
+template <typename M> void
+ImagesClass<M>::images()
+{
+    cl_int err;
+    int i= 6;                       // size multiplier: initDefault() is given 256*i
+    int iMax = 30;                  // the search ends once i reaches this value
+    nano_time_t maxTime = 1000;     // upper bound for conv2millisec(time), presumably milliseconds
+    nano_time_t minTime = 100;      // lower bound for conv2millisec(time), presumably milliseconds
+    bool next = true;               // stays true until a size passes the t1/t2 check below
+    // Find a size whose buffer run takes between minTime and maxTime, then compare buffer and image timings at that size.
+    do {
+        nano_time_t time;
+        // Probe the current size with the buffer implementation (2 runs).
+        metod.initDefault(256*i, 1);
+        bool b = generateData();
+        ASSERT_EQ(b, true) << "generateData()";     // NOTE(review): a failing ASSERT_* returns from images() at once, skipping metod.destroy()
+        setImplementation(I_BUFERS);
+        metod.initOutEvent();
+        time = runRepeat(2, &err);
+        ASSERT_EQ(err, CL_SUCCESS) << "clFinish()";
+        //std::cerr << "size = " << 256*i << "/" << i << " time = " << conv2millisec(time) << std::endl;
+        if (conv2millisec(time) < minTime) {
+            i += (((int)minTime - (int)conv2millisec(time)) /20) + 1;   // larger step the further the probe was below minTime
+			metod.destroy();
+            continue;       // too fast: re-test the loop condition and probe a larger size
+        }
+        if (conv2millisec(time) > maxTime) {
+            i = iMax;
+			metod.destroy();
+            continue;       // too slow: i == iMax ends the loop with next still true, so the final assertion fails
+        }
+		next = false;
+        // Timed comparison: 5 runs with buffers, then 5 runs with images.
+		nano_time_t time1 = runRepeat(5, &err);
+        ASSERT_EQ(err, CL_SUCCESS) << "clFinish()";
+
+        setImplementation(I_IMAGES);
+
+        nano_time_t time2 = runRepeat(5, &err);
+        ASSERT_EQ(err, CL_SUCCESS) << "clFinish()";  // NOTE(review): returning here also skips setImplementation(I_DEFAULT)
+
+        setImplementation(I_DEFAULT);   // back to the default selector: images off, empty environment value
+
+        //nano_time_t time3 = runRepeat(5, & err);
+        //ASSERT_EQ(err, CL_SUCCESS) << "clFinish()";
+        // d > 1 means the image run was faster than the buffer run.
+        double d = (double)(time1) / time2;
+        std::cerr << "size = " << 256*i
+                  << "  timeBufer = " << conv2millisec(time1)
+                  << "  timeImage = " << conv2millisec(time2)
+                  << "  t1/t2 = " << d << std::endl;
+        // A ratio below 1.2 is not accepted: move on to the next size.
+        if (d < 1.2) {
+            next = true;
+            i++;
+        }
+        metod.destroy();
+
+    } while (i < iMax && next);
+    // Passes only if some size reached a ratio of at least 1.2 before i got to iMax.
+    ASSERT_TRUE(!next) ;
+
+}
+
+// Instantiate the test
+
+//******************************************************/
+TEST(IMAGES, sgemm) {
+    ImagesClass<GemmMetod<float> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, cgemm) {
+    ImagesClass<GemmMetod<FloatComplex> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, dgemm) {
+    CHECK_DOUBLE;
+    ImagesClass<GemmMetod<cl_double> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, zgemm) {
+    CHECK_DOUBLE;
+    ImagesClass<GemmMetod<DoubleComplex> > ec;
+    ec.images();
+}//******************************************************/
+TEST(IMAGES, strmm) {
+    ImagesClass<TrmmMetod<float> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, ctrmm) {
+    ImagesClass<TrmmMetod<FloatComplex> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, dtrmm) {
+    CHECK_DOUBLE;
+    ImagesClass<TrmmMetod<cl_double> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, ztrmm) {
+    CHECK_DOUBLE;
+    ImagesClass<TrmmMetod<DoubleComplex> > ec;
+    ec.images();
+}
+//******************************************************/
+TEST(IMAGES, strsm) {
+    ImagesClass<TrsmMetod<float> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, ctrsm) {
+    ImagesClass<TrsmMetod<FloatComplex> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, dtrsm) {
+    CHECK_DOUBLE;
+    ImagesClass<TrsmMetod<cl_double> > ec;
+    ec.images();
+}
+
+TEST(IMAGES, ztrsm) {
+    CHECK_DOUBLE;
+    ImagesClass<TrsmMetod<DoubleComplex> > ec;
+    ec.images();
+}
+//******************************************************/
diff --git a/src/tests/functional/func-queue.cpp b/src/tests/functional/func-queue.cpp
new file mode 100644
index 0000000..f052388
--- /dev/null
+++ b/src/tests/functional/func-queue.cpp
@@ -0,0 +1,881 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+//
+//#include "common.h"
+//#include "blas.h"
+#include "blas-wrapper.h"
+#include "clBLAS-wrapper.h"
+#include "BlasBase.h"
+#include "blas-random.h"
+#include "timer.h"
+
+#include "func.h"
+
+template <typename M>      // M is the method wrapper under test, e.g. GemmMetod<float>
+class MQueueClass          // runs M once and checks every queue and output event
+{
+    M metod;               // wrapper object driven by init(), run() and destroy()
+protected:
+    void init();           // default setup and input data generation
+    void run();            // runs the method, then verifies each queue and its event
+    void destroy();        // forwards to metod.destroy()
+public:
+    void testQueue();      // test entry point: init(), run(), destroy()
+};
+
+template <typename M> void
+MQueueClass<M>::init()
+{
+    // Default setup (first argument 2048, second 0), then fresh input
+    // data. The output-event pointer starts out NULL; run() calls
+    // initOutEvent() before reading it.
+    const size_t elemLimit = 2 * 1024;
+    metod.initDefault(elemLimit, 0);
+    metod.generateData();
+    metod.outEvent = NULL;
+}
+
+template <typename M> void
+MQueueClass<M>::run()
+{
+    // Prepare the data; the rest of the test is pointless without it.
+    bool b = metod.prepareDataToRun();
+    ASSERT_EQ(b, true);
+
+    const int queueCount = metod.qnum;
+    metod.initOutEvent();
+
+    // Launch once, then check every queue and its output event in turn.
+    cl_int err = metod.run();
+    ASSERT_EQ(err, CL_SUCCESS);
+
+    for (int q = 0; q < queueCount; ++q) {
+        cl_int ret = CL_SUCCESS;
+
+        err = clFinish(metod.queues[q]);
+        ASSERT_EQ(err, CL_SUCCESS) << "clFinish()";
+
+        // Once the queue has finished, its event must report CL_COMPLETE.
+        err = clGetEventInfo(metod.outEvent[q], CL_EVENT_COMMAND_EXECUTION_STATUS,
+                             sizeof(ret), &ret, NULL);
+        ASSERT_EQ(err, CL_SUCCESS) << "clGetEventInfo()";
+        ASSERT_EQ(ret, CL_COMPLETE) << "clGetEventInfo()";
+    }
+}
+template <typename M> void
+MQueueClass<M>::destroy()
+{
+    metod.destroy();    // cleanup is delegated entirely to the wrapper
+}
+
+template <typename M> void
+MQueueClass<M>::testQueue()
+{
+    init();       // default setup and input data
+    run();        // a failing ASSERT_* inside run() only returns from run() ...
+    destroy();    // ... so this cleanup is reached even after a failed assertion
+}
+
+#ifdef DO_THEIRS
+//******************************************************//
+TEST(QUEUE, sgemm) {
+    MQueueClass< GemmMetod<float> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, cgemm) {
+    MQueueClass< GemmMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, dgemm) {
+    CHECK_DOUBLE;
+    MQueueClass< GemmMetod<cl_double> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, zgemm) {
+    CHECK_DOUBLE;
+    MQueueClass<GemmMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+//******************************************************//
+TEST(QUEUE, strmm) {
+    MQueueClass<TrmmMetod<float> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, ctrmm) {
+    MQueueClass<TrmmMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, dtrmm) {
+    CHECK_DOUBLE;
+    MQueueClass<TrmmMetod<cl_double> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, ztrmm) {
+    CHECK_DOUBLE;
+    MQueueClass<TrmmMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+//******************************************************//
+TEST(QUEUE, strsm) {
+    MQueueClass<TrsmMetod<float> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, ctrsm) {
+    MQueueClass<TrsmMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, dtrsm) {
+    CHECK_DOUBLE;
+    MQueueClass<TrsmMetod<cl_double> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, ztrsm) {
+    CHECK_DOUBLE;
+    MQueueClass<TrsmMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+//******************************************************//
+TEST(QUEUE, sgemv) {
+    MQueueClass<GemvMetod<float> > ec;
+    ec.testQueue();
+}
+#if defined(_USE_GEMV_COMPLEX)
+TEST(QUEUE, cgemv) {
+    MQueueClass<GemvMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+#endif
+TEST(QUEUE, dgemv) {
+    CHECK_DOUBLE;
+    MQueueClass<GemvMetod<cl_double> > ec;
+    ec.testQueue();
+}
+#if defined(_USE_GEMV_COMPLEX)
+TEST(QUEUE, zgemv) {
+    CHECK_DOUBLE;
+    MQueueClass<GemvMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+#endif
+//******************************************************//
+TEST(QUEUE, ssymv) {
+    MQueueClass<SymvMetod<float> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, dsymv) {
+    CHECK_DOUBLE;
+    MQueueClass<SymvMetod<cl_double> > ec;
+    ec.testQueue();
+}
+//******************************************************//
+TEST(QUEUE, ssyr2k) {
+    MQueueClass<Syr2kMetod<float> > ec;
+    ec.testQueue();
+}
+
+TEST(QUEUE, dsyr2k) {
+    CHECK_DOUBLE;
+    MQueueClass<Syr2kMetod<cl_double> > ec;
+    ec.testQueue();
+}
+#endif  //DO_THEIRS
+
+//******************************************************
+#ifdef DO_TRMV
+TEST(QUEUE, strmv) {
+    MQueueClass< TrmvMetod<float> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, dtrmv) {
+	CHECK_DOUBLE;
+    MQueueClass< TrmvMetod<cl_double> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, ctrmv) {
+    MQueueClass< TrmvMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, ztrmv) {
+	CHECK_DOUBLE;
+    MQueueClass< TrmvMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+#endif
+// ******************************************************/
+
+#ifdef DO_TPMV
+TEST(QUEUE, stpmv) {
+    MQueueClass< TpmvMetod<float> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, dtpmv) {
+    CHECK_DOUBLE;
+    MQueueClass< TpmvMetod<cl_double> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, ctpmv) {
+    MQueueClass< TpmvMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, ztpmv) {
+    CHECK_DOUBLE;
+    MQueueClass< TpmvMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+#endif
+
+#ifdef DO_TRSV
+TEST(QUEUE, strsv) {
+    MQueueClass< TrsvMetod<float> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, dtrsv) {
+	CHECK_DOUBLE;
+    MQueueClass< TrsvMetod<double> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, ctrsv) {
+    MQueueClass< TrsvMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, ztrsv) {
+	CHECK_DOUBLE;
+    MQueueClass< TrsvMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+#endif
+
+#ifdef DO_TPSV
+TEST(QUEUE, stpsv) {
+    MQueueClass< TpsvMetod<float> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, dtpsv) {
+    CHECK_DOUBLE;
+    MQueueClass< TpsvMetod<double> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, ctpsv) {
+    MQueueClass< TpsvMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, ztpsv) {
+    CHECK_DOUBLE;
+    MQueueClass< TpsvMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+#endif
+
+#ifdef DO_SYMM
+TEST(QUEUE, Ssymm) {
+    MQueueClass< SymmMetod<float> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, Dsymm) {
+	CHECK_DOUBLE;
+    MQueueClass< SymmMetod<cl_double> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, Csymm) {
+    MQueueClass< SymmMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, Zsymm) {
+	CHECK_DOUBLE;
+    MQueueClass< SymmMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+
+#endif
+
+#ifdef DO_SYR
+TEST(QUEUE, Ssyr) {
+    MQueueClass< SyrMetod<float> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, Dsyr) {
+    CHECK_DOUBLE;
+    MQueueClass< SyrMetod<cl_double> > ec;
+    ec.testQueue();
+}
+#endif
+
+#ifdef DO_SPR
+// QUEUE suite: SPR cases, built only when DO_SPR is defined.
+TEST(QUEUE, Sspr) {
+    MQueueClass<SprMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dspr) {
+    CHECK_DOUBLE;
+    MQueueClass<SprMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+
+#ifdef DO_SYR2
+// QUEUE suite: SYR2 cases, built only when DO_SYR2 is defined.
+TEST(QUEUE, Ssyr2) {
+    MQueueClass<Syr2Metod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dsyr2) {
+    CHECK_DOUBLE;
+    MQueueClass<Syr2Metod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_GER
+// QUEUE suite: GER cases, built only when DO_GER is defined.
+TEST(QUEUE, sger) {
+    MQueueClass<GerMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, dger) {
+    CHECK_DOUBLE;
+    MQueueClass<GerMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, cger) {
+    MQueueClass<GerMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zger) {
+    CHECK_DOUBLE;
+    MQueueClass<GerMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_GERC
+// QUEUE suite: GERC cases, built only when DO_GERC is defined.
+TEST(QUEUE, cgerc) {
+    MQueueClass<GercMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zgerc) {
+    CHECK_DOUBLE;
+    MQueueClass<GercMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_HER
+// QUEUE suite: HER cases, built only when DO_HER is defined.
+TEST(QUEUE, cher) {
+    MQueueClass<HerMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zher) {
+    CHECK_DOUBLE;
+    MQueueClass<HerMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_HER2
+// QUEUE suite: HER2 cases, built only when DO_HER2 is defined.
+TEST(QUEUE, cher2) {
+    MQueueClass<Her2Metod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zher2) {
+    CHECK_DOUBLE;
+    MQueueClass<Her2Metod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_HEMM
+// QUEUE suite: HEMM cases, built only when DO_HEMM is defined.
+TEST(QUEUE, chemm) {
+    MQueueClass<HemmMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zhemm) {
+    CHECK_DOUBLE;
+    MQueueClass<HemmMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+
+#ifdef DO_HEMV
+// QUEUE suite: HEMV cases, built only when DO_HEMV is defined.
+TEST(QUEUE, chemv) {
+    MQueueClass<HemvMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zhemv) {
+    CHECK_DOUBLE;
+    MQueueClass<HemvMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_HERK
+// QUEUE suite: HERK cases, built only when DO_HERK is defined.
+TEST(QUEUE, cherk) {
+    MQueueClass<HerkMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zherk) {
+    CHECK_DOUBLE;
+    MQueueClass<HerkMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+
+#ifdef DO_HPMV
+// QUEUE suite: HPMV cases, built only when DO_HPMV is defined.
+TEST(QUEUE, chpmv) {
+    MQueueClass<HpmvMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zhpmv) {
+    CHECK_DOUBLE;
+    MQueueClass<HpmvMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+
+#ifdef DO_SPMV
+// QUEUE suite: SPMV cases, built only when DO_SPMV is defined.
+TEST(QUEUE, sspmv) {
+    MQueueClass<SpmvMetod<cl_float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, dspmv) {
+    CHECK_DOUBLE;
+    MQueueClass<SpmvMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_SPR2
+// QUEUE suite: SPR2 cases, built only when DO_SPR2 is defined.
+TEST(QUEUE, Sspr2) {
+    MQueueClass<Spr2Metod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dspr2) {
+    CHECK_DOUBLE;
+    MQueueClass<Spr2Metod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+
+#ifdef DO_HPR
+// QUEUE suite: HPR cases, built only when DO_HPR is defined.
+TEST(QUEUE, chpr) {
+    MQueueClass<HprMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zhpr) {
+    CHECK_DOUBLE;
+    MQueueClass<HprMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_HPR2
+// QUEUE suite: HPR2 cases, built only when DO_HPR2 is defined.
+TEST(QUEUE, chpr2) {
+    MQueueClass<Hpr2Metod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zhpr2) {
+    CHECK_DOUBLE;
+    MQueueClass<Hpr2Metod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_GBMV
+// QUEUE suite: GBMV cases, built only when DO_GBMV is defined.
+TEST(QUEUE, SGBMV) {
+    MQueueClass<GbmvMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, DGBMV) {
+    CHECK_DOUBLE;
+    MQueueClass<GbmvMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, CGBMV) {
+    MQueueClass<GbmvMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, ZGBMV) {
+    CHECK_DOUBLE;
+    MQueueClass<GbmvMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+// QUEUE suite: SBMV cases. They are guarded by DO_SBMV (not DO_SYR) so that
+// the SBMV switch, rather than the unrelated SYR one, controls them.
+#ifdef DO_SBMV
+TEST(QUEUE, Ssbmv) {
+    MQueueClass< SbmvMetod<float> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, Dsbmv) {
+    CHECK_DOUBLE;
+    MQueueClass< SbmvMetod<cl_double> > ec;
+    ec.testQueue();
+}
+#endif
+
+//DOT
+
+#ifdef DO_DOT
+// QUEUE suite: DOT / DOTU cases, built only when DO_DOT is defined.
+TEST(QUEUE, Sdot) {
+    MQueueClass<DotMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Ddot) {
+    CHECK_DOUBLE;
+    MQueueClass<DotMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Cdotu) {
+    MQueueClass<DotMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zdotu) {
+    CHECK_DOUBLE;
+    MQueueClass<DotMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+//ASUM
+#ifdef DO_ASUM
+// QUEUE suite: ASUM cases, built only when DO_ASUM is defined.
+TEST(QUEUE, Sasum) {
+    MQueueClass<AsumMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dasum) {
+    CHECK_DOUBLE;
+    MQueueClass<AsumMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Scasum) {
+    MQueueClass<AsumMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dzasum) {
+    CHECK_DOUBLE;
+    MQueueClass<AsumMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+//iAMAX
+#ifdef DO_iAMAX
+// QUEUE suite: iAMAX cases, built only when DO_iAMAX is defined.
+TEST(QUEUE, iSamax) {
+    MQueueClass<iAmaxMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, iDamax) {
+    CHECK_DOUBLE;
+    MQueueClass<iAmaxMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, iCamax) {
+    MQueueClass<iAmaxMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, iZamax) {
+    CHECK_DOUBLE;
+    MQueueClass<iAmaxMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+//DOTC
+#ifdef DO_DOTC
+// QUEUE suite: DOTC cases, built only when DO_DOTC is defined.
+TEST(QUEUE, Cdotc) {
+    MQueueClass<DotcMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zdotc) {
+    CHECK_DOUBLE;
+    MQueueClass<DotcMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+
+// QUEUE suite: HBMV cases. They are guarded by DO_HBMV (not DO_SYR) so that
+// the HBMV switch, rather than the unrelated SYR one, controls them.
+#ifdef DO_HBMV
+TEST(QUEUE, Chbmv) {
+    MQueueClass< HbmvMetod<FloatComplex> > ec;
+    ec.testQueue();
+}
+TEST(QUEUE, Zhbmv) {
+    CHECK_DOUBLE;
+    MQueueClass< HbmvMetod<DoubleComplex> > ec;
+    ec.testQueue();
+}
+#endif
+
+#ifdef DO_TBMV
+// QUEUE suite: TBMV cases, built only when DO_TBMV is defined.
+TEST(QUEUE, STBMV) {
+    MQueueClass<TbmvMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, DTBMV) {
+    CHECK_DOUBLE;
+    MQueueClass<TbmvMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, CTBMV) {
+    MQueueClass<TbmvMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, ZTBMV) {
+    CHECK_DOUBLE;
+    MQueueClass<TbmvMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_TBSV
+// QUEUE suite: TBSV cases, built only when DO_TBSV is defined.
+TEST(QUEUE, STBSV) {
+    MQueueClass<TbsvMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, DTBSV) {
+    CHECK_DOUBLE;
+    MQueueClass<TbsvMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, CTBSV) {
+    MQueueClass<TbsvMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, ZTBSV) {
+    CHECK_DOUBLE;
+    MQueueClass<TbsvMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_HER2K
+// QUEUE suite: HER2K cases, built only when DO_HER2K is defined.
+TEST(QUEUE, cher2k) {
+    MQueueClass<Her2kMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, zher2k) {
+    CHECK_DOUBLE;
+    MQueueClass<Her2kMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+
+#ifdef DO_SCAL
+// QUEUE suite: SCAL cases, built only when DO_SCAL is defined.
+TEST(QUEUE, Sscal) {
+    MQueueClass<ScalMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dscal) {
+    CHECK_DOUBLE;
+    MQueueClass<ScalMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Cscal) {
+    MQueueClass<ScalMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zscal) {
+    CHECK_DOUBLE;
+    MQueueClass<ScalMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_SSCAL
+// QUEUE suite: SSCAL cases, built only when DO_SSCAL is defined.
+TEST(QUEUE, Csscal) {
+    MQueueClass<SscalMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zdscal) {
+    CHECK_DOUBLE;
+    MQueueClass<SscalMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_SWAP
+// QUEUE suite: SWAP cases, built only when DO_SWAP is defined.
+TEST(QUEUE, Sswap) {
+    MQueueClass<SwapMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dswap) {
+    CHECK_DOUBLE;
+    MQueueClass<SwapMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Cswap) {
+    MQueueClass<SwapMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zswap) {
+    CHECK_DOUBLE;
+    MQueueClass<SwapMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+
+#ifdef DO_COPY
+// QUEUE suite: COPY cases, built only when DO_COPY is defined.
+TEST(QUEUE, Scopy) {
+    MQueueClass<CopyMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dcopy) {
+    CHECK_DOUBLE;
+    MQueueClass<CopyMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Ccopy) {
+    MQueueClass<CopyMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zcopy) {
+    CHECK_DOUBLE;
+    MQueueClass<CopyMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_AXPY
+// QUEUE suite: AXPY cases, built only when DO_AXPY is defined.
+TEST(QUEUE, Saxpy) {
+    MQueueClass<AxpyMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Daxpy) {
+    CHECK_DOUBLE;
+    MQueueClass<AxpyMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Caxpy) {
+    MQueueClass<AxpyMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zaxpy) {
+    CHECK_DOUBLE;
+    MQueueClass<AxpyMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_ROTG
+// QUEUE suite: ROTG cases, built only when DO_ROTG is defined.
+TEST(QUEUE, Srotg) {
+    MQueueClass<RotgMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Drotg) {
+    CHECK_DOUBLE;
+    MQueueClass<RotgMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Crotg) {
+    MQueueClass<RotgMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zrotg) {
+    CHECK_DOUBLE;
+    MQueueClass<RotgMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_ROTM
+// QUEUE suite: ROTM cases, built only when DO_ROTM is defined.
+TEST(QUEUE, Srotm) {
+    MQueueClass<RotmMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Drotm) {
+    CHECK_DOUBLE;
+    MQueueClass<RotmMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_ROT
+// QUEUE suite: ROT cases, built only when DO_ROT is defined.
+TEST(QUEUE, Srot) {
+    MQueueClass<RotMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Drot) {
+    CHECK_DOUBLE;
+    MQueueClass<RotMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Csrot) {
+    MQueueClass<RotMetod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Zdrot) {
+    CHECK_DOUBLE;
+    MQueueClass<RotMetod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_ROTMG
+// QUEUE suite: ROTMG cases, built only when DO_ROTMG is defined.
+TEST(QUEUE, Srotmg) {
+    MQueueClass<RotmgMetod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Drotmg) {
+    CHECK_DOUBLE;
+    MQueueClass<RotmgMetod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+#endif
+
+#ifdef DO_NRM2
+// QUEUE suite: NRM2 cases, built only when DO_NRM2 is defined.
+TEST(QUEUE, Snrm2) {
+    MQueueClass<Nrm2Metod<float> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dnrm2) {
+    CHECK_DOUBLE;
+    MQueueClass<Nrm2Metod<cl_double> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Scnrm2) {
+    MQueueClass<Nrm2Metod<FloatComplex> > queueTest;
+    queueTest.testQueue();
+}
+
+TEST(QUEUE, Dznrm2) {
+    CHECK_DOUBLE;
+    MQueueClass<Nrm2Metod<DoubleComplex> > queueTest;
+    queueTest.testQueue();
+}
+#endif
diff --git a/src/tests/functional/func-thread.cpp b/src/tests/functional/func-thread.cpp
new file mode 100644
index 0000000..ba0e2b3
--- /dev/null
+++ b/src/tests/functional/func-thread.cpp
@@ -0,0 +1,938 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+//#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+//
+//#include "common.h"
+//#include "blas.h"
+#include "blas-wrapper.h"
+#include "clBLAS-wrapper.h"
+#include "BlasBase.h"
+#include "blas-random.h"
+#include "timer.h"
+#include <symv.h>
+#include "func.h"
+
+// Number of worker threads each THREAD test starts in parallel
+#define P_TH 5
+
+#if defined(_MSC_VER)
+#include "windows.h"
+#include "process.h"
+
+
+#define THREAD_ID HANDLE
+/* Start phfunc<M> on a new thread with &DATA as its argument; the handle is stored in ID. */
+#define THREAD_START(ID, DATA) \
+     ID = (HANDLE)_beginthreadex(NULL, 0, &phfunc<M>, &DATA, 0, NULL);
+/*
+ * Wait for thread ID to finish and set RET to true when its exit code is 1.
+ * The handle obtained from _beginthreadex is owned by the caller, so it is
+ * closed here once the exit code has been read. r starts at 0 so that a
+ * failed GetExitCodeThread() is reported as a failure instead of reading an
+ * uninitialised value.
+ */
+#define THREAD_WAIT(ID, RET) \
+{ \
+    DWORD r = 0;\
+    WaitForSingleObject(ID, INFINITE); \
+    GetExitCodeThread(ID, &r);\
+    CloseHandle(ID);\
+    RET = (r == 1);\
+}
+
+/*
+ * Thread entry point (Windows build).
+ *
+ * vm points to the method object of type M to execute. The routine calls
+ * run(), waits for the object's output event and then calls getResult().
+ * The thread exit code is 1 when every step returned CL_SUCCESS, else 0.
+ */
+template <typename M>
+unsigned __stdcall
+phfunc(void* vm)
+{
+    unsigned ret;
+    M* m = (M*) vm;
+
+    // A failed launch must not be masked by a later successful read-back:
+    // stop at the first step that reports an error.
+    cl_int err = m->run();
+    if (err == CL_SUCCESS) {
+        err = clWaitForEvents(1, m->outEvent);
+    }
+    if (err == CL_SUCCESS) {
+        err = m->getResult();
+    }
+
+    ret = (err == CL_SUCCESS)? 1:0;
+    _endthreadex(ret);
+    return ret;
+}
+
+#else /* !defined(_MSC_VER) */
+#include "pthread.h"
+
+#define THREAD_ID pthread_t
+/* Start phfunc<M> on a new thread with &DATA as its argument; the thread id is stored in ID. */
+#define THREAD_START(ID, DATA) \
+     pthread_create(&ID, NULL, phfunc<M>, &DATA)
+/*
+ * Join thread ID and set RET to true when the join succeeded and the thread
+ * returned a non-null value. The macro joins its ID argument (it does not
+ * rely on a variable named pt[i] existing at the call site), and r is only
+ * trusted when pthread_join() itself reported success.
+ */
+#define THREAD_WAIT(ID, RET) \
+{ \
+    void* r = NULL;\
+    int res = pthread_join(ID, &r); \
+    RET = (res == 0) && (r != NULL);\
+}
+
+
+
+/*
+ * Thread entry point (pthread build).
+ *
+ * vm points to the method object of type M to execute. The routine calls
+ * run(), waits for the object's output event and then calls getResult().
+ * Returns a non-null pointer when every step returned CL_SUCCESS and a
+ * null pointer otherwise.
+ */
+template <typename M>
+void*
+phfunc(void* vm)
+{
+    M* m = (M*) vm;
+
+    // A failed launch must not be masked by a later successful read-back:
+    // stop at the first step that reports an error.
+    cl_int err = m->run();
+    if (err == CL_SUCCESS) {
+        err = clWaitForEvents(1, m->outEvent);
+        // NOTE(review): one-second pause kept from the original code; its
+        // purpose is not evident from this file - confirm before removing.
+        sleep(1);
+    }
+    if (err == CL_SUCCESS) {
+        err = m->getResult();
+    }
+    return (void *)(err == CL_SUCCESS);
+}
+
+#endif
+
+
+// Test driver holding one reference instance of M plus P_TH further
+// instances; run() executes the latter on separate threads and compares
+// each of them with the reference instance.
+template <typename M>
+class MThreadClass
+{
+private:
+    M s_metod;          // reference instance, executed on the calling thread
+    M m_metod[P_TH];    // one instance per worker thread
+
+protected:
+    void init();
+    void run();
+    void destroy();
+
+public:
+    void mthread();     // calls init(), run() and destroy() in that order
+};
+
+// Initialises the reference instance with generated data and gives every
+// per-thread instance a copy of that data.
+template <typename M> void
+MThreadClass<M>::init()
+{
+    // PENDING: make it back to 1024
+    size_t maxElem = 128;
+
+    s_metod.initDefault(maxElem, 1);
+    s_metod.generateData();
+
+    // Workers do not generate their own data; they copy the reference set.
+    for (M* worker = m_metod; worker != m_metod + P_TH; ++worker) {
+        worker->initDefault(maxElem, 1);
+        worker->copyData(s_metod);
+    }
+}
+
+// Executes the reference instance on the calling thread, then starts P_TH
+// threads (one per m_metod element), joins them and compares each worker
+// with the reference. If the reference run() does not return CL_SUCCESS the
+// test is reported as skipped.
+template <typename M> void
+MThreadClass<M>::run()
+{
+    cl_int err;
+
+    // prepareDataToRun() must succeed for the reference and for every worker.
+    bool b = s_metod.prepareDataToRun();
+    ASSERT_EQ(b, true);
+    for (int i = 0; i < P_TH; ++i) {
+        bool b = m_metod[i].prepareDataToRun();
+        m_metod[i].initOutEvent();
+        ASSERT_EQ(b, true);
+    }
+
+    err = s_metod.run();
+    if (err != CL_SUCCESS) {
+        ::std::cerr << ">> Test skipped." << err <<::std::endl;
+        SUCCEED();
+        return;
+    }
+
+    // Reference result first.
+    err = clFinish(s_metod.queues[0]);
+    ASSERT_EQ(err, CL_SUCCESS) << "clFinish()";
+
+    err = s_metod.getResult();
+    ASSERT_EQ(err, CL_SUCCESS);
+
+    // Start all workers before joining any of them.
+    THREAD_ID pt[P_TH];
+    for (int i = 0; i < P_TH; ++i) {
+        THREAD_START(pt[i], m_metod[i]);
+    }
+
+    for (int i = 0; i < P_TH; ++i) {
+        bool ret;
+        THREAD_WAIT(pt[i], ret);
+        EXPECT_EQ(ret, true);
+        s_metod.compareData(m_metod[i]);
+    }
+}
+// Calls destroy() on the reference instance and then on every worker instance.
+template <typename M> void
+MThreadClass<M>::destroy()
+{
+    s_metod.destroy();
+    for (M* worker = m_metod; worker != m_metod + P_TH; ++worker) {
+        worker->destroy();
+    }
+}
+
+// Test entry point: init(), run() and destroy(), in that order.
+template <typename M> void
+MThreadClass<M>::mthread()
+{
+    this->init();
+    this->run();
+    this->destroy();
+}
+
+#ifdef DO_THEIRS
+// THREAD suite: GEMM, TRMM, TRSM, GEMV, SYMV and SYR2K cases, built only
+// when DO_THEIRS is defined.
+
+// GEMM
+TEST(THREAD, sgemm) {
+    MThreadClass<GemmMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, cgemm) {
+    MThreadClass<GemmMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, dgemm) {
+    CHECK_DOUBLE;
+    MThreadClass<GemmMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, zgemm) {
+    CHECK_DOUBLE;
+    MThreadClass<GemmMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+
+// TRMM
+TEST(THREAD, strmm) {
+    MThreadClass<TrmmMetod<float> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, ctrmm) {
+    MThreadClass<TrmmMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, dtrmm) {
+    CHECK_DOUBLE;
+    MThreadClass<TrmmMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, ztrmm) {
+    CHECK_DOUBLE;
+    MThreadClass<TrmmMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+
+// TRSM
+TEST(THREAD, strsm) {
+    MThreadClass<TrsmMetod<float> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, ctrsm) {
+    MThreadClass<TrsmMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, dtrsm) {
+    CHECK_DOUBLE;
+    MThreadClass<TrsmMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, ztrsm) {
+    CHECK_DOUBLE;
+    MThreadClass<TrsmMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+
+// GEMV (complex variants only with _USE_GEMV_COMPLEX)
+TEST(THREAD, sgemv) {
+    MThreadClass<GemvMetod<float> > threadTest;
+    threadTest.mthread();
+}
+
+#if defined(_USE_GEMV_COMPLEX)
+TEST(THREAD, cgemv) {
+    MThreadClass<GemvMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+TEST(THREAD, dgemv) {
+    CHECK_DOUBLE;
+    MThreadClass<GemvMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+
+#if defined(_USE_GEMV_COMPLEX)
+TEST(THREAD, zgemv) {
+    CHECK_DOUBLE;
+    MThreadClass<GemvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+// SYMV
+TEST(THREAD, ssymv) {
+    MThreadClass<SymvMetod<float> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, dsymv) {
+    CHECK_DOUBLE;
+    MThreadClass<SymvMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+
+// SYR2K
+TEST(THREAD, ssyr2k) {
+    MThreadClass<Syr2kMetod<float> > threadTest;
+    threadTest.mthread();
+}
+
+TEST(THREAD, dsyr2k) {
+    CHECK_DOUBLE;
+    MThreadClass<Syr2kMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+#endif // DO_THEIRS
+
+#ifdef DO_TRMV
+// THREAD suite: TRMV cases, built only when DO_TRMV is defined.
+TEST(THREAD, strmv) {
+    MThreadClass<TrmvMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, dtrmv) {
+    CHECK_DOUBLE;
+    MThreadClass<TrmvMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ctrmv) {
+    MThreadClass<TrmvMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ztrmv) {
+    CHECK_DOUBLE;
+    MThreadClass<TrmvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_TPMV
+// THREAD suite: TPMV cases, built only when DO_TPMV is defined.
+TEST(THREAD, stpmv) {
+    MThreadClass<TpmvMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, dtpmv) {
+    CHECK_DOUBLE;
+    MThreadClass<TpmvMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ctpmv) {
+    MThreadClass<TpmvMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ztpmv) {
+    CHECK_DOUBLE;
+    MThreadClass<TpmvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+
+#ifdef DO_TRSV
+// THREAD suite: TRSV cases, built only when DO_TRSV is defined.
+TEST(THREAD, strsv) {
+    MThreadClass<TrsvMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, dtrsv) {
+    CHECK_DOUBLE;
+    MThreadClass<TrsvMetod<double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ctrsv) {
+    MThreadClass<TrsvMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ztrsv) {
+    CHECK_DOUBLE;
+    MThreadClass<TrsvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_TPSV
+// THREAD suite: TPSV cases, built only when DO_TPSV is defined.
+TEST(THREAD, stpsv) {
+    MThreadClass<TpsvMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, dtpsv) {
+    CHECK_DOUBLE;
+    MThreadClass<TpsvMetod<double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ctpsv) {
+    MThreadClass<TpsvMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ztpsv) {
+    CHECK_DOUBLE;
+    MThreadClass<TpsvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_SYMM
+// THREAD suite: SYMM cases, built only when DO_SYMM is defined.
+TEST(THREAD, Ssymm) {
+    MThreadClass<SymmMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dsymm) {
+    CHECK_DOUBLE;
+    MThreadClass<SymmMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Csymm) {
+    MThreadClass<SymmMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zsymm) {
+    CHECK_DOUBLE;
+    MThreadClass<SymmMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_SYR
+// THREAD suite: SYR cases, built only when DO_SYR is defined.
+TEST(THREAD, Ssyr) {
+    MThreadClass<SyrMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dsyr) {
+    CHECK_DOUBLE;
+    MThreadClass<SyrMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_SPR
+// THREAD suite: SPR cases, built only when DO_SPR is defined.
+TEST(THREAD, Sspr) {
+    MThreadClass<SprMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dspr) {
+    CHECK_DOUBLE;
+    MThreadClass<SprMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+
+#ifdef DO_SYR2
+// THREAD suite: SYR2 cases, built only when DO_SYR2 is defined.
+TEST(THREAD, Ssyr2) {
+    MThreadClass<Syr2Metod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dsyr2) {
+    CHECK_DOUBLE;
+    MThreadClass<Syr2Metod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_GER
+// THREAD suite: GER cases, built only when DO_GER is defined.
+TEST(THREAD, Sger) {
+    MThreadClass<GerMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dger) {
+    CHECK_DOUBLE;
+    MThreadClass<GerMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Cger) {
+    MThreadClass<GerMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zger) {
+    CHECK_DOUBLE;
+    MThreadClass<GerMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_GERC
+// THREAD suite: GERC cases, built only when DO_GERC is defined.
+TEST(THREAD, Cgerc) {
+    MThreadClass<GercMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zgerc) {
+    CHECK_DOUBLE;
+    MThreadClass<GercMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HER
+// THREAD suite: HER cases, built only when DO_HER is defined.
+TEST(THREAD, Cher) {
+    MThreadClass<HerMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zher) {
+    CHECK_DOUBLE;
+    MThreadClass<HerMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HER2
+// THREAD suite: HER2 cases, built only when DO_HER2 is defined.
+TEST(THREAD, Cher2) {
+    MThreadClass<Her2Metod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zher2) {
+    CHECK_DOUBLE;
+    MThreadClass<Her2Metod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HEMM
+// THREAD suite: HEMM cases, built only when DO_HEMM is defined.
+TEST(THREAD, Chemm) {
+    MThreadClass<HemmMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zhemm) {
+    CHECK_DOUBLE;
+    MThreadClass<HemmMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+
+#ifdef DO_HEMV
+// THREAD suite: HEMV cases, built only when DO_HEMV is defined.
+TEST(THREAD, Chemv) {
+    MThreadClass<HemvMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zhemv) {
+    CHECK_DOUBLE;
+    MThreadClass<HemvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HERK
+// THREAD suite: HERK cases, built only when DO_HERK is defined.
+TEST(THREAD, Cherk) {
+    MThreadClass<HerkMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zherk) {
+    CHECK_DOUBLE;
+    MThreadClass<HerkMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HPMV
+// THREAD suite: HPMV cases, built only when DO_HPMV is defined.
+TEST(THREAD, Chpmv) {
+    MThreadClass<HpmvMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zhpmv) {
+    CHECK_DOUBLE;
+    MThreadClass<HpmvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+
+#ifdef DO_SPMV
+// THREAD suite: SPMV cases, built only when DO_SPMV is defined.
+TEST(THREAD, Sspmv) {
+    MThreadClass<SpmvMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dspmv) {
+    CHECK_DOUBLE;
+    MThreadClass<SpmvMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+
+#ifdef DO_SPR2
+// THREAD suite: SPR2 cases, built only when DO_SPR2 is defined.
+TEST(THREAD, Sspr2) {
+    MThreadClass<Spr2Metod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dspr2) {
+    CHECK_DOUBLE;
+    MThreadClass<Spr2Metod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HPR
+// THREAD suite: HPR cases, built only when DO_HPR is defined.
+TEST(THREAD, Chpr) {
+    MThreadClass<HprMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zhpr) {
+    CHECK_DOUBLE;
+    MThreadClass<HprMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HPR2
+// THREAD suite: HPR2 cases, built only when DO_HPR2 is defined.
+TEST(THREAD, Chpr2) {
+    MThreadClass<Hpr2Metod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zhpr2) {
+    CHECK_DOUBLE;
+    MThreadClass<Hpr2Metod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_GBMV
+// THREAD suite: GBMV cases, built only when DO_GBMV is defined.
+TEST(THREAD, SGBMV) {
+    MThreadClass<GbmvMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ZGBMV) {
+    CHECK_DOUBLE;
+    MThreadClass<GbmvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_SBMV
+// THREAD suite: SBMV cases, built only when DO_SBMV is defined.
+TEST(THREAD, Ssbmv) {
+    MThreadClass<SbmvMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dsbmv) {
+    CHECK_DOUBLE;
+    MThreadClass<SbmvMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HBMV
+// THREAD suite: HBMV cases, built only when DO_HBMV is defined.
+TEST(THREAD, Chbmv) {
+    MThreadClass<HbmvMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zhbmv) {
+    CHECK_DOUBLE;
+    MThreadClass<HbmvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+
+#ifdef DO_TBMV
+// THREAD suite: TBMV cases, built only when DO_TBMV is defined.
+TEST(THREAD, STBMV) {
+    MThreadClass<TbmvMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ZTBMV) {
+    CHECK_DOUBLE;
+    MThreadClass<TbmvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_TBSV
+// THREAD suite: TBSV cases, built only when DO_TBSV is defined.
+TEST(THREAD, STBSV) {
+    MThreadClass<TbsvMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, ZTBSV) {
+    CHECK_DOUBLE;
+    MThreadClass<TbsvMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_HER2K
+// THREAD suite: HER2K cases, built only when DO_HER2K is defined.
+TEST(THREAD, Cher2k) {
+    MThreadClass<Her2kMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zher2k) {
+    CHECK_DOUBLE;
+    MThreadClass<Her2kMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_SCAL
+// THREAD suite: SCAL cases, built only when DO_SCAL is defined.
+TEST(THREAD, Sscal) {
+    MThreadClass<ScalMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zscal) {
+    CHECK_DOUBLE;
+    MThreadClass<ScalMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_SSCAL
+// THREAD suite: SSCAL cases, built only when DO_SSCAL is defined.
+TEST(THREAD, Csscal) {
+    MThreadClass<SscalMetod<FloatComplex> > ec;
+    ec.mthread();
+}
+
+// Zdscal uses SscalMetod, like Csscal above; instantiating ScalMetod here
+// would merely repeat the Zscal test of the SCAL group.
+TEST(THREAD, Zdscal) {
+    CHECK_DOUBLE;
+    MThreadClass<SscalMetod<DoubleComplex> > ec;
+    ec.mthread();
+}
+#endif
+
+#ifdef DO_SWAP
+// THREAD suite: SWAP cases, built only when DO_SWAP is defined.
+TEST(THREAD, Sswap) {
+    MThreadClass<SwapMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zswap) {
+    CHECK_DOUBLE;
+    MThreadClass<SwapMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_AXPY
+// THREAD suite: AXPY cases, built only when DO_AXPY is defined.
+TEST(THREAD, Saxpy) {
+    MThreadClass<AxpyMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zaxpy) {
+    CHECK_DOUBLE;
+    MThreadClass<AxpyMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_COPY
+// THREAD suite: COPY cases, built only when DO_COPY is defined.
+TEST(THREAD, Scopy) {
+    MThreadClass<CopyMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dcopy) {
+    CHECK_DOUBLE;
+    MThreadClass<CopyMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Ccopy) {
+    MThreadClass<CopyMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zcopy) {
+    CHECK_DOUBLE;
+    MThreadClass<CopyMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+//DOT
+#ifdef DO_DOT
+// THREAD suite: DOT / DOTU cases, built only when DO_DOT is defined.
+TEST(THREAD, Sdot) {
+    MThreadClass<DotMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Ddot) {
+    CHECK_DOUBLE;
+    MThreadClass<DotMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Cdotu) {
+    MThreadClass<DotMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zdotu) {
+    CHECK_DOUBLE;
+    MThreadClass<DotMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+//ASUM
+#ifdef DO_ASUM
+// THREAD suite: ASUM cases, built only when DO_ASUM is defined.
+TEST(THREAD, Sasum) {
+    MThreadClass<AsumMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dasum) {
+    CHECK_DOUBLE;
+    MThreadClass<AsumMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Scasum) {
+    MThreadClass<AsumMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dzasum) {
+    CHECK_DOUBLE;
+    MThreadClass<AsumMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+//iAMAX
+#ifdef DO_iAMAX
+// THREAD suite: iAMAX cases, built only when DO_iAMAX is defined.
+TEST(THREAD, iSamax) {
+    MThreadClass<iAmaxMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, iDamax) {
+    CHECK_DOUBLE;
+    MThreadClass<iAmaxMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, iCamax) {
+    MThreadClass<iAmaxMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, iZamax) {
+    CHECK_DOUBLE;
+    MThreadClass<iAmaxMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+//DOTC
+#ifdef DO_DOTC
+// THREAD suite: DOTC cases, built only when DO_DOTC is defined.
+TEST(THREAD, Cdotc) {
+    MThreadClass<DotcMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zdotc) {
+    CHECK_DOUBLE;
+    MThreadClass<DotcMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+
+#ifdef DO_ROTG
+// THREAD suite: ROTG cases, built only when DO_ROTG is defined.
+TEST(THREAD, Srotg) {
+    MThreadClass<RotgMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zrotg) {
+    CHECK_DOUBLE;
+    MThreadClass<RotgMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_ROTM
+// THREAD suite: ROTM case, built only when DO_ROTM is defined.
+TEST(THREAD, Srotm) {
+    MThreadClass<RotmMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_ROT
+// THREAD suite: ROT cases, built only when DO_ROT is defined.
+TEST(THREAD, Srot) {
+    MThreadClass<RotMetod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Drot) {
+    CHECK_DOUBLE;
+    MThreadClass<RotMetod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Csrot) {
+    MThreadClass<RotMetod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Zdrot) {
+    CHECK_DOUBLE;
+    MThreadClass<RotMetod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_ROTMG
+// THREAD suite: ROTMG case, built only when DO_ROTMG is defined.
+TEST(THREAD, Srotmg) {
+    MThreadClass<RotmgMetod<cl_float> > threadTest;
+    threadTest.mthread();
+}
+#endif
+
+#ifdef DO_NRM2
+// THREAD suite: NRM2 cases, built only when DO_NRM2 is defined.
+TEST(THREAD, Snrm2) {
+    MThreadClass<Nrm2Metod<float> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dnrm2) {
+    CHECK_DOUBLE;
+    MThreadClass<Nrm2Metod<cl_double> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Scnrm2) {
+    MThreadClass<Nrm2Metod<FloatComplex> > threadTest;
+    threadTest.mthread();
+}
+TEST(THREAD, Dznrm2) {
+    CHECK_DOUBLE;
+    MThreadClass<Nrm2Metod<DoubleComplex> > threadTest;
+    threadTest.mthread();
+}
+#endif
diff --git a/src/tests/functional/func.h b/src/tests/functional/func.h
new file mode 100644
index 0000000..c8c983d
--- /dev/null
+++ b/src/tests/functional/func.h
@@ -0,0 +1,2804 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// Compile-time switches. Test code is wrapped in "#ifdef DO_<NAME>" blocks keyed
+// on these macros (for example "#ifdef DO_DOTC" around the DOTC thread tests).
+// Each switch is listed exactly once; DO_HER, DO_SYR2 and DO_HER2 used to appear twice.
+#define DO_THEIRS
+#define DO_TRMV
+#define DO_TRSV
+#define DO_SYMM
+#define DO_SYR
+#define DO_SPR
+#define DO_GER
+#define DO_GERC
+#define DO_HER
+#define DO_SYR2
+#define DO_HER2
+#define DO_HEMV
+#define DO_HEMM
+#define DO_HERK
+#define DO_TPMV
+#define DO_HPMV
+#define DO_SPMV
+#define DO_TPSV
+#define DO_HPR
+#define DO_SBMV
+#define DO_HPR2
+#define DO_SPR2
+#define DO_GBMV
+#define DO_HBMV
+#define DO_TBMV
+#define DO_TBSV
+#define DO_HER2K
+#define DO_SCAL
+#define DO_SSCAL
+#define DO_DOT
+#define DO_DOTC
+#define DO_SWAP
+#define DO_COPY
+#define DO_AXPY
+#define DO_ROTG
+#define DO_ROTM
+#define DO_ROTMG
+#define DO_ROT
+#define DO_NRM2
+#define DO_ASUM
+#define DO_iAMAX
+
+#ifndef FUNC_H_
+#define FUNC_H_
+
+//#define _USE_GEMV_COMPLEX
+#include <typeinfo>
+#include <her.h>
+#include <her2.h>
+#include <spmv.h>
+#include <gbmv.h>
+#include <tbmv.h>
+#include <tbsv.h>
+#include <rotmg.h>
+
+// Functions of BaseMetod Modified
+// <typeinfo> included : As using typeid()
+// testDG.h contains common definitions and enumerations used for populate()
+/*  bool prepareDataToRun();
+    void copyData(baseMetod<T> & source);
+    void initDefault(size_t s, unsigned int q, USE_BUFFER ub);
+    void destroy();
+    void compareData(baseMetod<T> & source);
+    cl_int getResult();
+
+    matrix.h
+    // Added support for Packed Matrix
+    getElement();
+    setElement();
+
+// New stuff added
+   populate() : Can generate data for general, packed, symmetric, lower-upper triangle
+   //
+   // Set diagonal elements to unity, random, zero
+   // Row-Major, Col-Major support
+   // TODO: Hermitian Matrix, Banded Matrix support
+*/
+enum USE_BUFFER // selects which host arrays baseMetod<T>::initDefault() allocates; all others are set to NULL
+{
+   	USE_ABC,	// matrices A, B and C (size*size elements each)
+   	USE_AB,	// matrices A and B
+	USE_AC,	// matrices A and C
+	USE_AXY, 	// matrix A, vectors X and Y; for TRMV and friends
+	USE_APXY,	// packed matrix AP, vectors X and Y; for TPMV and friends
+	USE_AX, 	// matrix A and vector X; for TRSV and friends
+	USE_APX,	// packed matrix AP and vector X; for TPSV and friends
+	USE_X,      // vector X only; for blas-1 routines
+	USE_XY,	// vectors X and Y
+    USE_ABXY,	// matrices A and B, vectors X and Y
+    USE_ABCXY,  // matrices A, B and C plus vectors X and Y; for xROTMG
+	USE_NOTHING	// Don't Care: Memory Allocation handled by derived Metod (xxxMetod class)
+};
+
+typedef enum BUFFER // names the operand that holds a routine's result (see baseMetod<T>::resultBuffer)
+{
+	Aresult,	// matrix A / bufA
+	APresult,	// packed matrix AP / bufAP
+	Bresult,	// matrix B / bufB
+	Cresult,	// matrix C / bufC
+	Xresult,	// vector X / bufX
+	Yresult	// vector Y / bufY
+} BUFFER_KIND;
+
+template<typename T>
+class baseMetod // state and helpers shared by the per-routine xxxMetod<T> scenario classes
+{
+protected:
+    clMath::BlasBase *base; // set from clMath::BlasBase::getInstance() in initDefault()
+public:
+    typedef T TYPE;
+    T t;
+
+    USE_BUFFER 	   inputBuffers;
+    BUFFER_KIND	   resultBuffer; // operand read back by getResult() and checked by compareData(); set by derived classes
+    BUFFER_KIND    resultBuffer_additional;
+
+    clblasOrder order; // clblasColumnMajor after initDefault(); some derived classes switch to clblasRowMajor
+
+    cl_command_queue* queues; // base->commandQueues()
+    cl_uint           qnum; // base->numCommandQueues()
+
+    cl_context       context; // base->context()
+
+    cl_event* outEvent; // NULL after initDefault(); array of qnum entries after initOutEvent()
+    cl_event* inEvent; // NULL after initDefault()
+    cl_uint   inEventCount; // 0 after initDefault()
+
+
+    int seed; // fixed to 12345 and passed to srand() in initDefault()
+    const char* env; // NULL after initDefault(); presumably an environment-variable name when a derived class sets it
+
+    size_t size; // problem dimension: matrices hold size*size, vectors size, packed matrices size*(size+1)/2 elements
+    //size_t size2;
+
+	BUFFER_KIND resultLocation;
+
+    TYPE alpha, beta; // taken from base->alpha()/base->beta() through convertMultiplier<TYPE>()
+    cl_mem bufA, bufB, bufC, bufX, bufY, bufAP; // device buffers; NULL until prepareDataToRun() creates them
+    TYPE *A, *AP, *B, *C, *X, *Y; // host arrays allocated by initDefault(); NULL when not requested
+    size_t ASize, BSize, CSize, XSize, YSize;
+
+    void initOutEvent(); // allocates outEvent with qnum NULL entries
+    bool prepareDataToRun(); // creates a device buffer for every non-NULL host array
+    void copyData(baseMetod<T> & source); // memcpy of host arrays plus alpha/beta from source
+    void initDefault(size_t s, unsigned int q, USE_BUFFER ub); // common setup and host allocation
+    void destroy(); // frees host arrays, releases device buffers, resets pointers to NULL
+    void compareData(baseMetod<T> & source); // compareMatrices<T>() on the result operand
+    cl_int getResult(); // blocking read of the result operand from the device
+
+};
+
+// Uploads every non-NULL host array to a device buffer via base->createEnqueueBuffer().
+// Only the array selected by resultBuffer is created CL_MEM_READ_WRITE, the others
+// CL_MEM_READ_ONLY. Returns false at the first NULL buffer, true otherwise.
+template <typename T> bool
+baseMetod<T>::prepareDataToRun()
+{
+    const size_t matrixBytes = size * size * sizeof (TYPE);
+    const size_t packedBytes = ((size * (size + 1)) / 2) * sizeof (TYPE);
+    const size_t vectorBytes = size * sizeof (TYPE);
+    if (A != NULL) {
+        bufA = base->createEnqueueBuffer(A, matrixBytes, 0, (resultBuffer != Aresult) ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE);
+        if (bufA == NULL) { return false; }
+    }
+    if (B != NULL) {
+        bufB = base->createEnqueueBuffer(B, matrixBytes, 0, (resultBuffer != Bresult) ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE);
+        if (bufB == NULL) { return false; }
+    }
+    if (C != NULL) {
+        bufC = base->createEnqueueBuffer(C, matrixBytes, 0, (resultBuffer != Cresult) ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE);
+        if (bufC == NULL) { return false; }
+    }
+    if (AP != NULL) {
+        bufAP = base->createEnqueueBuffer(AP, packedBytes, 0, (resultBuffer != APresult) ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE);
+        if (bufAP == NULL) { return false; }
+    }
+    if (X != NULL) {
+        bufX = base->createEnqueueBuffer(X, vectorBytes, 0, (resultBuffer != Xresult) ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE);
+        if (bufX == NULL) { return false; }
+    }
+    if (Y != NULL) {
+        bufY = base->createEnqueueBuffer(Y, vectorBytes, 0, (resultBuffer != Yresult) ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE);
+        if (bufY == NULL) { return false; }
+    }
+    return true;
+}
+// Allocates one output event slot per command queue and clears each slot to NULL.
+template <typename T> void
+baseMetod<T>::initOutEvent()
+{
+    outEvent = new cl_event[qnum];
+    cl_event *const end = outEvent + qnum;
+    for (cl_event *slot = outEvent; slot != end; ++slot) { *slot = NULL; }
+}
+// Copies host operands and the alpha/beta scalars from `source` into this object.
+// An array is copied only when source's pointer is non-NULL. Nothing is allocated
+// here, so the matching destination array must already exist.
+template <typename T> void
+baseMetod<T>::copyData(baseMetod<T> & source)
+{
+    const size_t matrixBytes = size * size * sizeof(TYPE);
+    const size_t packedBytes = ((size * (size+1)) /2 )* sizeof(TYPE);
+    const size_t vectorBytes = size * sizeof(TYPE);
+
+    if (source.A != NULL) {
+        memcpy(A, source.A, matrixBytes);
+    }
+    if (source.B != NULL) {
+        memcpy(B, source.B, matrixBytes);
+    }
+    if (source.C != NULL) {
+        memcpy(C, source.C, matrixBytes);
+    }
+    if (source.AP != NULL) {
+        memcpy(AP, source.AP, packedBytes);
+    }
+    if (source.X != NULL) {
+        memcpy(X, source.X, vectorBytes);
+    }
+    if (source.Y != NULL) {
+        memcpy(Y, source.Y, vectorBytes);
+    }
+
+    beta = source.beta;
+    alpha = source.alpha;
+}
+// Common setup: stores the size, takes queues/context/alpha/beta from clMath::BlasBase::getInstance(), allocates the host arrays selected by ub.
+template <typename T> void
+baseMetod<T>::initDefault(size_t s, unsigned int q, USE_BUFFER ub)
+{
+    size = s;
+
+    order = clblasColumnMajor;
+
+    seed = 12345;
+    base = clMath::BlasBase::getInstance();
+
+    if (q  > 0) { // q == 0 leaves the queue count configured on base untouched
+        base->setNumCommandQueues(q);
+    }
+
+    queues = base->commandQueues();
+    qnum = base->numCommandQueues();
+
+    context = base->context();
+    alpha = convertMultiplier<TYPE>(base->alpha());
+    beta = convertMultiplier<TYPE>(base->beta());
+
+    outEvent= NULL;
+    inEvent = NULL;
+
+    inEventCount = 0;
+
+    switch (ub) { // every case allocates the requested arrays and sets the remaining pointers to NULL
+    case USE_ABC:
+        A = new TYPE[size * size];
+        B = new TYPE[size * size];
+        C = new TYPE[size * size];
+		AP = NULL;
+		X  = NULL;
+		Y  = NULL;
+        break;
+
+    case USE_AB:
+        A = new TYPE[size * size];
+        B = new TYPE[size * size];
+        AP = NULL;
+		C  = NULL;
+		X  = NULL;
+		Y  = NULL;
+        break;
+
+	case USE_AC:
+        A = new TYPE[size * size];
+        C = new TYPE[size * size];
+        AP = NULL;
+        B  = NULL;
+        X  = NULL;
+        Y  = NULL;
+        break;
+
+
+    case USE_AX:
+
+		A = new TYPE[size * size];
+		X = new TYPE[size];
+		AP = NULL;
+		B  = NULL;
+		C  = NULL;
+		Y  = NULL;
+		break;
+
+    case USE_AXY:
+		A = new TYPE[size * size];
+		X = new TYPE[size];
+		Y = new TYPE[size];
+		AP = NULL;
+		B  = NULL;
+		C  = NULL;
+		break;
+
+    case USE_APXY:
+		AP = new TYPE[(size * (size + 1)) /2]; // packed storage: size*(size+1)/2 elements
+		X = new TYPE[size];
+		Y = new TYPE[size];
+		A = NULL;
+		B  = NULL;
+		C  = NULL;
+		break;
+
+    case USE_APX:
+		AP = new TYPE[(size * (size + 1)) /2]; // packed storage: size*(size+1)/2 elements
+		X = new TYPE[size];
+		A  = NULL;
+		B  = NULL;
+		C  = NULL;
+		Y  = NULL;
+		break;
+
+	case USE_ABXY:
+		X = new TYPE[size];
+		Y = new TYPE[size];
+		AP = NULL;
+		A = new TYPE[size * size];
+		B  = new TYPE[size * size];
+		C  = NULL;
+		break;
+
+    // Currently used only for xROTMG requiring 5 buffers
+    // change if it is reusable for more tests
+    case USE_ABCXY:
+        X = new TYPE[size];
+        Y = new TYPE[size];
+        AP = NULL;
+        A = new TYPE[size*size];//for D1
+        B = new TYPE[size*size];// for D2
+        C = new TYPE[size*size];//for SPARAM
+        break;
+
+	case USE_X:
+		X = new TYPE[size];
+		Y = NULL;
+		AP = NULL;
+		A = NULL;
+		B  = NULL;
+		C  = NULL;
+		break;
+
+    case USE_XY: // suitable for BLAS-1 routines: copy & swap
+		X = new TYPE[size];
+		Y = new TYPE[size];
+		AP = NULL;
+		A = NULL;
+		B  = NULL;
+		C  = NULL;
+		break;
+
+    default: // USE_NOTHING or any unlisted value: no host array is allocated here
+        AP = NULL;
+		A  = NULL;
+		B  = NULL;
+		C  = NULL;
+		X  = NULL;
+		Y  = NULL;
+    }
+    // Device buffers start out NULL; prepareDataToRun() creates them.
+    bufA = NULL;
+    bufB = NULL;
+    bufC = NULL;
+    bufX = NULL;
+    bufY = NULL;
+    bufAP = NULL;
+
+    srand(seed); // seed was fixed to 12345 above
+    //std::cerr << "init = " << size << std::endl;
+
+    env = NULL;
+}
+
+// Releases what this object owns: the outEvent array, the host arrays and the
+// device buffers. Every released member is reset to NULL, so a second destroy()
+// call is harmless. The cl_event handles stored in outEvent are not released here.
+template <typename T> void
+baseMetod<T>::destroy()
+{
+    // delete[] accepts NULL. Clearing the pointer afterwards fixes the stale
+    // outEvent that was previously left behind (double delete on a second call).
+    delete[] outEvent;
+    outEvent = NULL;
+
+    delete[] this->A;
+    delete[] this->B;
+    delete[] this->C;
+    delete[] this->AP;
+    delete[] this->X;
+    delete[] this->Y;
+    A = NULL;
+    B = NULL;
+    C = NULL;
+    AP = NULL;
+    X = NULL;
+    Y = NULL;
+
+    // Buffers that were never created are still NULL; skip them instead of
+    // passing an invalid handle to clReleaseMemObject().
+    if (bufA != NULL) { clReleaseMemObject(bufA); }
+    if (bufB != NULL) { clReleaseMemObject(bufB); }
+    if (bufC != NULL) { clReleaseMemObject(bufC); }
+    if (bufAP != NULL) { clReleaseMemObject(bufAP); }
+    if (bufX != NULL) { clReleaseMemObject(bufX); }
+    if (bufY != NULL) { clReleaseMemObject(bufY); }
+
+    bufA = NULL;
+    bufB = NULL;
+    bufC = NULL;
+    bufAP = NULL;
+    bufX = NULL;
+    bufY = NULL;
+}
+
+// Compares this object's result operand with the same operand of `source`
+// by calling compareMatrices<T>(). resultBuffer of *this* selects the operand
+// and the geometry handed to the comparison:
+//   Aresult, Bresult, Cresult : size x size matrix, lda = size, this->order
+//   Xresult, Yresult          : size x 1 matrix, lda = size, column-major
+//   APresult                  : size x size packed matrix, lda = 0, this->order
+// Any other value is handled like APresult, which is the operand the
+// previous ternary chain already fell back to; before this change the
+// geometry variables were left uninitialized in that case.
+template <typename T> void
+baseMetod<T>::compareData(baseMetod<T> & source)
+{
+    T* mine = NULL;
+    T* theirs = NULL;
+
+    // Defaults describe a full size x size matrix in the object's own order.
+    size_t m = size;
+    size_t n = size;
+    size_t lda = size;
+    clblasOrder fOrder = order;
+
+    switch (resultBuffer) {
+    case Aresult:
+        mine = A;
+        theirs = source.A;
+        break;
+    case Bresult:
+        mine = B;
+        theirs = source.B;
+        break;
+    case Cresult:
+        mine = C;
+        theirs = source.C;
+        break;
+    case Xresult:
+        mine = X;
+        theirs = source.X;
+        n = 1;
+        fOrder = clblasColumnMajor;
+        break;
+    case Yresult:
+        mine = Y;
+        theirs = source.Y;
+        n = 1;
+        fOrder = clblasColumnMajor;
+        break;
+    default: // APresult
+        mine = AP;
+        theirs = source.AP;
+        lda = 0; // compareMatrices expects lda = 0 for a packed matrix
+        break;
+    }
+
+    compareMatrices<T>(fOrder, m, n, mine, theirs, lda);
+}
+
+// Reads the result operand back from the device with a blocking
+// clEnqueueReadBuffer() on queues[0] and returns that call's status.
+// resultBuffer selects the host array, the device buffer and the element count:
+// size*size for A/B/C, size for X/Y, size*(size+1)/2 for AP.
+template <typename T> cl_int
+baseMetod<T>::getResult()
+{
+    T* hostDst = NULL;
+    cl_mem devSrc = NULL;
+    size_t elements = 0;
+
+    switch (resultBuffer) {
+    case Aresult:
+        hostDst = A;
+        devSrc = bufA;
+        elements = size * size;
+        break;
+    case Bresult:
+        hostDst = B;
+        devSrc = bufB;
+        elements = size * size;
+        break;
+    case Cresult:
+        hostDst = C;
+        devSrc = bufC;
+        elements = size * size;
+        break;
+    case Xresult:
+        hostDst = X;
+        devSrc = bufX;
+        elements = size;
+        break;
+    case Yresult:
+        hostDst = Y;
+        devSrc = bufY;
+        elements = size;
+        break;
+    case APresult:
+        hostDst = AP;
+        devSrc = bufAP;
+        elements = (size * (size + 1))/2;
+        break;
+    default:
+        // Unrecognised kind: fall back to the AP buffers with a zero-byte transfer.
+        hostDst = AP;
+        devSrc = bufAP;
+        break;
+    }
+
+    return clEnqueueReadBuffer(queues[0], devSrc, CL_TRUE, 0, elements * sizeof(TYPE), hostDst, 0, NULL, NULL);
+}
+///////
+// GEMM scenario built on baseMetod: operands A, B, C; C is the result buffer.
+template<typename T>
+class GemmMetod : public baseMetod<T>
+{
+private:
+    typedef T TYPE;
+
+    clblasTranspose transA; // forwarded to randomGemmMatrices() and gemm()
+    clblasTranspose transB; // forwarded to randomGemmMatrices() and gemm()
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_ABC, then GEMM defaults (no transposition, result in C).
+template <typename T> void
+GemmMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABC);
+    this->env = "AMD_CLBLAS_GEMM_IMPLEMENTATION";
+    this->resultBuffer = Cresult;
+    transB = clblasNoTrans;
+    transA = clblasNoTrans;
+}
+
+// Fills A, B and C through randomGemmMatrices(); alpha and beta are passed by address.
+template <typename T> void
+GemmMetod<T>::generateData()
+{
+    bool withAlpha = this->base->useAlpha();
+    bool withBeta = this->base->useBeta();
+
+    randomGemmMatrices<TYPE>(this->order, transA, transB, this->size, this->size, this->size,
+            withAlpha, &this->alpha, this->A, this->size, this->B, this->size,
+            withBeta, &this->beta, this->C, this->size);
+}
+
+// Calls the clblas gemm wrapper on bufA/bufB/bufC and returns its status as cl_int.
+template <typename T> cl_int
+GemmMetod<T>::run()
+{
+    return (cl_int)::clMath::clblas::gemm(this->order, transA, transB, this->size, this->size, this->size,
+        this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size,
+        this->beta, this->bufC, 0, this->size,
+        this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// TRMM scenario built on baseMetod: operands A and B; B is the result buffer.
+template<typename T>
+class TrmmMetod  : public baseMetod<T>
+{
+    typedef T TYPE;
+
+    clblasTranspose transA;
+    clblasTranspose transB; // set in initDefault() but not read by any method of this class
+    clblasUplo uplo;
+    clblasSide side;
+    clblasDiag diag;
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_AB, then the TRMM defaults (left side, upper, unit diagonal).
+template <typename T> void
+TrmmMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AB);
+    transA = clblasNoTrans;
+    transB = clblasNoTrans;
+    side = clblasLeft;
+    uplo = clblasUpper;
+    diag = clblasUnit;
+    this->resultBuffer = Bresult;
+    this->env = "AMD_CLBLAS_TRMM_IMPLEMENTATION";
+}
+
+// Fills A and B through randomTrmmMatrices(); alpha is passed by address.
+template <typename T> void
+TrmmMetod<T>::generateData()
+{
+    bool useAlpha = this->base->useAlpha();
+    randomTrmmMatrices<TYPE>(this->order, side, uplo, diag, this->size, this->size,
+            useAlpha, &this->alpha, this->A, this->size, this->B, this->size);
+}
+
+// Calls the clblas trmm wrapper on bufA/bufB. The diagonal mode now comes from
+// `diag` (it used to be a hard-coded clblasUnit), matching generateData().
+template <typename T> cl_int
+TrmmMetod<T>::run()
+{
+    return (cl_int)::clMath::clblas::trmm(this->order, side, uplo, transA, diag, this->size, this->size, this->alpha,
+          this->bufA, 0, this->size, this->bufB, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// TRSM scenario built on baseMetod: operands A and B; B is the result buffer.
+template<typename T>
+class TrsmMetod  : public baseMetod<T>
+{
+    typedef T TYPE;
+
+    clblasTranspose transA;
+    clblasUplo uplo;
+    clblasSide side;
+    clblasDiag diag;
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_AB, then the TRSM defaults (left side, upper, unit diagonal).
+template <typename T> void
+TrsmMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AB);
+    this->env = "AMD_CLBLAS_TRSM_IMPLEMENTATION";
+    this->resultBuffer = Bresult;
+
+    diag = clblasUnit;
+    uplo = clblasUpper;
+    side = clblasLeft;
+    transA = clblasNoTrans;
+}
+
+// Fills A and B through randomTrsmMatrices(); alpha is passed by address.
+template <typename T> void
+TrsmMetod<T>::generateData()
+{
+    bool withAlpha = this->base->useAlpha();
+    randomTrsmMatrices<T>(this->order, side, uplo, diag, this->size, this->size,
+            withAlpha, &this->alpha, this->A, this->size, this->B, this->size);
+}
+
+// Calls the clblas trsm wrapper on bufA/bufB and returns its status as cl_int.
+template <typename T> cl_int
+TrsmMetod<T>::run()
+{
+    return (cl_int)::clMath::clblas::trsm(this->order, side, uplo, transA, diag, this->size, this->size,
+        this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size,
+        this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T>
+class GemvMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+    clblasTranspose transA;
+    clblasTranspose transB;
+    clblasTranspose transC;
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+template <typename T> void
+GemvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABC);
+
+    transA = clblasNoTrans;
+    transB = clblasNoTrans;
+    transC = clblasNoTrans;
+	this->resultBuffer = Cresult;
+}
+
+template <typename T> void
+GemvMetod<T>::generateData()
+{
+    bool useAlpha = this->base->useAlpha();
+    bool useBeta = this->base->useBeta();
+
+    randomGemmxMatrices<T>(this->order, transA, transB, transC,
+            this->size, this->size, this->size, useAlpha,
+            &this->alpha, this->A, this->size, this->B, this->size, useBeta,
+            &this->beta, this->C, this->size);
+
+}
+
+template <typename T> cl_int
+GemvMetod<T>::run()
+{
+    return (cl_int)::clMath::clblas::gemv(this->order, transA,
+            this->size, this->size, this->alpha, this->bufA, 0, this->size,
+            this->bufB, 0, 1, this->beta, this->bufC, 0, 1,
+            this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+    return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// SYMV scenario built on baseMetod: A, B and C are allocated; C is the result buffer.
+template<typename T>
+class SymvMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+    clblasTranspose transA;
+    clblasTranspose transB;
+    clblasTranspose transC;
+    clblasUplo uplo;
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_ABC, then upper triangle, no transposition, result in C.
+template <typename T> void
+SymvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABC);
+    this->resultBuffer = Cresult;
+
+    uplo = clblasUpper;
+    transC = clblasNoTrans;
+    transB = clblasNoTrans;
+    transA = clblasNoTrans;
+}
+
+// Fills A, B and C through randomGemmxMatrices(); alpha/beta are passed by address.
+template <typename T> void
+SymvMetod<T>::generateData()
+{
+    bool withAlpha = this->base->useAlpha();
+    bool withBeta = this->base->useBeta();
+
+    randomGemmxMatrices<T>(this->order, transA, transB, transC, this->size, this->size, this->size,
+            withAlpha, &this->alpha, this->A, this->size, this->B, this->size,
+            withBeta, &this->beta, this->C, this->size);
+}
+
+// Calls the clblas symv wrapper on bufA/bufB/bufC and returns its status as cl_int.
+template <typename T> cl_int
+SymvMetod<T>::run()
+{
+    return (cl_int)::clMath::clblas::symv(this->order, uplo, this->size, this->alpha, this->bufA, 0, this->size,
+            this->bufB, 0, 1, this->beta, this->bufC, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// SYR2K scenario built on baseMetod: operands A, B, C; C is the result buffer.
+template<typename T>
+class Syr2kMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+
+    clblasTranspose transA;
+    clblasTranspose transB;
+    clblasUplo uplo;
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_ABC, then upper triangle, no transposition, result in C.
+template <typename T> void
+Syr2kMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABC);
+    this->resultBuffer = Cresult;
+    uplo = clblasUpper;
+    transB = clblasNoTrans;
+    transA = clblasNoTrans;
+}
+
+// Fills A, B and C through randomGemmMatrices(); the argument before &alpha is always true here.
+template <typename T> void
+Syr2kMetod<T>::generateData()
+{
+    bool withBeta = this->base->useBeta();
+    randomGemmMatrices<T>(this->order, transA, transB, this->size, this->size, this->size,
+        true, &this->alpha, this->A, this->size, this->B, this->size,
+        withBeta, &this->beta, this->C, this->size);
+}
+
+// Calls the clblas syr2k wrapper on bufA/bufB/bufC and returns its status as cl_int.
+template <typename T> cl_int
+Syr2kMetod<T>::run()
+{
+    return (cl_int)::clMath::clblas::syr2k(this->order, uplo, transA, this->size, this->size, this->alpha,
+        this->bufA, 0, this->size, this->bufB, 0, this->size, this->beta, this->bufC, 0, this->size,
+        this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// TRMV scenario built on baseMetod. initDefault() allocates matrix A and the
+// vectors X and Y; X is the result buffer, and bufY is handed to trmv in the
+// argument slot annotated "as Xcopy" in run().
+template<typename T>
+class TrmvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+
+    clblasTranspose transA;
+    clblasUplo uplo;
+    clblasDiag diagA;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_AXY, then the TRMV defaults:
+// row-major order, lower triangle, no transposition, unit diagonal, result in X.
+template <typename T> void
+TrmvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->resultBuffer = Xresult;
+    this->order = clblasRowMajor;
+    diagA  = clblasUnit;
+    uplo   = clblasLower;
+    transA = clblasNoTrans;
+}
+
+// Populates A and X through populate(), tagged with CLBLAS_TRMV.
+// The creation flags for A are RANDOM_INIT, plus ROW_MAJOR_ORDER when the
+// order is row-major, plus LOWER_HALF_ONLY or UPPER_HALF_ONLY according to uplo.
+// X is populated with the arguments (size, 1, size) and no flags; Y is left as is.
+template <typename T> void
+TrmvMetod<T>::generateData()
+{
+    BlasRoutineID routine = CLBLAS_TRMV;
+
+    int flags = 0;
+    flags = flags | RANDOM_INIT;
+    if (this->order == clblasRowMajor) {
+        flags = flags | ROW_MAJOR_ORDER;
+    }
+    if (this->uplo == clblasLower) {
+        flags = flags | LOWER_HALF_ONLY;
+    }
+    else {
+        flags = flags | UPPER_HALF_ONLY;
+    }
+
+    // Matrix A
+    populate( this->A, this->size, this->size, this->size, routine, flags);
+    // Vector X
+    populate( this->X, this->size, 1, this->size, routine);
+}
+
+// Maps T to its DataType tag (float, double, FloatComplex; anything else is
+// tagged TYPE_COMPLEX_DOUBLE), then calls the clblas trmv wrapper and returns
+// its status as cl_int.
+template <typename T> cl_int
+TrmvMetod<T>::run()
+{
+    DataType type;
+
+    if (typeid(T) == typeid(float)) {
+        type = TYPE_FLOAT;
+    }
+    else if (typeid(T) == typeid(double)) {
+        type = TYPE_DOUBLE;
+    }
+    else if (typeid(T) == typeid(FloatComplex)) {
+        type = TYPE_COMPLEX_FLOAT;
+    }
+    else {
+        type = TYPE_COMPLEX_DOUBLE;
+    }
+
+    return (cl_int)::clMath::clblas::trmv(type, this->order, uplo, transA, diagA, this->size,
+            this->bufA, 0, this->size, this->bufX, 0, 1, this->bufY /* as Xcopy */,
+            this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+}
+
+/////////////////////////////////////////////////////////////////////
+
+// TRSV scenario built on baseMetod: initDefault() allocates matrix A and
+// vector X; X is the result buffer. transA, uplo and diagA are public and are
+// forwarded to trsv() by run(); uplo and diagA are also handed to the data
+// generator.
+template<typename T>
+class TrsvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+
+    clblasTranspose transA;
+    clblasUplo uplo;
+    clblasDiag diagA;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_AX, then the TRSV defaults:
+// row-major order, lower triangle, no transposition, unit diagonal, result in X.
+template <typename T> void
+TrsvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AX);
+
+    this->order = clblasRowMajor;
+    transA = clblasNoTrans;
+    uplo   = clblasLower;
+    diagA  = clblasUnit;
+
+    this->resultBuffer = Xresult;
+}
+
+// Fills A and X through randomTrsvMatrices().
+//
+// The previous version also built a creationFlags value (RANDOM_INIT plus the
+// order and uplo flags) that nothing read: its only consumers, two populate()
+// calls, were already commented out. That dead computation is removed here;
+// the call that generates the data is unchanged.
+template <typename T> void
+TrsvMetod<T>::generateData()
+{
+    randomTrsvMatrices(this->order, this->uplo, this->diagA,
+                       this->size,
+                       this->A, this->size,
+                       this->X, 1);
+}
+
+// Maps T to its DataType tag (float, double, FloatComplex; anything else is
+// tagged TYPE_COMPLEX_DOUBLE), then calls the clblas trsv wrapper on bufA/bufX
+// and returns its status as cl_int.
+template <typename T> cl_int
+TrsvMetod<T>::run()
+{
+    DataType type;
+
+    // typeid() comparisons pick the tag when run() executes.
+    if (typeid(T) == typeid(float)) {
+        type = TYPE_FLOAT;
+    }
+    else if (typeid(T) == typeid(double)) {
+        type = TYPE_DOUBLE;
+    }
+    else if (typeid(T) == typeid(FloatComplex)) {
+        type = TYPE_COMPLEX_FLOAT;
+    }
+    else {
+        type = TYPE_COMPLEX_DOUBLE;
+    }
+
+    // Same argument list as before, regrouped by operand.
+    return (cl_int)::clMath::clblas::trsv(type, this->order, uplo, transA, diagA,
+            this->size,
+            this->bufA, 0, this->size,
+            this->bufX, 0, 1,
+            this->qnum, this->queues,
+            this->inEventCount, this->inEvent, this->outEvent);
+}
+
+/////////////////////////////////////////////////////////////////////
+
+// TPSV scenario built on baseMetod: operands A and X; X is the result buffer.
+// NOTE(review): initDefault() requests USE_AX (a full size*size A) although A is
+// generated with 0 in the lda slot and passed to tpsv; USE_APX / AP may have been intended - confirm.
+template<typename T>
+class TpsvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+
+    clblasTranspose transA;
+    clblasUplo uplo;
+    clblasDiag diagA;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_AX, then the TPSV defaults:
+// row-major order, lower triangle, no transposition, unit diagonal, result in X.
+template <typename T> void
+TpsvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AX);
+
+    this->resultBuffer = Xresult;
+    this->order = clblasRowMajor;
+    diagA  = clblasUnit;
+    uplo   = clblasLower;
+    transA = clblasNoTrans;
+}
+
+// Fills A and X through randomTrsvMatrices(), passing 0 as the argument that follows A.
+template <typename T> void
+TpsvMetod<T>::generateData()
+{
+    randomTrsvMatrices(this->order, this->uplo, this->diagA, this->size, this->A, 0, this->X, 1);
+}
+
+// Maps T to its DataType tag, then calls the clblas tpsv wrapper and returns its status as cl_int.
+template <typename T> cl_int
+TpsvMetod<T>::run()
+{
+    DataType type;
+    if      (typeid(T) == typeid(float))        { type = TYPE_FLOAT; }
+    else if (typeid(T) == typeid(double))       { type = TYPE_DOUBLE; }
+    else if (typeid(T) == typeid(FloatComplex)) { type = TYPE_COMPLEX_FLOAT; }
+    else                                        { type = TYPE_COMPLEX_DOUBLE; }
+
+    return (cl_int)::clMath::clblas::tpsv(type, this->order, uplo, transA, diagA, this->size, this->bufA, 0,
+            this->bufX, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+// SYMM scenario built on baseMetod: initDefault() allocates A, B and C;
+// C is the result buffer. side and uplo are public and are forwarded to
+// symm() by run().
+template<typename T>
+class SymmMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+
+    clblasSide side;
+    clblasUplo uplo;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Base-class setup with USE_ABC, then the SYMM defaults:
+// row-major order, lower triangle, left side, result in C.
+template <typename T> void
+SymmMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABC);
+
+    // Defaults exercised by this scenario.
+    this->resultBuffer = Cresult;
+    this->order = clblasRowMajor;
+    side   = clblasLeft;
+    uplo   = clblasLower;
+}
+
+// Populates A, B and C through populate(), tagged with CLBLAS_SYMM.
+// All three matrices get RANDOM_INIT, plus ROW_MAJOR_ORDER when the order is
+// row-major. A alone additionally gets LOWER_HALF_ONLY or UPPER_HALF_ONLY,
+// chosen by uplo.
+template <typename T> void
+SymmMetod<T>::generateData()
+{
+    BlasRoutineID routine = CLBLAS_SYMM;
+
+    // Flags shared by A, B and C.
+    int commonFlags = 0;
+    commonFlags = commonFlags | RANDOM_INIT;
+    // Column-major is the default and needs no flag.
+    if (this->order == clblasRowMajor) {
+        commonFlags = commonFlags | ROW_MAJOR_ORDER;
+    }
+
+    // Only A is restricted to one triangle.
+    int flagsForA = commonFlags;
+    if (this->uplo == clblasLower) {
+        flagsForA = flagsForA | LOWER_HALF_ONLY;
+    }
+    else {
+        flagsForA = flagsForA | UPPER_HALF_ONLY;
+    }
+
+    // Matrix A
+    populate(this->A, this->size, this->size, this->size, routine, flagsForA );
+    // Matrix B
+    populate(this->B, this->size, this->size, this->size, routine, commonFlags);
+    // Matrix C
+    populate(this->C, this->size, this->size, this->size, routine, commonFlags);
+}
+
+// Calls the clblas symm wrapper on bufA/bufB/bufC and returns its status
+// as cl_int.
+template <typename T> cl_int
+SymmMetod<T>::run()
+{
+    return (cl_int)::clMath::clblas::symm(
+            this->order, side, uplo,
+            this->size, this->size,
+            this->alpha,
+            this->bufA, 0, this->size,
+            this->bufB, 0, this->size,
+            this->beta,
+            this->bufC, 0, this->size,
+            this->qnum, this->queues,
+            this->inEventCount, this->inEvent, this->outEvent);
+}
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+class SyrMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+    clblasUplo uplo;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Assumptions
+// 1. Testing for Row Major order.
+// 2. Lower triangle
+//
+template <typename T> void
+SyrMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AX);
+    this->order = clblasRowMajor;
+    uplo   = clblasLower;
+    this->resultBuffer = Aresult;
+}
+
+// Delegates operand generation to randomSyrMatrices(), passing A with a
+// leading dimension of size and X with an increment of 1.
+template <typename T>
+void SyrMetod<T>::generateData()
+{
+    randomSyrMatrices(this->order, this->uplo, this->size, false,
+                      &this->alpha,
+                      this->A, this->size,
+                      this->X, 1);
+}
+
+// Calls the syr wrapper on bufX/bufA and returns its status as cl_int.
+template <typename T>
+cl_int SyrMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::syr(this->order, this->uplo, this->size, this->alpha,
+                              this->bufX, 0, 1,
+                              this->bufA, 0, this->size,
+                              this->qnum, this->queues,
+                              this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for SYR2: run() calls ::clMath::clblas::syr2 on the buffers
+// inherited from baseMetod<T>.
+template <typename T>
+class Syr2Metod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to syr2()
+};
+
+// Default SYR2 configuration: base set-up with USE_AXY, row-major order,
+// lower triangle, Aresult selected as the result buffer.
+template <typename T>
+void Syr2Metod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Aresult;
+}
+
+// Delegates operand generation to randomSyr2Matrices(): A with leading
+// dimension size, X and Y with increment 1.
+template <typename T>
+void Syr2Metod<T>::generateData()
+{
+    randomSyr2Matrices(this->order, this->uplo, this->size, false,
+                       &this->alpha,
+                       this->A, this->size,
+                       this->X, 1,
+                       this->Y, 1);
+}
+
+// Calls the syr2 wrapper on bufX/bufY/bufA and returns its status as cl_int.
+template <typename T>
+cl_int Syr2Metod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::syr2(this->order, this->uplo, this->size, this->alpha,
+                               this->bufX, 0, 1,
+                               this->bufY, 0, 1,
+                               this->bufA, 0, this->size,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for GER: run() calls ::clMath::clblas::ger on the buffers
+// inherited from baseMetod<T>.
+template <typename T>
+class GerMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    int incx, incy;     // vector increments (run() itself passes literal 1s)
+    int m;              // vector length term used by generateData()
+};
+
+// Default GER configuration: base set-up with USE_AXY, row-major order,
+// Aresult selected as the result buffer.
+//
+// s - problem size forwarded to baseMetod<T>::initDefault
+// q - forwarded unchanged to baseMetod<T>::initDefault
+template <typename T> void
+GerMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+    this->order = clblasRowMajor;
+    this->resultBuffer = Aresult;
+
+    // These members were previously left indeterminate. Give them values
+    // that agree with run(), which passes increments of 1 and a size x size
+    // problem.
+    incx = 1;
+    incy = 1;
+    m = static_cast<int>(s);
+}
+
+// Generates the GER operands through populate(): a size x size matrix A in
+// the configured storage order, plus vectors X and Y of size elements.
+template <typename T> void
+GerMetod<T>::generateData()
+{
+    int creationFlags = RANDOM_INIT;
+    if (this->order == clblasRowMajor) {
+        // Column-major is the default layout; row-major must be requested.
+        creationFlags |= ROW_MAJOR_ORDER;
+    }
+
+    BlasRoutineID BlasFn = CLBLAS_GER;
+
+    // Matrix A
+    populate(this->A, this->size, this->size, this->size, BlasFn, creationFlags);
+
+    // Vectors X and Y. run() always passes size elements with an increment
+    // of 1, so the extent is taken from this->size (as GercMetod does)
+    // rather than from the m/incx/incy members, which run() does not use.
+    populate(this->X, this->size, 1, this->size, BlasFn, 0);
+    populate(this->Y, this->size, 1, this->size, BlasFn, 0);
+}
+
+// Calls the ger wrapper on bufX/bufY/bufA and returns its status as cl_int.
+template <typename T>
+cl_int GerMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::ger(this->order, this->size, this->size, this->alpha,
+                              this->bufX, 0, 1,
+                              this->bufY, 0, 1,
+                              this->bufA, 0, this->size,
+                              this->qnum, this->queues,
+                              this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Method object for GERC: run() calls ::clMath::clblas::gerc on the buffers
+// inherited from baseMetod<T>.
+template <typename T>
+class GercMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    int incx, incy;     // not read by the methods defined for this class
+    int m;              // not read by the methods defined for this class
+};
+
+// Default GERC configuration: base set-up with USE_AXY, row-major order,
+// Aresult selected as the result buffer.
+template <typename T>
+void GercMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->resultBuffer = Aresult;
+    this->order        = clblasRowMajor;
+}
+
+// Generates the GERC operands through populate(): a size x size matrix A in
+// the configured storage order, plus vectors X and Y of size elements.
+template <typename T>
+void GercMetod<T>::generateData()
+{
+    int matrixFlags = 0 | RANDOM_INIT;
+    if (this->order == clblasRowMajor) {
+        // Column-major is the default layout; row-major must be requested.
+        matrixFlags = matrixFlags | ROW_MAJOR_ORDER;
+    }
+
+    // NOTE(review): the GER routine id is used for GERC data as well;
+    // confirm that is intended.
+    BlasRoutineID routine = CLBLAS_GER;
+
+    populate(this->A, this->size, this->size, this->size, routine, matrixFlags);
+    populate(this->X, this->size, 1, this->size, routine, 0);
+    populate(this->Y, this->size, 1, this->size, routine, 0);
+}
+
+// Calls the gerc wrapper on bufX/bufY/bufA and returns its status as cl_int.
+template <typename T>
+cl_int GercMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::gerc(this->order, this->size, this->size, this->alpha,
+                               this->bufX, 0, 1,
+                               this->bufY, 0, 1,
+                               this->bufA, 0, this->size,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Method object for HER: run() calls ::clMath::clblas::her on the buffers
+// inherited from baseMetod<T>.
+template <typename T>
+class HerMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to her()
+};
+
+// Default HER configuration: base set-up with USE_AX, row-major order,
+// lower triangle, Aresult selected as the result buffer.
+template <typename T>
+void HerMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AX);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Aresult;
+}
+
+// Delegates operand generation to randomHerMatrices(): A with leading
+// dimension size, X with increment 1.
+template <typename T>
+void HerMetod<T>::generateData()
+{
+    randomHerMatrices(this->order, this->uplo, this->size,
+                      &this->alpha,
+                      this->A, this->size,
+                      this->X, 1);
+}
+
+
+// Calls the her wrapper with CREAL(alpha) as the scalar and returns its
+// status as cl_int.
+template <typename T>
+cl_int HerMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::her(this->order, this->uplo, this->size,
+                              CREAL(this->alpha),
+                              this->bufX, 0, 1,
+                              this->bufA, 0, this->size,
+                              this->qnum, this->queues,
+                              this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for HER2: run() calls ::clMath::clblas::her2 on the buffers
+// inherited from baseMetod<T>.
+template <typename T>
+class Her2Metod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to her2()
+};
+
+// Default HER2 configuration: base set-up with USE_AXY, row-major order,
+// lower triangle, Aresult selected as the result buffer.
+template <typename T>
+void Her2Metod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Aresult;
+}
+
+// Delegates operand generation to randomHer2Matrices<T>(): A with leading
+// dimension size, X and Y with increment 1.
+template <typename T>
+void Her2Metod<T>::generateData()
+{
+    randomHer2Matrices<T>(this->order, this->uplo, this->size,
+                          &this->alpha,
+                          this->A, this->size,
+                          this->X, 1,
+                          this->Y, 1);
+}
+
+// Calls the her2 wrapper on bufX/bufY/bufA and returns its status as cl_int.
+template <typename T>
+cl_int Her2Metod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::her2(this->order, this->uplo, this->size, this->alpha,
+                               this->bufX, 0, 1,
+                               this->bufY, 0, 1,
+                               this->bufA, 0, this->size,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for HEMM: run() calls ::clMath::clblas::hemm on the buffers
+// inherited from baseMetod<T>.
+template <typename T>
+class HemmMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasSide side;    // side argument passed to hemm()
+    clblasUplo uplo;    // triangle selector passed to hemm()
+};
+
+// Default HEMM configuration: base set-up with USE_ABC, row-major order,
+// left side, lower triangle, Cresult selected as the result buffer.
+template <typename T>
+void HemmMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABC);
+
+    this->side         = clblasLeft;
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Cresult;
+}
+
+// Delegates operand generation to randomGemmMatrices<T>() with no transposes:
+// A, B and C are all described as size x size with leading dimension size.
+template <typename T>
+void HemmMetod<T>::generateData()
+{
+    randomGemmMatrices<T>(this->order, clblasNoTrans, clblasNoTrans,
+                          this->size, this->size, this->size,
+                          false, &(this->alpha),
+                          this->A, this->size,
+                          this->B, this->size,
+                          false, &(this->beta),
+                          this->C, this->size);
+}
+
+// Calls the hemm wrapper on bufA/bufB/bufC and returns its status as cl_int.
+template <typename T>
+cl_int HemmMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::hemm(this->order, this->side, this->uplo,
+                               this->size, this->size, this->alpha,
+                               this->bufA, 0, this->size,
+                               this->bufB, 0, this->size,
+                               this->beta,
+                               this->bufC, 0, this->size,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+// HEMV
+
+// Method object for HEMV: run() calls ::clMath::clblas::hemv on the buffers
+// inherited from baseMetod<T>.
+template <typename T>
+class HemvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to hemv()
+};
+
+// Default HEMV configuration: base set-up with USE_AXY, row-major order,
+// lower triangle, Yresult selected as the result buffer.
+template <typename T>
+void HemvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Yresult;
+}
+
+// Delegates operand generation to randomHemvMatrices(): A with leading
+// dimension size, X and Y with increment 1.
+template <typename T>
+void HemvMetod<T>::generateData()
+{
+    randomHemvMatrices(this->order, this->uplo, this->size,
+                       false, &this->alpha,
+                       this->A, this->size,
+                       this->X, 1,
+                       false, &this->beta,
+                       this->Y, 1);
+}
+
+// Calls the hemv wrapper on bufA/bufX/bufY and returns its status as cl_int.
+template <typename T>
+cl_int HemvMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::hemv(this->order, this->uplo, this->size, this->alpha,
+                               this->bufA, 0, this->size,
+                               this->bufX, 0, 1,
+                               this->beta,
+                               this->bufY, 0, 1,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+/////////////////////////////////////////////////////
+// Method object for HERK: run() calls ::clMath::clblas::herk on the buffers
+// inherited from baseMetod<T>.
+template <typename T>
+class HerkMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;             // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;            // triangle selector passed to herk()
+    clblasTranspose transA;     // transpose mode of A passed to herk()
+};
+
+// Default HERK configuration: base set-up with USE_AC, row-major order,
+// lower triangle, A not transposed, Cresult selected as the result buffer.
+template <typename T>
+void HerkMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AC);
+
+    this->transA       = clblasNoTrans;
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Cresult;
+}
+
+// Delegates operand generation to randomGemmMatrices<T>(). No B operand is
+// supplied (NULL pointer, leading dimension 0); A and C use leading
+// dimension size.
+template <typename T>
+void HerkMetod<T>::generateData()
+{
+    randomGemmMatrices<T>(this->order, this->transA, clblasNoTrans,
+                          this->size, this->size, this->size,
+                          false, &(this->alpha),
+                          this->A, this->size,
+                          NULL, 0,
+                          false, &(this->beta),
+                          this->C, this->size);
+}
+
+// Calls the herk wrapper with CREAL(alpha) and CREAL(beta) as the scalars
+// and returns its status as cl_int.
+template <typename T>
+cl_int HerkMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::herk(this->order, this->uplo, this->transA,
+                               this->size, this->size,
+                               CREAL(this->alpha),
+                               this->bufA, 0, this->size,
+                               CREAL(this->beta),
+                               this->bufC, 0, this->size,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for TPMV: run() calls ::clMath::clblas::tpmv on the packed
+// matrix buffer bufAP and the vector buffers inherited from baseMetod<T>.
+template <typename T>
+class TpmvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;             // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasTranspose trans;      // transpose mode passed to tpmv()
+    clblasUplo uplo;            // triangle selector passed to tpmv()
+    clblasDiag diag;            // diagonal kind passed to tpmv()
+};
+
+// Default TPMV configuration: base set-up with USE_APXY, row-major order,
+// no transpose, lower triangle, unit diagonal, Xresult selected as the
+// result buffer. generateData() passes 0 as the leading dimension of the
+// packed matrix.
+template <typename T>
+void TpmvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_APXY);
+
+    this->diag         = clblasUnit;
+    this->uplo         = clblasLower;
+    this->trans        = clblasNoTrans;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Xresult;
+}
+
+// Generates the TPMV operands through populate(): the packed matrix AP
+// (leading dimension 0) and the vector X of size elements.
+template <typename T>
+void TpmvMetod<T>::generateData()
+{
+    int packedFlags = 0 | RANDOM_INIT | PACKED_MATRIX;
+
+    // Column-major is the default layout; row-major must be requested.
+    if (this->order == clblasRowMajor) {
+        packedFlags = packedFlags | ROW_MAJOR_ORDER;
+    }
+
+    // Exactly one of the two triangle flags is always set.
+    if (this->uplo == clblasLower) {
+        packedFlags = packedFlags | LOWER_HALF_ONLY;
+    } else {
+        packedFlags = packedFlags | UPPER_HALF_ONLY;
+    }
+
+    BlasRoutineID routine = CLBLAS_TRMV;
+
+    populate(this->AP, this->size, this->size, 0, routine, packedFlags);
+    populate(this->X, this->size, 1, this->size, routine);
+}
+
+// Maps T to its DataType tag, then calls the tpmv wrapper. bufY is handed
+// over as the copy of X; the wrapper's status is returned as cl_int.
+template <typename T>
+cl_int TpmvMetod<T>::run()
+{
+    DataType type;
+
+    // Anything that is not float, double or FloatComplex is treated as
+    // complex double.
+    if (typeid(T) == typeid(float)) {
+        type = TYPE_FLOAT;
+    } else if (typeid(T) == typeid(double)) {
+        type = TYPE_DOUBLE;
+    } else if (typeid(T) == typeid(FloatComplex)) {
+        type = TYPE_COMPLEX_FLOAT;
+    } else {
+        type = TYPE_COMPLEX_DOUBLE;
+    }
+
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::tpmv(type, this->order, this->uplo, this->trans, this->diag,
+                               this->size,
+                               this->bufAP, 0,
+                               this->bufX, 0, 1,
+                               this->bufY /* as Xcopy */,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+///////////////////////////////////////////////////////////////////////////
+// Method object for SPMV: run() calls ::clMath::clblas::spmv on the packed
+// matrix buffer bufAP and the vector buffers inherited from baseMetod<T>.
+template <typename T>
+class SpmvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to spmv()
+};
+
+// Default SPMV configuration: base set-up with USE_APXY, row-major order,
+// lower triangle, Yresult selected as the result buffer.
+template <typename T>
+void SpmvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_APXY);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Yresult;
+}
+
+// Delegates operand generation to randomSpmvMatrices(): packed matrix AP,
+// X and Y with increment 1.
+template <typename T>
+void SpmvMetod<T>::generateData()
+{
+    randomSpmvMatrices(this->order, this->uplo, this->size,
+                       false, &this->alpha,
+                       this->AP,
+                       this->X, 1,
+                       false, &this->beta,
+                       this->Y, 1);
+}
+
+// Calls the spmv wrapper on bufAP/bufX/bufY and returns its status as cl_int.
+template <typename T>
+cl_int SpmvMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::spmv(this->order, this->uplo, this->size, this->alpha,
+                               this->bufAP, 0,
+                               this->bufX, 0, 1,
+                               this->beta,
+                               this->bufY, 0, 1,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Method object for HPMV: run() calls ::clMath::clblas::hpmv on the packed
+// matrix buffer bufAP and the vector buffers inherited from baseMetod<T>.
+template <typename T>
+class HpmvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to hpmv()
+};
+
+// Default HPMV configuration: base set-up with USE_APXY, row-major order,
+// lower triangle, Yresult selected as the result buffer.
+template <typename T>
+void HpmvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_APXY);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Yresult;
+}
+
+// Delegates operand generation to randomHemvMatrices(), passing the packed
+// matrix AP with a leading dimension of 0; X and Y use increment 1.
+template <typename T>
+void HpmvMetod<T>::generateData()
+{
+    randomHemvMatrices(this->order, this->uplo, this->size,
+                       false, &this->alpha,
+                       this->AP, 0,
+                       this->X, 1,
+                       false, &this->beta,
+                       this->Y, 1);
+}
+
+// Calls the hpmv wrapper on bufAP/bufX/bufY and returns its status as cl_int.
+template <typename T>
+cl_int HpmvMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::hpmv(this->order, this->uplo, this->size, this->alpha,
+                               this->bufAP, 0,
+                               this->bufX, 0, 1,
+                               this->beta,
+                               this->bufY, 0, 1,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for SPR: run() calls ::clMath::clblas::spr on bufX and the
+// packed matrix buffer bufAP inherited from baseMetod<T>.
+template <typename T>
+class SprMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to spr()
+};
+
+// Default SPR configuration: base set-up with USE_APX, row-major order,
+// lower triangle, APresult selected as the result buffer.
+template <typename T>
+void SprMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_APX);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = APresult;
+}
+
+// Delegates operand generation to randomSyrMatrices(), passing the packed
+// matrix AP with a leading dimension of 0 and X with increment 1.
+template <typename T>
+void SprMetod<T>::generateData()
+{
+    randomSyrMatrices(this->order, this->uplo, this->size, false,
+                      &this->alpha,
+                      this->AP, 0,
+                      this->X, 1);
+}
+
+// Calls the spr wrapper on bufX/bufAP and returns its status as cl_int.
+template <typename T>
+cl_int SprMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::spr(this->order, this->uplo, this->size, this->alpha,
+                              this->bufX, 0, 1,
+                              this->bufAP, 0,
+                              this->qnum, this->queues,
+                              this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+///////////////////////////////////////////////////////////
+
+// Method object for HPR: run() calls ::clMath::clblas::hpr on bufX and the
+// packed matrix buffer bufAP inherited from baseMetod<T>.
+template <typename T>
+class HprMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to hpr()
+};
+
+// Default HPR configuration: base set-up with USE_APX, row-major order,
+// lower triangle, APresult selected as the result buffer.
+template <typename T>
+void HprMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_APX);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = APresult;
+}
+
+// Delegates operand generation to randomHerMatrices(), passing the packed
+// matrix AP with a leading dimension of 0 and X with increment 1.
+template <typename T>
+void HprMetod<T>::generateData()
+{
+    randomHerMatrices(this->order, this->uplo, this->size,
+                      &this->alpha,
+                      this->AP, 0,
+                      this->X, 1);
+}
+
+
+// Calls the hpr wrapper with CREAL(alpha) as the scalar and returns its
+// status as cl_int.
+template <typename T>
+cl_int HprMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::hpr(this->order, this->uplo, this->size,
+                              CREAL(this->alpha),
+                              this->bufX, 0, 1,
+                              this->bufAP, 0,
+                              this->qnum, this->queues,
+                              this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for HPR2: run() calls ::clMath::clblas::hpr2 on bufX, bufY
+// and the packed matrix buffer bufAP inherited from baseMetod<T>.
+template <typename T>
+class Hpr2Metod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to hpr2()
+};
+
+// Default HPR2 configuration: base set-up with USE_APXY, row-major order,
+// lower triangle, APresult selected as the result buffer.
+template <typename T>
+void Hpr2Metod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_APXY);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = APresult;
+}
+
+// Delegates operand generation to randomHer2Matrices<T>(), passing the
+// packed matrix AP with a leading dimension of 0; X and Y use increment 1.
+template <typename T>
+void Hpr2Metod<T>::generateData()
+{
+    randomHer2Matrices<T>(this->order, this->uplo, this->size,
+                          &this->alpha,
+                          this->AP, 0,
+                          this->X, 1,
+                          this->Y, 1);
+}
+
+// Calls the hpr2 wrapper on bufX/bufY/bufAP and returns its status as cl_int.
+template <typename T>
+cl_int Hpr2Metod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::hpr2(this->order, this->uplo, this->size, this->alpha,
+                               this->bufX, 0, 1,
+                               this->bufY, 0, 1,
+                               this->bufAP, 0,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for SPR2: run() calls ::clMath::clblas::spr2 on bufX, bufY
+// and the packed matrix buffer bufAP inherited from baseMetod<T>.
+template <typename T>
+class Spr2Metod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;    // triangle selector passed to spr2()
+};
+
+// Default SPR2 configuration: base set-up with USE_APXY, row-major order,
+// lower triangle, APresult selected as the result buffer.
+template <typename T>
+void Spr2Metod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_APXY);
+
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = APresult;
+}
+
+// Delegates operand generation to randomSyr2Matrices(), passing the packed
+// matrix AP with a leading dimension of 0; X and Y use increment 1.
+template <typename T>
+void Spr2Metod<T>::generateData()
+{
+    randomSyr2Matrices(this->order, this->uplo, this->size, false,
+                       &this->alpha,
+                       this->AP, 0,
+                       this->X, 1,
+                       this->Y, 1);
+}
+
+// Calls the spr2 wrapper on bufX/bufY/bufAP and returns its status as cl_int.
+template <typename T>
+cl_int Spr2Metod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::spr2(this->order, this->uplo, this->size, this->alpha,
+                               this->bufX, 0, 1,
+                               this->bufY, 0, 1,
+                               this->bufAP, 0,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Method object for GBMV: run() calls clMath::clblas::gbmv on the buffers
+// inherited from baseMetod<T>.
+//
+// TYPE and transA are declared public, matching the other *Metod classes in
+// this file; they used to sit before the first access specifier and were
+// therefore private.
+template<typename T>
+class GbmvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;             // element type of this instantiation
+
+    clblasTranspose transA;     // transpose mode passed to gbmv()
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Default GBMV configuration: base set-up with USE_AXY, row-major order,
+// A not transposed, Yresult selected as the result buffer.
+template <typename T>
+void GbmvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->transA       = clblasNoTrans;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Yresult;
+}
+
+// Delegates operand generation to randomGbmvMatrices(): a size x size
+// problem, A with leading dimension size, X and Y with increment 1.
+template <typename T>
+void GbmvMetod<T>::generateData()
+{
+    randomGbmvMatrices(this->order, this->transA,
+                       this->size, this->size,
+                       &this->alpha, &this->beta,
+                       this->A, this->size,
+                       this->X, 1,
+                       this->Y, 1);
+}
+
+// Calls the gbmv wrapper with both band-width arguments fixed at 1 and
+// returns its status as cl_int.
+template <typename T>
+cl_int GbmvMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        clMath::clblas::gbmv(this->order, this->transA,
+                             this->size, this->size, 1, 1,
+                             this->alpha,
+                             this->bufA, 0, this->size,
+                             this->bufX, 0, 1,
+                             this->beta,
+                             this->bufY, 0, 1,
+                             this->qnum, this->queues,
+                             this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Method object for TBMV: run() calls clMath::clblas::tbmv on the buffers
+// inherited from baseMetod<T>.
+//
+// TYPE and the configuration fields are declared public, matching the other
+// *Metod classes in this file; they used to sit before the first access
+// specifier and were therefore private.
+template<typename T>
+class TbmvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;             // element type of this instantiation
+
+    clblasTranspose transA;     // transpose mode passed to tbmv()
+    clblasUplo uplo;            // triangle selector passed to tbmv()
+    clblasDiag diag;            // diagonal kind passed to tbmv()
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Default TBMV configuration: base set-up with USE_AXY, row-major order,
+// no transpose, upper triangle, non-unit diagonal, Yresult selected as the
+// result buffer.
+//
+// NOTE(review): run() passes bufY to tbmv as a trailing buffer right after
+// X, the slot TpmvMetod labels "as Xcopy" while selecting Xresult. Confirm
+// that Yresult is the intended result buffer here.
+template <typename T>
+void TbmvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->diag         = clblasNonUnit;
+    this->uplo         = clblasUpper;
+    this->transA       = clblasNoTrans;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Yresult;
+}
+
+// Delegates operand generation to randomTbmvMatrices(): A with leading
+// dimension size, X with increment 1.
+template <typename T>
+void TbmvMetod<T>::generateData()
+{
+    randomTbmvMatrices(this->size,
+                       this->A, this->size,
+                       this->X, 1);
+}
+
+// Maps T to its DataType tag, then calls the tbmv wrapper with the band
+// argument fixed at 1; the wrapper's status is returned as cl_int.
+template <typename T>
+cl_int TbmvMetod<T>::run()
+{
+    DataType type;
+
+    // Anything that is not float, double or FloatComplex is treated as
+    // complex double.
+    if (typeid(T) == typeid(float)) {
+        type = TYPE_FLOAT;
+    } else if (typeid(T) == typeid(double)) {
+        type = TYPE_DOUBLE;
+    } else if (typeid(T) == typeid(FloatComplex)) {
+        type = TYPE_COMPLEX_FLOAT;
+    } else {
+        type = TYPE_COMPLEX_DOUBLE;
+    }
+
+    const cl_int status = static_cast<cl_int>(
+        clMath::clblas::tbmv(type, this->order, this->uplo, this->transA, this->diag,
+                             this->size, 1,
+                             this->bufA, 0, this->size,
+                             this->bufX, 0, 1,
+                             this->bufY,
+                             this->qnum, this->queues,
+                             this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for SBMV: run() calls clMath::clblas::sbmv on the buffers
+// inherited from baseMetod<T>.
+//
+// TYPE and uplo are declared public, matching the other *Metod classes in
+// this file; they used to sit before the first access specifier and were
+// therefore private.
+template<typename T>
+class SbmvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    clblasUplo uplo;    // triangle selector passed to sbmv()
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Default SBMV configuration: base set-up with USE_AXY, row-major order,
+// upper triangle, Yresult selected as the result buffer.
+template <typename T>
+void SbmvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->uplo         = clblasUpper;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Yresult;
+}
+
+// Delegates operand generation to randomGbmvMatrices() with no transpose:
+// a size x size problem, A with leading dimension size, X and Y with
+// increment 1.
+template <typename T>
+void SbmvMetod<T>::generateData()
+{
+    randomGbmvMatrices(this->order, clblasNoTrans,
+                       this->size, this->size,
+                       &this->alpha, &this->beta,
+                       this->A, this->size,
+                       this->X, 1,
+                       this->Y, 1);
+}
+
+// Calls the sbmv wrapper with the band argument fixed at 1 and returns its
+// status as cl_int.
+template <typename T>
+cl_int SbmvMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        clMath::clblas::sbmv(this->order, this->uplo, this->size, 1,
+                             this->alpha,
+                             this->bufA, 0, this->size,
+                             this->bufX, 0, 1,
+                             this->beta,
+                             this->bufY, 0, 1,
+                             this->qnum, this->queues,
+                             this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//HBMV
+
+// Method object for HBMV: run() calls clMath::clblas::hbmv on the buffers
+// inherited from baseMetod<T>.
+//
+// TYPE and uplo are declared public, matching the other *Metod classes in
+// this file; they used to sit before the first access specifier and were
+// therefore private.
+template<typename T>
+class HbmvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    clblasUplo uplo;    // triangle selector passed to hbmv()
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Default HBMV configuration: base set-up with USE_AXY, row-major order,
+// upper triangle, Yresult selected as the result buffer.
+template <typename T>
+void HbmvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->uplo         = clblasUpper;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Yresult;
+}
+
+// Delegates operand generation to randomGbmvMatrices() with no transpose:
+// a size x size problem, A with leading dimension size, X and Y with
+// increment 1.
+template <typename T>
+void HbmvMetod<T>::generateData()
+{
+    randomGbmvMatrices(this->order, clblasNoTrans,
+                       this->size, this->size,
+                       &this->alpha, &this->beta,
+                       this->A, this->size,
+                       this->X, 1,
+                       this->Y, 1);
+}
+
+// Calls the hbmv wrapper with the band argument fixed at 1 and returns its
+// status as cl_int.
+template <typename T>
+cl_int HbmvMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        clMath::clblas::hbmv(this->order, this->uplo, this->size, 1,
+                             this->alpha,
+                             this->bufA, 0, this->size,
+                             this->bufX, 0, 1,
+                             this->beta,
+                             this->bufY, 0, 1,
+                             this->qnum, this->queues,
+                             this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for TBSV: run() calls clMath::clblas::tbsv on the buffers
+// inherited from baseMetod<T>.
+//
+// TYPE and the configuration fields are declared public, matching the other
+// *Metod classes in this file; they used to sit before the first access
+// specifier and were therefore private.
+template<typename T>
+class TbsvMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;             // element type of this instantiation
+
+    clblasTranspose transA;     // transpose mode passed to tbsv()
+    clblasUplo uplo;            // triangle selector passed to tbsv()
+    clblasDiag diag;            // diagonal kind passed to tbsv()
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+// Default TBSV configuration: base set-up with USE_AX, row-major order,
+// no transpose, upper triangle, non-unit diagonal, Xresult selected as the
+// result buffer.
+template <typename T>
+void TbsvMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AX);
+
+    this->diag         = clblasNonUnit;
+    this->uplo         = clblasUpper;
+    this->transA       = clblasNoTrans;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Xresult;
+}
+
+// Delegates operand generation to randomTbsvMatrices() using the same
+// literals run() passes to tbsv: band argument 1 and leading dimension 2.
+template <typename T>
+void TbsvMetod<T>::generateData()
+{
+    randomTbsvMatrices(this->order, this->uplo, this->diag,
+                       this->size, 1,
+                       this->A, 2,
+                       this->X, 1);
+}
+
+// Maps T to its DataType tag, then calls the tbsv wrapper with band
+// argument 1 and leading dimension 2; the status is returned as cl_int.
+template <typename T>
+cl_int TbsvMetod<T>::run()
+{
+    DataType type;
+
+    // Anything that is not float, double or FloatComplex is treated as
+    // complex double.
+    if (typeid(T) == typeid(float)) {
+        type = TYPE_FLOAT;
+    } else if (typeid(T) == typeid(double)) {
+        type = TYPE_DOUBLE;
+    } else if (typeid(T) == typeid(FloatComplex)) {
+        type = TYPE_COMPLEX_FLOAT;
+    } else {
+        type = TYPE_COMPLEX_DOUBLE;
+    }
+
+    const cl_int status = static_cast<cl_int>(
+        clMath::clblas::tbsv(type, this->order, this->uplo, this->transA, this->diag,
+                             this->size, 1,
+                             this->bufA, 0, 2,
+                             this->bufX, 0, 1,
+                             this->qnum, this->queues,
+                             this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+///////////////////////////////////////////////
+
+
+// Method object for HER2K: run() calls ::clMath::clblas::her2k on the
+// buffers inherited from baseMetod<T>.
+template <typename T>
+class Her2kMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;             // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+
+    clblasUplo uplo;            // triangle selector passed to her2k()
+    clblasTranspose transA;     // transpose mode passed to her2k()
+};
+
+// Default HER2K configuration: base set-up with USE_ABC, row-major order,
+// lower triangle, no transpose, Cresult selected as the result buffer.
+template <typename T>
+void Her2kMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABC);
+
+    this->transA       = clblasNoTrans;
+    this->uplo         = clblasLower;
+    this->order        = clblasRowMajor;
+    this->resultBuffer = Cresult;
+}
+
+// Delegates operand generation to randomGemmMatrices<T>(). The transpose
+// mode given for B is the opposite of transA: conjugate-transpose when
+// transA is clblasNoTrans, no transpose otherwise.
+template <typename T>
+void Her2kMetod<T>::generateData()
+{
+    clblasTranspose transForB;
+    if (this->transA == clblasNoTrans) {
+        transForB = clblasConjTrans;
+    } else {
+        transForB = clblasNoTrans;
+    }
+
+    randomGemmMatrices<T>(this->order, this->transA, transForB,
+                          this->size, this->size, this->size,
+                          false, &(this->alpha),
+                          this->A, this->size,
+                          this->B, this->size,
+                          false, &(this->beta),
+                          this->C, this->size);
+}
+
+// Calls the her2k wrapper with alpha as is and CREAL(beta), and returns its
+// status as cl_int.
+template <typename T>
+cl_int Her2kMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::her2k(this->order, this->uplo, this->transA,
+                                this->size, this->size, this->alpha,
+                                this->bufA, 0, this->size,
+                                this->bufB, 0, this->size,
+                                CREAL(this->beta),
+                                this->bufC, 0, this->size,
+                                this->qnum, this->queues,
+                                this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+
+// Method object for SCAL: run() calls ::clMath::clblas::scal with its first
+// argument set to false.
+template <typename T>
+class ScalMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+// Default SCAL configuration: base set-up with USE_X and Xresult selected
+// as the result buffer.
+template <typename T>
+void ScalMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_X);
+
+    this->resultBuffer = Xresult;
+}
+
+// Delegates generation of X (increment 1) to randomVectors().
+template <typename T>
+void ScalMetod<T>::generateData()
+{
+    randomVectors(this->size,
+                  this->X, 1);
+}
+
+// Calls the scal wrapper (first argument false) on bufX and returns its
+// status as cl_int.
+template <typename T>
+cl_int ScalMetod<T>::run()
+{
+    const cl_int status = static_cast<cl_int>(
+        ::clMath::clblas::scal(false, this->size, this->alpha,
+                               this->bufX, 0, 1,
+                               this->qnum, this->queues,
+                               this->inEventCount, this->inEvent, this->outEvent));
+    return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Sscal is for handling the 2 extra cases csscal and zdscal
+// Method object for the csscal/zdscal variants: run() calls
+// ::clMath::clblas::scal with its first argument set to true.
+template <typename T>
+class SscalMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;     // element type of this instantiation
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+template <typename T> void
+SscalMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_X);
+    this->resultBuffer = Xresult;
+
+}
+
+template <typename T> void
+SscalMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1);
+}
+
+template <typename T> cl_int
+SscalMetod<T>::run()
+{
+
+    return (cl_int)::clMath::clblas::scal(true, this->size, this->alpha, this->bufX, 0,
+                        1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Functional-test fixture that issues one swap() call on vectors X and Y.
+ */
+template<typename T>
+class SwapMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_XY buffer set and select X as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+SwapMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_XY);
+
+    // One result buffer is enough: this is not a correctness test.
+    // Proper correctness testing happens in test-correctness.
+    this->resultBuffer = Xresult;
+}
+
+/**
+ * Fill X and Y with random values, both with an increment of 1.
+ */
+template <typename T> void
+SwapMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1);
+}
+
+/**
+ * Map T to its DataType tag and enqueue the swap() call.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+SwapMetod<T>::run()
+{
+    // Any T other than float, double or FloatComplex maps to complex double.
+    const DataType type =
+        (typeid(T) == typeid(float))        ? TYPE_FLOAT :
+        (typeid(T) == typeid(double))       ? TYPE_DOUBLE :
+        (typeid(T) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
+                                              TYPE_COMPLEX_DOUBLE;
+
+    const cl_int status = (cl_int)::clMath::clblas::swap(
+        type, this->size,
+        this->bufX, 0, 1,
+        this->bufY, 0, 1,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Functional-test fixture that issues one dot() call on vectors X and Y.
+ */
+template<typename T>
+class DotMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_ABXY buffer set and select Y as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+DotMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABXY);
+
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values (increment 1); the trailing flag passed
+ * to randomVectors() is true.
+ */
+template <typename T> void
+DotMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1, true);
+}
+
+/**
+ * Map T to its DataType tag and enqueue the dot() call. bufA (offset 0) is
+ * passed ahead of X and Y, and bufB after them.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+DotMetod<T>::run()
+{
+    // Any T other than float, double or FloatComplex maps to complex double.
+    const DataType type =
+        (typeid(T) == typeid(float))        ? TYPE_FLOAT :
+        (typeid(T) == typeid(double))       ? TYPE_DOUBLE :
+        (typeid(T) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
+                                              TYPE_COMPLEX_DOUBLE;
+
+    const cl_int status = (cl_int)clMath::clblas::dot(
+        type, this->size,
+        this->bufA, 0,
+        this->bufX, 0, 1,
+        this->bufY, 0, 1,
+        this->bufB,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//DOTC
+/**
+ * Functional-test fixture that issues one dotc() call on vectors X and Y.
+ */
+template<typename T>
+class DotcMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_ABXY buffer set and select Y as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+DotcMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_ABXY);
+
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values (increment 1); the trailing flag passed
+ * to randomVectors() is true.
+ */
+template <typename T> void
+DotcMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1, true);
+}
+
+/**
+ * Map T to its DataType tag and enqueue the dotc() call. bufA (offset 0) is
+ * passed ahead of X and Y, and bufB after them.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+DotcMetod<T>::run()
+{
+    // Any T other than float, double or FloatComplex maps to complex double.
+    const DataType type =
+        (typeid(T) == typeid(float))        ? TYPE_FLOAT :
+        (typeid(T) == typeid(double))       ? TYPE_DOUBLE :
+        (typeid(T) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
+                                              TYPE_COMPLEX_DOUBLE;
+
+    const cl_int status = (cl_int)clMath::clblas::dotc(
+        type, this->size,
+        this->bufA, 0,
+        this->bufX, 0, 1,
+        this->bufY, 0, 1,
+        this->bufB,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+////////////////////////////////////////////////////////////////////////////////////////////
+//COPY
+
+/**
+ * Functional-test fixture that issues one copy() call from X to Y.
+ */
+template<typename T>
+class CopyMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_XY buffer set and select Y as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+CopyMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_XY);
+
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values, both with an increment of 1.
+ */
+template <typename T> void
+CopyMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1);
+}
+
+/**
+ * Map T to its DataType tag and enqueue the copy() call.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+CopyMetod<T>::run()
+{
+    // Any T other than float, double or FloatComplex maps to complex double.
+    const DataType type =
+        (typeid(T) == typeid(float))        ? TYPE_FLOAT :
+        (typeid(T) == typeid(double))       ? TYPE_DOUBLE :
+        (typeid(T) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
+                                              TYPE_COMPLEX_DOUBLE;
+
+    const cl_int status = (cl_int)::clMath::clblas::copy(
+        type, this->size,
+        this->bufX, 0, 1,
+        this->bufY, 0, 1,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/**
+ * Functional-test fixture that issues one axpy() call on vectors X and Y.
+ */
+template<typename T>
+class AxpyMetod : public baseMetod<T>
+{
+public:
+    typedef T TYPE;
+
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_XY buffer set and select Y as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+AxpyMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_XY);
+
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values, both with an increment of 1.
+ */
+template <typename T> void
+AxpyMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1);
+}
+
+/**
+ * Enqueue the axpy() call with the fixture's alpha.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+AxpyMetod<T>::run()
+{
+    const cl_int status = (cl_int)::clMath::clblas::axpy(
+        this->size, this->alpha,
+        this->bufX, 0, 1,
+        this->bufY, 0, 1,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Functional-test fixture that issues one rotg() call.
+ */
+template<typename T>
+class RotgMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+/**
+ * Initialise the base fixture with a fixed size of 1 and select Y as the
+ * result buffer.
+ *
+ * @param s  ignored; the size is always 1
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+RotgMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    // USE_ABXY is actually used to create 2 2-D arrays and 2 vectors, but
+    // here we use it to create the required 4 vectors, so more memory than
+    // required is allocated. As this is a functionality test, that does not
+    // affect the purpose of the tests.
+    // Here X=SA, Y=SB, A=C and B=S, where the right-hand sides are the
+    // standard netlib variable names.
+    (void)s;    // intentionally unused; a self-assignment would itself warn
+
+    baseMetod<T>::initDefault(1, q, USE_ABXY);
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values, both with an increment of 1.
+ */
+template <typename T> void
+RotgMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1);
+}
+
+/**
+ * Map T to its DataType tag and enqueue the rotg() call on X, Y, A and B,
+ * each at offset 0.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+RotgMetod<T>::run()
+{
+    // Any T other than float, double or FloatComplex maps to complex double.
+    const DataType type =
+        (typeid(T) == typeid(float))        ? TYPE_FLOAT :
+        (typeid(T) == typeid(double))       ? TYPE_DOUBLE :
+        (typeid(T) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
+                                              TYPE_COMPLEX_DOUBLE;
+
+    return (cl_int)clMath::clblas::rotg(
+        type,
+        this->bufX, 0, this->bufY, 0,
+        this->bufA, 0, this->bufB, 0,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Functional-test fixture that issues one rotm() call on vectors X and Y,
+ * with A holding PARAM.
+ */
+template<typename T>
+class RotmMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_AXY buffer set and select Y as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+RotmMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    // USE_AXY allocates space for one 2-D array A and two vectors, X and Y.
+    // A corresponds to PARAM; more memory than PARAM needs is allocated so
+    // that the existing code can be reused.
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values, randomise the four elements of A after
+ * the first, and set the first element of A to 0.
+ */
+template <typename T> void
+RotmMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1);
+    randomVectors(4, this->A + 1, 1);
+
+    // Only 4 inputs are valid here, which are tested in the correctness and
+    // performance tests.
+    this->A[0] = 0;
+}
+
+/**
+ * Enqueue the rotm() call; T is tagged as float or, for anything else,
+ * double.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+RotmMetod<T>::run()
+{
+    const DataType type =
+        (typeid(T) == typeid(float)) ? TYPE_FLOAT : TYPE_DOUBLE;
+
+    const cl_int status = (cl_int)clMath::clblas::rotm(
+        type, this->size,
+        this->bufX, 0, 1,
+        this->bufY, 0, 1,
+        this->bufA, 0,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Functional-test fixture that issues one rotmg() call.
+ */
+template<typename T>
+class RotmgMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+public:
+    void initDefault(size_t s, unsigned int q);
+    cl_int run();
+    void generateData();
+};
+
+/**
+ * Initialise the base fixture with a fixed size of 5 and the USE_ABCXY
+ * buffer set, and select C as the result buffer.
+ *
+ * @param s  ignored; the size is always 5
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+RotmgMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    (void)s;    // intentionally unused; a self-assignment would itself warn
+
+    baseMetod<T>::initDefault(5, q, USE_ABCXY);
+    this->resultBuffer = Cresult;
+}
+
+/**
+ * Generate the rotmg inputs with randomRotmg(), then set the first element
+ * of C to 0.
+ */
+template <typename T> void
+RotmgMetod<T>::generateData()
+{
+    randomRotmg(this->A, this->B, this->X, this->Y, this->C);
+
+    // Only 4 inputs are valid here, which are tested in the correctness and
+    // performance tests.
+    *(this->C) = 0;
+}
+
+/**
+ * Enqueue the rotmg() call on A, B, X, Y and C, each at offset 0; T is
+ * tagged as float or, for anything else, double.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+RotmgMetod<T>::run()
+{
+    const DataType type =
+        (typeid(T) == typeid(float)) ? TYPE_FLOAT : TYPE_DOUBLE;
+
+    return (cl_int)clMath::clblas::rotmg(
+        type,
+        this->bufA, 0, this->bufB, 0,
+        this->bufX, 0, this->bufY, 0,
+        this->bufC, 0,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+}
+
+/////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Functional-test fixture that issues one rot() call on vectors X and Y.
+ */
+template<typename T>
+class RotMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_XY buffer set and select Y as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+RotMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_XY);
+
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values, both with an increment of 1.
+ */
+template <typename T> void
+RotMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1);
+}
+
+/**
+ * Enqueue the rot() call, passing the fixture's alpha and beta after the
+ * two vectors. No DataType tag is computed: rot() is called without one.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+RotMetod<T>::run()
+{
+    const cl_int status = (cl_int)clMath::clblas::rot(
+        this->size,
+        this->bufX, 0, 1,
+        this->bufY, 0, 1,
+        this->alpha, this->beta,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Functional-test fixture that issues one nrm2() call on vector X.
+ */
+template<typename T>
+class Nrm2Metod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_AXY buffer set and select Y as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+Nrm2Metod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values (increment 1); the trailing flag passed
+ * to randomVectors() is true.
+ */
+template <typename T> void
+Nrm2Metod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1, true);
+}
+
+/**
+ * Map T to its DataType tag and enqueue the nrm2() call. bufY (offset 0) is
+ * passed ahead of X, and bufA after it.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+Nrm2Metod<T>::run()
+{
+    // Any T other than float, double or FloatComplex maps to complex double.
+    const DataType type =
+        (typeid(T) == typeid(float))        ? TYPE_FLOAT :
+        (typeid(T) == typeid(double))       ? TYPE_DOUBLE :
+        (typeid(T) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
+                                              TYPE_COMPLEX_DOUBLE;
+
+    const cl_int status = (cl_int)clMath::clblas::nrm2(
+        type, this->size,
+        this->bufY, 0,
+        this->bufX, 0, 1,
+        this->bufA,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+
+///////////////////////////////////////////////////////
+
+//ASUM
+
+/**
+ * Functional-test fixture that issues one asum() call on vector X.
+ */
+template<typename T>
+class AsumMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_AXY buffer set and select X as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+AsumMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->resultBuffer = Xresult;
+}
+
+/**
+ * Fill only X with random values (increment 1); a null pointer with
+ * increment 0 is passed in place of a second vector, and the trailing flag
+ * is true.
+ */
+template <typename T> void
+AsumMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, (T*)NULL, 0, true);
+}
+
+/**
+ * Map T to its DataType tag and enqueue the asum() call. bufA (offset 0) is
+ * passed ahead of X, and bufY after it.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+AsumMetod<T>::run()
+{
+    // Any T other than float, double or FloatComplex maps to complex double.
+    const DataType type =
+        (typeid(T) == typeid(float))        ? TYPE_FLOAT :
+        (typeid(T) == typeid(double))       ? TYPE_DOUBLE :
+        (typeid(T) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
+                                              TYPE_COMPLEX_DOUBLE;
+
+    const cl_int status = (cl_int)clMath::clblas::asum(
+        type, this->size,
+        this->bufA, 0,
+        this->bufX, 0, 1,
+        this->bufY,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+
+//////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////
+
+//iAMAX
+
+/**
+ * Functional-test fixture that issues one iamax() call on vector X.
+ */
+template<typename T>
+class iAmaxMetod : public baseMetod<T>
+{
+    typedef T TYPE;
+
+public:
+    void initDefault(size_t s, unsigned int q);
+    void generateData();
+    cl_int run();
+};
+
+/**
+ * Initialise the base fixture with the USE_AXY buffer set and select Y as
+ * the result buffer.
+ *
+ * @param s  size, forwarded unchanged to baseMetod<T>::initDefault
+ * @param q  forwarded unchanged to baseMetod<T>::initDefault
+ */
+template <typename T> void
+iAmaxMetod<T>::initDefault(size_t s, unsigned int q)
+{
+    baseMetod<T>::initDefault(s, q, USE_AXY);
+
+    this->resultBuffer = Yresult;
+}
+
+/**
+ * Fill X and Y with random values, both with an increment of 1.
+ */
+template <typename T> void
+iAmaxMetod<T>::generateData()
+{
+    randomVectors(this->size, this->X, 1, this->Y, 1);
+}
+
+/**
+ * Map T to its DataType tag and enqueue the iamax() call. bufY (offset 0)
+ * is passed ahead of X, and bufA after it.
+ *
+ * @return the wrapper's status cast to cl_int.
+ */
+template <typename T> cl_int
+iAmaxMetod<T>::run()
+{
+    // Any T other than float, double or FloatComplex maps to complex double.
+    const DataType type =
+        (typeid(T) == typeid(float))        ? TYPE_FLOAT :
+        (typeid(T) == typeid(double))       ? TYPE_DOUBLE :
+        (typeid(T) == typeid(FloatComplex)) ? TYPE_COMPLEX_FLOAT :
+                                              TYPE_COMPLEX_DOUBLE;
+
+    const cl_int status = (cl_int)clMath::clblas::iamax(
+        type, this->size,
+        this->bufY, 0,
+        this->bufX, 0, 1,
+        this->bufA,
+        this->qnum, this->queues,
+        this->inEventCount, this->inEvent, this->outEvent);
+
+    return status;
+}
+
+//////////////////////////////////////////////////////////////////////
+
+/*
+ * When BlasBase reports that the device does not support double precision,
+ * print a notice to stderr, record SUCCEED() and return from the enclosing
+ * function (which must therefore return void). Otherwise do nothing.
+ * Expands to a plain brace block.
+ */
+#define CHECK_DOUBLE \
+{ \
+    if (!clMath::BlasBase::getInstance()->isDevSupportDoublePrecision()) {\
+        ::std::cerr << ">> Double precision is not supported"\
+            << ::std::endl \
+            << ">> Test skipped." << ::std::endl;\
+        SUCCEED();\
+        return;\
+    }\
+}
+
+#endif  // FUNC_H_
diff --git a/src/tests/functional/test-functional.cpp b/src/tests/functional/test-functional.cpp
new file mode 100644
index 0000000..c147b77
--- /dev/null
+++ b/src/tests/functional/test-functional.cpp
@@ -0,0 +1,111 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <gtest/gtest.h>
+#include "BlasBase.h"
+
+
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Entry point of the functional test suite.
+ *
+ * Handles "--test-help", initialises Google Test and the BlasBase singleton,
+ * applies any command-line overrides to the singleton, applies the
+ * environment settings and then runs all tests.
+ *
+ * @return the RUN_ALL_TESTS() result; 0 after printing help; 1 when the
+ *         command line cannot be parsed; -1 when initialisation fails.
+ */
+int
+main(int argc, char *argv[])
+{
+    ::clMath::BlasBase *base;
+    TestParams params;
+    int ret;
+
+    if ((argc > 1) && !strcmp(argv[1], "--test-help")) {
+        printUsage("test-functional");
+        return 0;
+    }
+
+    ::testing::InitGoogleTest(&argc, argv);
+    ::std::cerr << "Initialize OpenCL and clblas..." << ::std::endl;
+    base = ::clMath::BlasBase::getInstance();
+    if (base == NULL) {
+        ::std::cerr << "Fatal error, OpenCL or clblas initialization failed! "
+                       "Leaving the test." << ::std::endl;
+        return -1;
+    }
+
+    // Assign the defaults unconditionally: optFlags and devType are read
+    // after parseEnv() below even when no arguments were given, so they must
+    // not be left uninitialised on the argc == 1 path.
+    params.optFlags = NO_FLAGS;
+    params.devType = CL_DEVICE_TYPE_GPU;
+    params.devName = NULL;
+
+    if (argc != 1) {
+        if (parseBlasCmdLineArgs(argc, argv, &params) != 0) {
+            printUsage(argv[0]);
+            return 1;
+        }
+        // Copy every option that was explicitly set into the singleton.
+        if (params.optFlags & SET_SEED) {
+            base->setSeed(params.seed);
+        }
+        if (params.optFlags & SET_ALPHA) {
+            base->setAlpha(params.alpha);
+        }
+        if (params.optFlags & SET_BETA) {
+            base->setBeta(params.beta);
+        }
+        if (params.optFlags & SET_M) {
+            base->setM(params.M);
+        }
+        if (params.optFlags & SET_N) {
+            base->setN(params.N);
+        }
+        if (params.optFlags & SET_K) {
+            base->setK(params.K);
+        }
+        if (params.optFlags & SET_INCX) {
+            base->setIncX(params.incx);
+        }
+        if (params.optFlags & SET_INCY) {
+            base->setIncY(params.incy);
+        }
+        if (params.optFlags & SET_DEVICE_TYPE) {
+            if (!base->setDeviceType(&params.devType, params.devName)) {
+                ::std::cerr << "Fatal error, OpenCL or clblas "
+                        "initialization failed! Leaving the test." <<
+                        ::std::endl;
+                return -1;
+            }
+        }
+        if (params.optFlags & SET_NUM_COMMAND_QUEUES) {
+            base->setNumCommandQueues(params.numCommandQueues);
+        }
+    }
+
+    parseEnv(&params);
+    if ((params.optFlags & SET_USE_IMAGES) &&
+            (params.devType != CL_DEVICE_TYPE_CPU)) {
+        base->setUseImages(params.useImages);
+    }
+
+    // Scratch images are no longer added here (base->addScratchImages()):
+    // use of image based buffers is deprecated.
+
+    ret = RUN_ALL_TESTS();
+
+    if (base->useImages()) {
+        base->removeScratchImages();
+    }
+
+    return ret;
+}
diff --git a/src/tests/include/BlasBase.h b/src/tests/include/BlasBase.h
new file mode 100644
index 0000000..1901afa
--- /dev/null
+++ b/src/tests/include/BlasBase.h
@@ -0,0 +1,225 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef BLASBASE_H_
+#define BLASBASE_H_
+
+#include <clBLAS.h>
+#include <common.h>
+
+#if _MSC_VER
+#pragma warning (disable:4127)
+#endif
+
+/*
+ * If `err` equals CL_INVALID_DEVICE and BlasBase reports that the device
+ * does not support double precision, print a notice naming `funcName`,
+ * record SUCCEED() and return from the enclosing function (which must
+ * therefore return void). Otherwise do nothing.
+ *
+ * Use as a single statement: CHECK_DP_ERROR_AND_RETURN(err, "name");
+ * The last line deliberately carries no continuation backslash, so the macro
+ * ends at `while (0)` and cannot absorb the following source line.
+ */
+#define CHECK_DP_ERROR_AND_RETURN(err, funcName)                            \
+do {                                                                        \
+    if ((err) == CL_INVALID_DEVICE &&                                       \
+        !clMath::BlasBase::getInstance()->isDevSupportDoublePrecision()) {  \
+        ::std::cerr << ::std::endl << ">> " << (funcName) <<                \
+                "() reported that this device doesn't support double "      \
+                "precision floating point arithmetic. Test is skipped" <<   \
+        ::std::endl;                                                        \
+        SUCCEED();                                                          \
+                                                                            \
+        return;                                                             \
+    }                                                                       \
+} while (0)
+
+#define DEFAULT_SEED 12345
+#define MAX_COMMAND_QUEUES 10
+
+namespace clMath {
+
+/**
+ * Singleton that holds the OpenCL objects and the explicitly set test
+ * parameters shared by the tests. Obtain it through getInstance(); the
+ * constructor, destructor and copy operations are private.
+ */
+class BlasBase {
+private:
+    cl_platform_id platform_;
+    cl_device_id primaryDevice_;        // used in all cases
+    /*
+     * Used only in cases with multiple queues, to cover problem
+     * distribution among different devices, not only different queues
+     * belonging to the same device.
+     */
+    cl_device_id additionalDevice_;
+    cl_context context_;
+    cl_command_queue commandQueues_[MAX_COMMAND_QUEUES];
+
+    // Every set*() below that has a matching use*_ flag raises it.
+    bool useNumCommandQueues_;
+    cl_uint numCommandQueues_;
+
+    bool useAlpha_;
+    bool useBeta_;
+    ComplexLong alpha_;
+    ComplexLong beta_;
+
+    bool useSeed_;
+    unsigned int seed_;
+
+    bool useM_, useN_, useK_;
+    size_t M_, N_, K_;
+
+    bool useIncX_, useIncY_;
+    int incX_, incY_;
+
+    bool useImages_;
+    cl_device_type devType_;
+    const char* devName_;
+    cl_ulong imageA_;
+    cl_ulong imageB_;
+
+    BlasBase();
+    ~BlasBase();
+    BlasBase(const BlasBase &);             // intentionally undefined
+    BlasBase & operator=(const BlasBase &); // intentionally undefined
+
+    void SetUp();
+    void TearDown();
+    bool initialized();
+
+    cl_int getPlatforms(cl_platform_id** platforms, cl_int *error);
+    cl_device_id getDevice(cl_device_type type, const char* name,
+        cl_int *error);
+    void printDevInfoStr(cl_device_info param, const char *paramName,
+                         int primAdd);
+
+public:
+    static BlasBase* getInstance();
+
+    cl_context context()
+    {
+        return context_;
+    }
+
+    // Hands out a mutable pointer to the queue array from a const method.
+    cl_command_queue* commandQueues() const
+    {
+        return const_cast<cl_command_queue*>(commandQueues_);
+    }
+
+    bool useNumCommandQueues() const
+    {
+        return useNumCommandQueues_;
+    }
+    cl_uint numCommandQueues() const
+    {
+        return numCommandQueues_;
+    }
+    // Requests above MAX_COMMAND_QUEUES are ignored.
+    void setNumCommandQueues(cl_uint numCommandQueues)
+    {
+        if (numCommandQueues <= MAX_COMMAND_QUEUES) {
+            numCommandQueues_ = numCommandQueues;
+            useNumCommandQueues_ = true;
+        }
+    }
+
+    bool useAlpha() const
+    {
+        return useAlpha_;
+    }
+    ComplexLong alpha() const
+    {
+        return alpha_;
+    }
+    void setAlpha(ComplexLong alpha)
+    {
+        alpha_ = alpha;
+        useAlpha_ = true;
+    }
+
+    bool useBeta() const
+    {
+        return useBeta_;
+    }
+    ComplexLong beta() const
+    {
+        return beta_;
+    }
+    void setBeta(ComplexLong beta)
+    {
+        beta_ = beta;
+        useBeta_ = true;
+    }
+
+    bool useSeed() const
+    {
+        return useSeed_;
+    }
+    unsigned int seed() const
+    {
+        return seed_;
+    }
+    void setSeed(unsigned int seed)
+    {
+        seed_ = seed;
+        useSeed_ = true;
+    }
+
+    bool useM() const
+    {
+        return useM_;
+    }
+    size_t M() const
+    {
+        return M_;
+    }
+    void setM(size_t M)
+    {
+        M_ = M;
+        useM_ = true;
+    }
+
+    bool useN() const
+    {
+        return useN_;
+    }
+    size_t N() const
+    {
+        return N_;
+    }
+    void setN(size_t N)
+    {
+        N_ = N;
+        useN_ = true;
+    }
+
+    bool useK() const
+    {
+        return useK_;
+    }
+    size_t K() const
+    {
+        return K_;
+    }
+    void setK(size_t K)
+    {
+        K_ = K;
+        useK_ = true;
+    }
+
+    bool useIncX() const
+    {
+        return useIncX_;
+    }
+    int incX() const
+    {
+        return incX_;
+    }
+    void setIncX(int incX)
+    {
+        incX_ = incX;
+        useIncX_ = true;
+    }
+
+    bool useIncY() const
+    {
+        return useIncY_;
+    }
+    int incY() const
+    {
+        return incY_;
+    }
+    void setIncY(int incY)
+    {
+        incY_ = incY;
+        useIncY_ = true;
+    }
+
+    bool useImages() const
+    {
+        return useImages_;
+    }
+    void setUseImages(bool value)
+    {
+        useImages_ = value;
+    }
+    // Integer overload: any non-zero value enables images.
+    void setUseImages(int value)
+    {
+        useImages_ = (value != 0);
+    }
+
+    bool setDeviceType(cl_device_type* devType, const char* devName);
+    cl_mem createEnqueueBuffer(const void *data, size_t matrSize, size_t off,
+                               cl_mem_flags mode);
+    cl_mem readBuffer(void *ptr, size_t off, size_t size);
+
+    clblasStatus addScratchImages(void);
+    void removeScratchImages(void);
+    size_t scratchImageWidth(void);
+    size_t scratchImageHeight(void);
+
+    cl_ulong maxMemAllocSize(void);
+    cl_ulong availGlobalMemSize(int primAdd);
+
+    bool isDevSupportDoublePrecision(void);
+    // print information on environment the test run in
+    void printEnvInfo(void);
+
+    // Public wrapper around the private TearDown().
+    void release(void)
+    {
+        TearDown();
+    }
+};
+
+}   // namespace
+
+#endif  // BLASBASE_H_
diff --git a/src/tests/include/ExtraTestSizes.h b/src/tests/include/ExtraTestSizes.h
new file mode 100644
index 0000000..c96d9a0
--- /dev/null
+++ b/src/tests/include/ExtraTestSizes.h
@@ -0,0 +1,270 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef EXTRATESTSIZES_H_
+#define EXTRATESTSIZES_H_
+
+#include <common.h>
+
+//#define AMD_ETS_CONTAINER(ar1, ar2, ar3, ar4, ar5, ar6)
+
+namespace clMath {
+
+union BlasStride {  /* stride of one operand; both members share storage */
+    size_t ld;      /* leading dimension, for a matrix operand */
+    int inc;        /* increment between elements, for a vector operand */
+};
+
+/*
+ * Common convention:
+ * If a field is zero at test specialization, it is assumed to be undefined.
+ * In this case a test itself is responsible for assigning some value to it.
+ */
+struct ExtraTestSizes
+{
+    /* Every stride and offset is zero. */
+    ExtraTestSizes() : offA(0), offBX(0), offCY(0)
+    {
+        strideA.ld = strideBX.ld = strideCY.ld = 0;
+    }
+
+    /*
+     * Matrix A with two vectors: A takes a leading dimension, BX and CY take
+     * element increments.
+     */
+    ExtraTestSizes(
+        size_t lda,
+        int incx,
+        int incy,
+        size_t offA,
+        size_t offBX,
+        size_t offCY) : offA(offA), offBX(offBX), offCY(offCY)
+    {
+        strideA.ld = lda;
+
+        // ld is zeroed before the narrower inc member is stored
+        // (presumably to clear the bytes that inc does not cover).
+        strideBX.ld = 0;
+        strideBX.inc = incx;
+        strideCY.ld = 0;
+        strideCY.inc = incy;
+    }
+
+    /* Three matrices: every stride is a leading dimension. */
+    ExtraTestSizes(
+        size_t lda,
+        size_t ldb,
+        size_t ldc,
+        size_t offA,
+        size_t offBX,
+        size_t offCY) : offA(offA), offBX(offBX), offCY(offCY)
+    {
+        strideA.ld = lda;
+        strideBX.ld = ldb;
+        strideCY.ld = ldc;
+    }
+
+    BlasStride strideA;
+    BlasStride strideBX;
+    BlasStride strideCY;
+    size_t offA;
+    size_t offBX;
+    size_t offCY;
+};
+
+/**
+ * Forward iterator over every combination of one value from each of six
+ * arrays, exposing each combination as an ExtraTestSizes object.
+ *
+ * The sixth array varies fastest and the first slowest. Once the first
+ * cursor reaches the end of the first array the iterator no longer advances.
+ */
+template<typename T2, typename T3> class IteratorETS
+{
+public:
+    typedef ExtraTestSizes value_type;
+    typedef std::forward_iterator_tag iterator_category;
+    typedef int difference_type;
+    typedef ExtraTestSizes* pointer;
+    typedef ExtraTestSizes& reference;
+
+    /**
+     * Each beginK/endK pair delimits the K-th array as a half-open range.
+     *
+     * @param startEnd  non-zero places the first cursor at end1 (the end
+     *                  iterator); zero places it at begin1
+     */
+    IteratorETS(
+        const size_t *begin1,
+        const size_t *end1,
+        const T2 *begin2,
+        const T2 *end2,
+        const T3 *begin3,
+        const T3 *end3,
+        const size_t *begin4,
+        const size_t *end4,
+        const size_t *begin5,
+        const size_t *end5,
+        const size_t *begin6,
+        const size_t *end6,
+        int startEnd)
+        : begin1_(begin1), cur1_(startEnd ? end1 : begin1), end1_(end1),
+          begin2_(begin2), cur2_(begin2), end2_(end2),
+          begin3_(begin3), cur3_(begin3), end3_(end3),
+          begin4_(begin4), cur4_(begin4), end4_(end4),
+          begin5_(begin5), cur5_(begin5), end5_(end5),
+          begin6_(begin6), cur6_(begin6), end6_(end6)
+    { }
+
+    /** Advance to the next combination; a no-op once at the end. */
+    IteratorETS& operator++()
+    {
+        // don't go beyond the end
+        if (cur1_ == end1_) {
+            return *this;
+        }
+
+        // Each cursor is stepped only when the one after it wrapped around;
+        // '&&' short-circuits as soon as a cursor advances without wrapping.
+        const bool carryIntoFirst =
+            step(cur6_, begin6_, end6_) &&
+            step(cur5_, begin5_, end5_) &&
+            step(cur4_, begin4_, end4_) &&
+            step(cur3_, begin3_, end3_) &&
+            step(cur2_, begin2_, end2_);
+
+        if (carryIntoFirst) {
+            cur1_++;
+        }
+
+        return *this;
+    }
+
+    /** Two iterators are equal when all six cursors coincide. */
+    bool operator==(const IteratorETS& rhs) const
+    {
+        return !(cur1_ != rhs.cur1_ ||
+                 cur2_ != rhs.cur2_ ||
+                 cur3_ != rhs.cur3_ ||
+                 cur4_ != rhs.cur4_ ||
+                 cur5_ != rhs.cur5_ ||
+                 cur6_ != rhs.cur6_);
+    }
+
+    bool operator!=(const IteratorETS& rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    /**
+     * Rebuild the cached ExtraTestSizes from the six current values and
+     * return a reference to it.
+     */
+    ExtraTestSizes& operator*()
+    {
+        inst_ = ExtraTestSizes(*cur1_, *cur2_, *cur3_, *cur4_, *cur5_, *cur6_);
+
+        return inst_;
+    }
+
+private:
+    /**
+     * Move one cursor forward, wrapping to `begin` when the next position
+     * is `end`. Returns true when it wrapped.
+     */
+    template<typename E>
+    static bool step(const E *&cur, const E *begin, const E *end)
+    {
+        const bool wrapped = (cur + 1 == end);
+
+        cur = wrapped ? begin : (cur + 1);
+        return wrapped;
+    }
+
+    ExtraTestSizes inst_;
+
+    const size_t *begin1_;
+    const size_t *cur1_;
+    const size_t *end1_;
+
+    const T2 *begin2_;
+    const T2 *cur2_;
+    const T2 *end2_;
+
+    const T3 *begin3_;
+    const T3 *cur3_;
+    const T3 *end3_;
+
+    const size_t *begin4_;
+    const size_t *cur4_;
+    const size_t *end4_;
+
+    const size_t *begin5_;
+    const size_t *cur5_;
+    const size_t *end5_;
+
+    const size_t *begin6_;
+    const size_t *cur6_;
+    const size_t *end6_;
+};
+
+/*
+ * Extra test sizes container.
+ *
+ * Keeps pointers to six caller-owned arrays (their lengths are the template
+ * arguments N1..N6) and hands out IteratorETS objects over them through
+ * begin() and end(). The arrays themselves are not copied.
+ */
+template<size_t N1, typename T2, size_t N2, typename T3, size_t N3,
+         size_t N4, size_t N5, size_t N6>
+class ContainerETS
+{
+public:
+    typedef ExtraTestSizes value_type;
+
+    ContainerETS(
+        const size_t (&array1)[N1],
+        const T2 (&array2)[N2],
+        const T3 (&array3)[N3],
+        const size_t (&array4)[N4],
+        const size_t (&array5)[N5],
+        const size_t (&array6)[N6]) : ar1_(array1), ar2_(array2), ar3_(array3),
+                                      ar4_(array4), ar5_(array5), ar6_(array6)
+    { }
+
+    /* Iterator positioned at the start of the first array. */
+    IteratorETS<T2, T3> begin() const
+    {
+        return iteratorAt(0);
+    }
+
+    /* Iterator positioned at the end of the first array. */
+    IteratorETS<T2, T3> end() const
+    {
+        return iteratorAt(1);
+    }
+
+private:
+    /* Build an iterator over all six arrays; startEnd goes to IteratorETS. */
+    IteratorETS<T2, T3> iteratorAt(int startEnd) const
+    {
+        return IteratorETS<T2, T3>(ar1_, ar1_ + N1, ar2_, ar2_ + N2,
+                                   ar3_, ar3_ + N3, ar4_, ar4_ + N4,
+                                   ar5_, ar5_ + N5, ar6_, ar6_ + N6,
+                                   startEnd);
+    }
+
+    const size_t *ar1_;
+    const T2 *ar2_;
+    const T3 *ar3_;
+    const size_t *ar4_;
+    const size_t *ar5_;
+    const size_t *ar6_;
+};
+
+/*
+ * Build a ContainerETS over six arrays, deducing the element types and
+ * array lengths from the arguments.
+ */
+template<size_t N1, typename T2, size_t N2, typename T3, size_t N3,
+         size_t N4, size_t N5, size_t N6>
+ContainerETS<N1, T2, N2, T3, N3, N4, N5, N6>
+makeContainerETS(
+    const size_t (&array1)[N1],
+    const T2 (&array2)[N2],
+    const T3 (&array3)[N3],
+    const size_t (&array4)[N4],
+    const size_t (&array5)[N5],
+    const size_t (&array6)[N6])
+{
+    typedef ContainerETS<N1, T2, N2, T3, N3, N4, N5, N6> Container;
+
+    return Container(array1, array2, array3, array4, array5, array6);
+}
+
+}       /* namespace clMath */
+
+#endif /* EXTRATESTSIZES_H_ */
diff --git a/src/tests/include/asum.h b/src/tests/include/asum.h
new file mode 100644
index 0000000..0c3f508
--- /dev/null
+++ b/src/tests/include/asum.h
@@ -0,0 +1,81 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+// gtest fixture for ASUM tests, parameterized by the 5-element tuple of ints below.
+class ASUM : public TestWithParam<
+    ::std::tr1::tuple<
+    int,                // N
+    int,                // incx, should be greater than 0
+	int,				// offx (getParams() stores it in TestParams::offBX)
+	int,				// offAsum (getParams() stores it in TestParams::offa)
+    int                 // numCommandQueues
+        > > {
+public:
+    void getParams(TestParams *params)   // copies the fixture's values into *params
+    {
+        params->N = N;
+        params->incx = incx;
+		params->offBX = offx;
+		params->offa = offAsum;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()
+    {
+        // Unpack the parameter tuple; BlasBase may then override N and the queue count.
+
+        N = ::std::tr1::get<0>(GetParam());
+        incx = ::std::tr1::get<1>(GetParam());
+		offx = ::std::tr1::get<2>(GetParam());
+		offAsum = ::std::tr1::get<3>(GetParam());
+        numCommandQueues = ::std::tr1::get<4>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        // When BlasBase reports useNumCommandQueues(), its count replaces the tuple's.
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        // Likewise for N when base->useN() is true.
+        if (base->useN()) {
+            N = base->N();
+        }
+        // Report the effective parameters (offAsum and queues go to std::cerr).
+		printTestParams(N, offx, incx);
+        ::std::cerr << "offAsum = " << offAsum << ::std::endl;
+		::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N;                   // tuple element is int; converted on assignment in SetUp()
+    int incx;
+    size_t offx, offAsum;       // tuple elements are int; converted on assignment in SetUp()
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX;    // not assigned anywhere in this class
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+
diff --git a/src/tests/include/axpy.h b/src/tests/include/axpy.h
new file mode 100644
index 0000000..c230177
--- /dev/null
+++ b/src/tests/include/axpy.h
@@ -0,0 +1,94 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef AXPY__H_
+#define AXPY__H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+// gtest fixture for AXPY tests, parameterized by the 7-element tuple below.
+class AXPY : public TestWithParam<
+    // NOTE(review): an earlier comment here read "Name AXPY creates problem in gTest" -- confirm.
+    ::std::tr1::tuple<
+    int,                // N
+    ComplexLong,       // alpha
+    int,                // offBX
+    int,                // incx, should not be 0
+	int,				// offCY
+	int,				// incy, should not be 0
+    int                 // numCommandQueues
+        > > {
+public:
+    void getParams(TestParams *params)   // copies the fixture's values into *params
+    {
+        params->N = N;
+        params->alpha = paramAlpha;
+        params->offBX = offBX;
+        params->incx = incx;
+		params->offCY = offCY;
+		params->incy = incy;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()                 // unpacks GetParam() into the members below
+    {
+        N = ::std::tr1::get<0>(GetParam());
+		paramAlpha = ::std::tr1::get<1>(GetParam());
+        offBX = ::std::tr1::get<2>(GetParam());
+        incx = ::std::tr1::get<3>(GetParam());
+		offCY = ::std::tr1::get<4>(GetParam());
+		incy = ::std::tr1::get<5>(GetParam());
+        numCommandQueues = ::std::tr1::get<6>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+        // When BlasBase reports useNumCommandQueues(), its count replaces the tuple's.
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        // Likewise for N when base->useN() is true.
+        if (base->useN()) {
+            N = base->N();
+        }
+
+		printTestParams(N, paramAlpha, offBX, incx, offCY, incy);
+		::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N;                   // tuple element is int; converted on assignment in SetUp()
+    bool useAlpha;              // not assigned in SetUp() nor read by getParams()
+    ComplexLong paramAlpha;
+    size_t offBX;
+    int incx;
+    size_t offCY;
+	int incy;
+	unsigned int seed;          // taken from base->seed() in SetUp()
+
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif
diff --git a/src/tests/include/blas-cblas.h b/src/tests/include/blas-cblas.h
new file mode 100644
index 0000000..6154922
--- /dev/null
+++ b/src/tests/include/blas-cblas.h
@@ -0,0 +1,243 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef BLAS_CBLAS_H_
+#define BLAS_CBLAS_H_
+
+/* Under Windows math.h defines "complex" to mean "_complex". */
+#include <math.h>
+#undef complex
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Single- and double-precision complex types for the C interfaces to ACML routines; skipped if _ACML_COMPLEX is already defined. */
+#ifndef _ACML_COMPLEX
+#define _ACML_COMPLEX
+typedef struct
+{
+    float real, imag;
+} complex;          /* pair of floats: real part, then imaginary part */
+typedef struct
+{
+    double real, imag;
+} doublecomplex;    /* pair of doubles: real part, then imaginary part */
+#endif /* !defined(_ACML_COMPLEX) */
+
+/* Basic complex arithmetic routines for C */
+complex compose_complex(float x, float y);
+float complex_real(complex z);
+float complex_imag(complex z);
+
+doublecomplex compose_doublecomplex(double x, double y);
+double doublecomplex_real(doublecomplex z);
+double doublecomplex_imag(doublecomplex z);
+
+/* BLAS-2 functions */
+void sgemv(char transa, int m, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy);
+void dgemv(char transa, int m, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy);
+void cgemv(char transa, int m, int n, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy);
+void zgemv(char transa, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy);
+
+void ssymv(char uplo, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy);
+void dsymv(char uplo, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy);
+
+void strmv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx);
+void dtrmv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx);
+void ctrmv(char uplo, char transa, char diag, int n, complex *a, int lda, complex *x, int incx);
+void ztrmv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx);
+
+void strsv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx);
+void dtrsv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx);
+void ctrsv(char uplo, char transa, char diag, int n, complex *a, int lda, complex *x, int incx);
+void ztrsv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx);
+
+void sger(int m, int n, float alpha, float *x, int incx, float *y, int incy, float *a, int lda);
+void dger(int m, int n, double alpha, double *x, int incx, double *y, int incy, double *a, int lda);
+
+void cgeru(int m, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda);
+void zgeru(int m, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda);
+
+void cgerc(int m, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda);
+void zgerc(int m, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda);
+
+void ssyr(char uplo, int n, float alpha, float *x, int incx, float *a, int lda);
+void dsyr(char uplo, int n, double alpha, double *x, int incx, double *a, int lda);
+void ssyr2(char uplo, int n, float alpha, float *x, int incx, float *y, int incy, float *a, int lda);
+void dsyr2(char uplo, int n, double alpha, double *x, int incx, double *y, int incy, double *a, int lda);
+
+void cher(char uplo, int n, float alpha, complex *x, int incx, complex *a, int lda);
+void zher(char uplo, int n, double alpha, doublecomplex *x, int incx, doublecomplex *a, int lda);
+void cher2(char uplo, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda);
+void zher2(char uplo, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda);
+
+void chemv(char uplo, int n, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy);
+void zhemv(char uplo, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy);
+
+void stpmv(char uplo, char transa, char diag, int n, float *ap, float *x, int incx);
+void dtpmv(char uplo, char transa, char diag, int n, double *ap, double *x, int incx);
+void ctpmv(char uplo, char transa, char diag, int n, complex *ap, complex *x, int incx);
+void ztpmv(char uplo, char transa, char diag, int n, doublecomplex *ap, doublecomplex *x, int incx);
+
+void stpsv(char uplo, char transa, char diag, int n, float *ap, float *x, int incx);
+void dtpsv(char uplo, char transa, char diag, int n, double *ap, double *x, int incx);
+void ctpsv(char uplo, char transa, char diag, int n, complex *ap, complex *x, int incx);
+void ztpsv(char uplo, char transa, char diag, int n, doublecomplex *ap, doublecomplex *x, int incx);
+
+void sspr(char uplo, int n, float alpha, float *x, int incx, float *ap );
+void dspr(char uplo, int n, double alpha, double *x, int incx, double *ap );
+
+void sspmv(char uplo, int n, float alpha, float *ap, float *x, int incx, float beta, float *y, int incy);
+void dspmv(char uplo, int n, double alpha, double *ap, double *x, int incx, double beta, double *y, int incy);
+void chpmv(char uplo, int n, complex *alpha, complex *ap, complex *x, int incx, complex *beta, complex *y, int incy);
+void zhpmv(char uplo, int n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy);
+
+void chpr(char uplo, int n, float alpha, complex *x, int incx, complex *ap );
+void zhpr(char uplo, int n, double alpha, doublecomplex *x, int incx, doublecomplex *ap );
+
+void sspr2(char uplo, int n, float alpha, float *x, int incx, float *y, int incy, float *a );
+void dspr2(char uplo, int n, double alpha, double *x, int incx, double *y, int incy, double *a );
+void chpr2(char uplo, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a );
+void zhpr2(char uplo, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a );
+
+void sgbmv(char trans, int m, int n, int kl, int ku, float alpha, float *a, int inca, float *x, int incx, float beta, float *y, int incy );
+void dgbmv(char trans, int m, int n, int kl, int ku, double alpha, double *a, int inca, double *x, int incx, double beta, double *y, int incy );
+void cgbmv(char trans, int m, int n, int kl, int ku, complex *alpha, complex *a, int inca, complex *x, int incx, complex *beta, complex *y, int incy );
+void zgbmv(char trans, int m, int n, int kl, int ku, doublecomplex *alpha, doublecomplex *a, int inca, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy );
+
+void stbmv(char uplo, char trans, char diag, int n, int k, float *a, int lda, float *x, int incx );
+void dtbmv(char uplo, char trans, char diag, int n, int k, double *a, int lda, double *x, int incx );
+void ctbmv(char uplo, char trans, char diag, int n, int k, complex *a, int lda, complex *x, int incx );
+void ztbmv(char uplo, char trans, char diag, int n, int k, doublecomplex *a, int lda, doublecomplex *x, int incx );
+
+void ssbmv(char uplo, int n, int k, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy );
+void dsbmv(char uplo, int n, int k, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy );
+void chbmv(char uplo, int n, int k, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy );
+void zhbmv(char uplo, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy );
+
+void stbsv(char uplo, char trans, char diag, int n, int k, float *a, int lda, float *x, int incx );
+void dtbsv(char uplo, char trans, char diag, int n, int k, double *a, int lda, double *x, int incx );
+void ctbsv(char uplo, char trans, char diag, int n, int k, complex *a, int lda, complex *x, int incx );
+void ztbsv(char uplo, char trans, char diag, int n, int k, doublecomplex *a, int lda, doublecomplex *x, int incx );
+
+/* BLAS-3 functions (the BLAS-1 prototypes follow, starting at sscal) */
+void sgemm(char transa, char transb, int m, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc);
+void dgemm(char transa, char transb, int m, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc);
+void cgemm(char transa, char transb, int m, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc);
+void zgemm(char transa, char transb, int m, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc);
+
+void strmm(char side, char uplo, char transa, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb);
+void dtrmm(char side, char uplo, char transa, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb);
+void ctrmm(char side, char uplo, char transa, char diag, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb);
+void ztrmm(char side, char uplo, char transa, char diag, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb);
+
+void strsm(char side, char uplo, char transa, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb);
+void dtrsm(char side, char uplo, char transa, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb);
+void ctrsm(char side, char uplo, char transa, char diag, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb);
+void ztrsm(char side, char uplo, char transa, char diag, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb);
+
+void ssyr2k(char uplo, char transa, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc);
+void dsyr2k(char uplo, char transa, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc);
+void csyr2k(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc);
+void zsyr2k(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc);
+
+void ssyrk(char uplo, char transa, int n, int k, float alpha, float *a, int lda, float beta, float *c, int ldc);
+void dsyrk(char uplo, char transa, int n, int k, double alpha, double *a, int lda, double beta, double *c, int ldc);
+void csyrk(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *beta, complex *c, int ldc);
+void zsyrk(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *beta, doublecomplex *c, int ldc);
+
+void ssymm(char side, char uplo, int m, int n, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc);
+void dsymm(char side, char uplo, int m, int n, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc);
+void csymm(char side, char uplo, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc);
+void zsymm(char side, char uplo, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc);
+
+void chemm(char side, char uplo, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc);
+void zhemm(char side, char uplo, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc);
+
+void cherk(char uplo, char transa, int n, int k, float alpha, complex *a, int lda, float beta, complex *c, int ldc);
+void zherk(char uplo, char transa, int n, int k, double alpha, doublecomplex *a, int lda, double beta, doublecomplex *c, int ldc);
+
+void cher2k(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, float beta, complex *c, int ldc);
+void zher2k(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, double beta, doublecomplex *c, int ldc);
+
+void sscal( int n, float alpha, float *x, int incx);
+void dscal( int n, double alpha, double *x, int incx);
+void cscal( int n, complex* alpha, complex *x, int incx);
+void zscal( int n, doublecomplex* alpha, doublecomplex *x, int incx);
+
+void csscal( int n, float alpha, complex *x, int incx);
+void zdscal( int n, double alpha, doublecomplex *x, int incx);
+
+void sswap( int n, float *x, int incx, float *y, int incy);
+void dswap( int n, double *x, int incx, double *y, int incy);
+void cswap( int n, complex *x, int incx, complex *y, int incy);
+void zswap( int n, doublecomplex *x, int incx, doublecomplex *y, int incy);
+
+void scopy( int n, float *x, int incx, float *y, int incy);
+void dcopy( int n, double *x, int incx, double *y, int incy);
+void ccopy( int n, complex *x, int incx, complex *y, int incy);
+void zcopy( int n, doublecomplex *x, int incx, doublecomplex *y, int incy);
+
+float sdot( int n, float *x, int incx, float *y, int incy);
+double ddot( int n, double *x, int incx, double *y, int incy);
+complex cdotu( int n, complex *x, int incx, complex *y, int incy);
+doublecomplex zdotu( int n, doublecomplex *x, int incx, doublecomplex *y, int incy);
+complex cdotc( int n, complex *x, int incx, complex *y, int incy);
+doublecomplex zdotc( int n, doublecomplex *x, int incx, doublecomplex *y, int incy);
+/* AXPY: the real variants take alpha by value, the complex variants through a pointer. */
+void saxpy( int n, float alpha, float *x, int incx, float *y, int incy);
+void daxpy( int n, double alpha, double *x, int incx, double *y, int incy);
+void caxpy( int n, complex *alpha, complex *x, int incx, complex *y, int incy);
+void zaxpy( int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy);
+
+void srotg(float *A, float *B, float *C, float *S);
+void drotg(double *A, double *B, double *C, double *S);
+void crotg(complex *A, complex *B, float *C, complex *S);
+void zrotg(doublecomplex *A, doublecomplex *B, double *C, doublecomplex *S);
+
+void srotmg(float *D1, float *D2, float *X1, const float *Y1, float *PARAM);
+void drotmg(double *D1, double *D2, double *X1, const double *Y1, double *PARAM);
+
+void srotm(int N, float *X, int incx, float *Y, int incy, float* PARAM);
+void drotm(int N, double *X, int incx, double *Y, int incy, double* PARAM);
+
+void srot(int N, float *X, int incx, float *Y, int incy, float C, float S);
+void drot(int N, double *X, int incx, double *Y, int incy, double C, double S);
+void csrot(int N, complex *X, int incx, complex *Y, int incy, float C, float S);
+void zdrot(int N, doublecomplex *X, int incx, doublecomplex *Y, int incy, double C, double S);
+
+float sasum(int n, float *x, int incx);
+double dasum(int n, double *x, int incx);
+float scasum(int n, complex *x, int incx);
+double dzasum(int n, doublecomplex *x, int incx);
+
+float snrm2( int n, float *x, int incx);
+double dnrm2( int n, double *x, int incx);
+float scnrm2( int n, complex *x, int incx);
+double dznrm2( int n, doublecomplex *x, int incx);
+
+int isamax(int n, float *x, int incx);
+int idamax(int n, double *x, int incx);
+int icamax(int n, complex *x, int incx);
+int izamax(int n, doublecomplex *x, int incx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* BLAS_CBLAS_H_ */
diff --git a/src/tests/include/blas-internal.h b/src/tests/include/blas-internal.h
new file mode 100644
index 0000000..4ea6a95
--- /dev/null
+++ b/src/tests/include/blas-internal.h
@@ -0,0 +1,2003 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef BLAS_INTERNAL_H_
+#define BLAS_INTERNAL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* BLAS-2 functions */
+
+void
+blasSgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t lda,
+    const float *X,
+    int incx,
+    float beta,
+    float *Y,
+    int incy);
+
+void
+blasDgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t lda,
+    const double *X,
+    int incx,
+    double beta,
+    double *Y,
+    int incy);
+
+void
+blasCgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    const FloatComplex *X,
+    int incx,
+    FloatComplex beta,
+    FloatComplex *Y,
+    int incy);
+
+void
+blasZgemv(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    const DoubleComplex *X,
+    int incx,
+    DoubleComplex beta,
+    DoubleComplex *Y,
+    int incy);
+
+void
+blasSsymv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t lda,
+    const float *X,
+    int incx,
+    float beta,
+    float *Y,
+    int incy);
+
+void
+blasDsymv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t lda,
+    const double *X,
+    int incx,
+    double beta,
+    double *Y,
+    int incy);
+
+/* BLAS-3 functions (BLAS-2 trmv/tpmv/trsv/tpsv prototypes follow blasZsyrk) */
+
+void
+blasSgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    float alpha,
+    const float *A,
+    size_t lda,
+    const float *B,
+    size_t ldb,
+    float beta,
+    float *C,
+    size_t ldc);
+
+void
+blasDgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    double alpha,
+    const double *A,
+    size_t lda,
+    const double *B,
+    size_t ldb,
+    double beta,
+    double *C,
+    size_t ldc);
+
+void
+blasCgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    const FloatComplex *B,
+    size_t ldb,
+    FloatComplex beta,
+    FloatComplex *C,
+    size_t ldc);
+
+void
+blasZgemm(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    const DoubleComplex *B,
+    size_t ldb,
+    DoubleComplex beta,
+    DoubleComplex *C,
+    size_t ldc);
+
+void
+blasStrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t lda,
+    float *B,
+    size_t ldb);
+
+void
+blasDtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t lda,
+    double *B,
+    size_t ldb);
+
+void
+blasCtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    FloatComplex *B,
+    size_t ldb);
+
+void
+blasZtrmm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    DoubleComplex *B,
+    size_t ldb);
+
+void
+blasStrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t lda,
+    float *B,
+    size_t ldb);
+
+void
+blasDtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t lda,
+    double *B,
+    size_t ldb);
+
+void
+blasCtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    FloatComplex *B,
+    size_t ldb);
+
+void
+blasZtrsm(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    DoubleComplex *B,
+    size_t ldb);
+
+void
+blasSsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,
+    const float *A,
+    size_t lda,
+    const float *B,
+    size_t ldb,
+    float beta,
+    float *C,
+    size_t ldc);
+
+void
+blasDsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,
+    const double *A,
+    size_t lda,
+    const double *B,
+    size_t ldb,
+    double beta,
+    double *C,
+    size_t ldc);
+
+void
+blasCsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    const FloatComplex *B,
+    size_t ldb,
+    FloatComplex beta,
+    FloatComplex *C,
+    size_t ldc);
+
+void
+blasZsyr2k(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    const DoubleComplex *B,
+    size_t ldb,
+    DoubleComplex beta,
+    DoubleComplex *C,
+    size_t ldc);
+
+void
+blasSsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    float alpha,
+    const float *A,
+    size_t lda,
+    float beta,
+    float *C,
+    size_t ldc);
+
+void
+blasDsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    double alpha,
+    const double *A,
+    size_t lda,
+    double beta,
+    double *C,
+    size_t ldc);
+
+void
+blasCsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    FloatComplex alpha,
+    const FloatComplex *A,
+    size_t lda,
+    FloatComplex beta,
+    FloatComplex *C,
+    size_t ldc);
+
+void
+blasZsyrk(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    DoubleComplex alpha,
+    const DoubleComplex *A,
+    size_t lda,
+    DoubleComplex beta,
+    DoubleComplex *C,
+    size_t ldc);
+
+void
+blasStrmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx);
+
+void
+blasDtrmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx);
+
+void
+blasCtrmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+void
+blasZtrmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+
+void
+blasStpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *AP,
+        size_t offa,
+        float *X,
+        size_t offx,
+        int incx);
+
+void
+blasDtpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *AP,
+        size_t offa,
+        double *X,
+        size_t offx,
+        int incx);
+
+void
+blasCtpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *AP,
+        size_t offa,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+void
+blasZtpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *AP,
+        size_t offa,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+
+void
+blasStrsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx);
+
+void
+blasDtrsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx);
+
+void
+blasCtrsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+void
+blasZtrsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+void
+blasStpsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *A,
+        size_t offa,
+        float *X,
+        size_t offx,
+        int incx);
+
+void
+blasDtpsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *A,
+        size_t offa,
+        double *X,
+        size_t offx,
+        int incx);
+
+void
+blasCtpsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *A,
+        size_t offa,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+void
+blasZtpsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *A,
+        size_t offa,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+void
+    blasSsymm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        float alpha,
+        float* A,
+	    size_t offa,
+        size_t lda,
+        float* B,
+	    size_t offb,
+        size_t ldb,
+        float beta,
+        float* C,
+	    size_t offc,
+        size_t ldc);
+
+void
+    blasDsymm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        double alpha,
+        double* A,
+	    size_t offa,
+        size_t lda,
+        double* B,
+	    size_t offb,
+        size_t ldb,
+        double beta,
+        double* C,
+	    size_t offc,
+        size_t ldc);
+
+void
+    blasCsymm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* A,
+	    size_t offa,
+        size_t lda,
+        FloatComplex* B,
+	    size_t offb,
+        size_t ldb,
+        FloatComplex  beta,
+        FloatComplex* C,
+	    size_t offc,
+        size_t ldc);
+
+void
+    blasZsymm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* A,
+	    size_t offa,
+        size_t lda,
+        DoubleComplex* B,
+	    size_t offb,
+        size_t ldb,
+        DoubleComplex beta,
+        DoubleComplex* C,
+	    size_t offc,
+        size_t ldc);
+
+void
+    blasSger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        float alpha,
+        float* x,
+        size_t offx,
+        int incx,
+        float* y,
+        size_t offy,
+        int incy,
+        float* A,
+        size_t offa,
+        size_t lda);
+
+void
+    blasDger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        double alpha,
+        double* x,
+        size_t offx,
+        int incx,
+        double* y,
+        size_t offy,
+        int incy,
+        double* A,
+        size_t offa,
+        size_t lda);
+
+void
+    blasCgeru(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* x,
+        size_t offx,
+        int incx,
+        FloatComplex* y,
+        size_t offy,
+        int incy,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda);
+
+void
+    blasZgeru(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* x,
+        size_t offx,
+        int incx,
+        DoubleComplex* y,
+        size_t offy,
+        int incy,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda);
+
+void
+    blasCgerc(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* x,
+        size_t offx,
+        int incx,
+        FloatComplex* y,
+        size_t offy,
+        int incy,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda);
+
+void
+    blasZgerc(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* x,
+        size_t offx,
+        int incx,
+        DoubleComplex* y,
+        size_t offy,
+        int incy,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda);
+
+
+void
+    blasCher(
+        clblasOrder order,
+ 	    clblasUplo uplo,
+        size_t N,
+        float alpha,
+        FloatComplex* x,
+        size_t offx,
+        int incx,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda);
+
+void
+    blasZher(
+        clblasOrder order,
+	    clblasUplo uplo,
+        size_t N,
+        double alpha,
+        DoubleComplex* x,
+        size_t offx,
+        int incx,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda);
+
+
+void
+	 blasDsyr(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double alpha,
+        double* X,
+        size_t offx,
+        int incx,
+        double* A,
+        size_t offa,
+        size_t lda);
+void
+        blasSsyr(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        float* X,
+        size_t offx,
+        int incx,
+        float* A,
+        size_t offa,
+        size_t lda);
+
+void
+     blasDspr(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double alpha,
+        double* X,
+        size_t offx,
+        int incx,
+        double* AP,
+        size_t offa);
+
+void
+    blasSspr(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        float* X,
+        size_t offx,
+        int incx,
+        float* AP,
+        size_t offa);
+
+
+void
+	blasSsyr2(
+		clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        float* X,
+        size_t offx,
+        int incx,
+		float* Y,
+        size_t offy,
+        int incy,
+        float* A,
+        size_t offa,
+        size_t lda);
+
+void
+	 blasDsyr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double alpha,
+        double* X,
+        size_t offx,
+        int incx,
+        double* Y,
+        size_t offy,
+        int incy,
+		double* A,
+        size_t offa,
+        size_t lda);
+
+
+//HER2
+void
+    blasCher2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex* Y,
+        size_t offy,
+        int incy,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda);
+
+void
+     blasZher2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda);
+
+
+
+void
+    blasChemv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        FloatComplex* Y,
+        size_t offy,
+        int incy);
+
+void
+    blasZhemv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy);
+//HEMM
+void
+    blasChemm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda,
+        FloatComplex* B,
+        size_t offb,
+        size_t ldb,
+        FloatComplex  beta,
+        FloatComplex* C,
+        size_t offc,
+        size_t ldc);
+
+void
+    blasZhemm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex* B,
+        size_t offb,
+        size_t ldb,
+        DoubleComplex beta,
+        DoubleComplex* C,
+        size_t offc,
+        size_t ldc);
+
+
+void
+	blasCherk(
+    	clblasOrder order,
+	    clblasUplo uplo,
+    	clblasTranspose transA,
+	    size_t N,
+    	size_t K,
+	    float alpha,
+    	const FloatComplex *A,
+    	size_t lda,
+	    float beta,
+    	FloatComplex *C,
+    	size_t ldc);
+
+void
+	blasZherk(
+	    clblasOrder order,
+    	clblasUplo uplo,
+	    clblasTranspose transA,
+    	size_t N,
+	    size_t K,
+    	double alpha,
+	    const DoubleComplex *A,
+	    size_t lda,
+    	double beta,
+	    DoubleComplex *C,
+	    size_t ldc);
+
+
+void
+blasSspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    float alpha,
+    const float *A,
+    size_t offa,
+    const float *X,
+    size_t offx,
+    int incx,
+    float beta,
+    float *Y,
+    size_t offy,
+    int incy);
+
+void
+blasDspmv(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    const double *A,
+    size_t offa,
+    const double *X,
+    size_t offx,
+    int incx,
+    double beta,
+    double *Y,
+    size_t offy,
+    int incy);
+
+
+void
+    blasChpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* A,
+        size_t offa,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        FloatComplex* Y,
+        size_t offy,
+        int incy);
+
+void
+    blasZhpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* A,
+        size_t offa,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy);
+
+void
+    blasChpr(
+        clblasOrder order,
+ 	    clblasUplo uplo,
+        size_t N,
+        float alpha,
+        FloatComplex* x,
+        size_t offx,
+        int incx,
+        FloatComplex* AP,
+        size_t offa);
+
+void
+    blasZhpr(
+        clblasOrder order,
+	    clblasUplo uplo,
+        size_t N,
+        double alpha,
+        DoubleComplex* x,
+        size_t offx,
+        int incx,
+        DoubleComplex* AP,
+        size_t offa);
+
+void
+	blasSspr2(
+		clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        float* X,
+        size_t offx,
+        int incx,
+		float* Y,
+        size_t offy,
+        int incy,
+        float* AP,
+        size_t offa);
+
+void
+	 blasDspr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double alpha,
+        double* X,
+        size_t offx,
+        int incx,
+        double* Y,
+        size_t offy,
+        int incy,
+		double* AP,
+        size_t offa);
+
+
+void
+    blasChpr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex* Y,
+        size_t offy,
+        int incy,
+        FloatComplex* AP,
+        size_t offa);
+
+void
+     blasZhpr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy,
+        DoubleComplex* AP,
+        size_t offa);
+
+void
+    blasSgbmv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        size_t KL,
+        size_t KU,
+        float alpha,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx,
+        float beta,
+        float *Y,
+        size_t offy,
+        int incy);
+
+void
+    blasDgbmv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        size_t KL,
+        size_t KU,
+        double alpha,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx,
+        double beta,
+        double *Y,
+        size_t offy,
+        int incy);
+
+void
+    blasCgbmv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        size_t KL,
+        size_t KU,
+        FloatComplex alpha,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+
+void
+    blasZgbmv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        size_t KL,
+        size_t KU,
+        DoubleComplex alpha,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+//TBMV
+
+void
+    blasStbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx);
+
+void
+    blasDtbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx);
+void
+    blasCtbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+void
+    blasZtbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+void
+    blasSsbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        float alpha,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx,
+        float beta,
+        float *Y,
+        size_t offy,
+        int incy);
+
+void
+    blasDsbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        double alpha,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx,
+        double beta,
+        double *Y,
+        size_t offy,
+        int incy);
+
+
+void
+    blasChbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+
+void
+    blasZhbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+//TBSV
+
+void
+    blasStbsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx);
+
+void
+    blasDtbsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx);
+void
+    blasCtbsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+void
+    blasZtbsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+void
+	blasCher2k(
+    	clblasOrder order,
+	    clblasUplo uplo,
+    	clblasTranspose transA,
+	    size_t N,
+    	size_t K,
+	    FloatComplex alpha,
+    	const FloatComplex *A,
+        size_t offa,
+    	size_t lda,
+        const FloatComplex *B,
+        size_t offb,
+    	size_t ldb,
+	    float beta,
+    	FloatComplex *C,
+        size_t offc,
+    	size_t ldc);
+
+void
+	blasZher2k(
+	    clblasOrder order,
+    	clblasUplo uplo,
+	    clblasTranspose transA,
+    	size_t N,
+	    size_t K,
+    	DoubleComplex alpha,
+	    const DoubleComplex *A,
+        size_t offa,
+	    size_t lda,
+        const DoubleComplex *B,
+        size_t offb,
+	    size_t ldb,
+    	double beta,
+	    DoubleComplex *C,
+        size_t offc,
+	    size_t ldc);
+
+/* BLAS-1 functions */
+
+//swap
+void
+blasSswap(
+	size_t N,
+	float *X,
+	size_t offBX,
+	int incx,
+	float *Y,
+	size_t offCY,
+	int incy);
+
+void
+blasDswap(
+	size_t N,
+	double *X,
+	size_t offBX,
+	int incx,
+	double *Y,
+	size_t offCY,
+	int incy);
+
+void
+blasCswap(
+	size_t N,
+	FloatComplex *X,
+	size_t offBX,
+	int incx,
+	FloatComplex *Y,
+	size_t offCY,
+	int incy);
+
+void
+blasZswap(
+	size_t N,
+	DoubleComplex *X,
+	size_t offBX,
+	int incx,
+	DoubleComplex *Y,
+	size_t offCY,
+	int incy);
+
+
+
+//Scal
+void
+	blasSscal(
+	    size_t N,
+        float alpha,
+        float *X,
+        size_t offx,
+        int incx);
+
+void
+	blasDscal(
+	    size_t N,
+        double alpha,
+        double *X,
+        size_t offx,
+        int incx);
+
+void
+	blasCscal(
+	    size_t N,
+        FloatComplex alpha,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+
+void
+	blasZscal(
+	    size_t N,
+        DoubleComplex alpha,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+void
+	blasCsscal(
+	    size_t N,
+        float alpha,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+void
+	blasZdscal(
+	    size_t N,
+        double alpha,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+//COPY
+
+void
+blasScopy(
+    size_t N,
+    float *X,
+    size_t offx,
+    int incx,
+    float *Y,
+    size_t offy,
+    int incy);
+
+void
+blasDcopy(
+    size_t N,
+    double *X,
+    size_t offx,
+    int incx,
+    double *Y,
+    size_t offy,
+    int incy);
+
+void
+blasCcopy(
+    size_t N,
+    FloatComplex *X,
+    size_t offx,
+    int incx,
+    FloatComplex *Y,
+    size_t offy,
+    int incy);
+
+void
+blasZcopy(
+    size_t N,
+    DoubleComplex *X,
+    size_t offx,
+    int incx,
+    DoubleComplex *Y,
+    size_t offy,
+    int incy);
+
+
+// DOT
+float
+blasSdot(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx,
+        float *Y,
+        size_t offy,
+        int incy);
+
+double
+blasDdot(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx,
+        double *Y,
+        size_t offy,
+        int incy);
+
+FloatComplex
+blasCdotu(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+
+DoubleComplex
+blasZdotu(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+//ASUM
+
+float
+blasSasum(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx);
+
+double
+blasDasum(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx);
+
+float
+blasScasum(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+double
+blasDzasum(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+//DOTC
+FloatComplex
+blasCdotc(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+
+DoubleComplex
+blasZdotc(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+//axpy
+void
+blasSaxpy(
+        size_t N,
+        float alpha,
+        const float *X,
+        size_t offBX,
+        int incx,
+        float *Y,
+        size_t offCY,
+        int incy);
+
+void
+blasDaxpy(
+        size_t N,
+        double alpha,
+        const double *X,
+        size_t offBX,
+        int incx,
+        double *Y,
+        size_t offCY,
+        int incy);
+void
+blasCaxpy(
+        size_t N,
+        FloatComplex alpha,
+        const FloatComplex *X,
+        size_t offBX,
+        int incx,
+        FloatComplex *Y,
+        size_t offCY,
+        int incy);
+void
+blasZaxpy(
+        size_t N,
+        DoubleComplex alpha,
+        const DoubleComplex *X,
+        size_t offBX,
+        int incx,
+        DoubleComplex *Y,
+        size_t offCY,
+        int incy);
+
+//ROTG
+void
+blasSrotg(
+        float* SA,
+        size_t offSA,
+        float* SB,
+        size_t offSB,
+        float* C,
+        size_t offC,
+        float* S,
+        size_t offS);
+
+void
+blasDrotg(
+        double* SA,
+        size_t offSA,
+        double* SB,
+        size_t offSB,
+        double* C,
+        size_t offC,
+        double* S,
+        size_t offS);
+
+void
+blasCrotg(
+        FloatComplex* SA,
+        size_t offSA,
+        FloatComplex* SB,
+        size_t offSB,
+        float* C,
+        size_t offC,
+        FloatComplex* S,
+        size_t offS);
+
+void
+blasZrotg(
+        DoubleComplex* SA,
+        size_t offSA,
+        DoubleComplex* SB,
+        size_t offSB,
+        double* C,
+        size_t offC,
+        DoubleComplex* S,
+        size_t offS);
+void
+blasSrotmg(
+        float *D1,
+        size_t offD1,
+        float *D2,
+        size_t offD2,
+        float *X1,
+        size_t offX1,
+        const float *Y1,
+        size_t offY1,
+        float *PARAM,
+        size_t offParam);
+
+void
+blasDrotmg(
+        double *D1,
+        size_t offD1,
+        double *D2,
+        size_t offD2,
+        double *X1,
+        size_t offX1,
+        const double *Y1,
+        size_t offY1,
+        double *PARAM,
+        size_t offParam);
+
+void
+blasSrotm(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx,
+        float *Y,
+        size_t offy,
+        int incy,
+        float *PARAM,
+        size_t offParam);
+
+void
+blasDrotm(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx,
+        double *Y,
+        size_t offy,
+        int incy,
+        double *PARAM,
+        size_t offParam);
+
+void
+blasSrot(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx,
+        float *Y,
+        size_t offy,
+        int incy,
+        float C,
+        float S);
+
+void
+blasDrot(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx,
+        double *Y,
+        size_t offy,
+        int incy,
+        double C,
+        double S);
+
+void
+blasCsrot(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex *Y,
+        size_t offy,
+        int incy,
+        float C,
+        float S);
+
+void
+blasZdrot(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy,
+        double C,
+        double S);
+
+int
+blasiSamax(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx);
+
+int
+blasiDamax(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx);
+
+int
+blasiCamax(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+int
+blasiZamax(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+float
+blasSnrm2(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx);
+
+double
+blasDnrm2(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx);
+
+float
+blasScnrm2(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+double
+blasDznrm2(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+#ifdef __cplusplus
+}
+   /* extern "C" { */
+#endif
+
+#endif  /* BLAS_INTERNAL_H_ */
diff --git a/src/tests/include/blas-math.h b/src/tests/include/blas-math.h
new file mode 100644
index 0000000..a7e3293
--- /dev/null
+++ b/src/tests/include/blas-math.h
@@ -0,0 +1,369 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef BLAS_MATH_H_
+#define BLAS_MATH_H_
+
+#if defined (_MSC_VER)
+
+/*
+ * For MSVC builds this header supplies its own NAN / NANF by reinterpreting
+ * IEEE 754 bit patterns stored in integer variables.
+ *
+ * ROW_NAN must be a quiet NaN: exponent all ones AND a non-zero mantissa
+ * (0x7ff8...). The pattern 0x7ff0000000000000 has a zero mantissa and encodes
+ * +infinity, which would make FNAN<double>() return infinity instead of NaN.
+ * ROW_NANF (0x7fc00000) is already the single-precision quiet NaN.
+ */
+static unsigned long long ROW_NAN = 0x7ff8000000000000ULL;
+static unsigned int ROW_NANF = 0x7fc00000;
+
+#define NAN *(reinterpret_cast<double*>(&ROW_NAN))
+#define NANF *(reinterpret_cast<float*>(&ROW_NANF))
+
+#else   /* _MSC_VER */
+
+/* Elsewhere NAN comes from <math.h>; NANF simply aliases it. */
+#define NANF NAN
+
+#endif  /* !_MSC_VER */
+
+#include <math.h>       // NAN, sqrt, abs
+#include <stdlib.h>     // rand()
+#include <clBLAS.h>
+#include <common.h>
+
+/* Magnitude of a signed integer, computed with abs(). */
+static inline cl_int
+module(cl_int a)
+{
+    const cl_int magnitude = abs(a);
+
+    return magnitude;
+}
+
+/* Magnitude of a single-precision real, computed with fabsf(). */
+static inline cl_float
+module(cl_float a)
+{
+    const cl_float magnitude = fabsf(a);
+
+    return magnitude;
+}
+
+/* Magnitude of a double-precision real, computed with fabs(). */
+static inline cl_double
+module(cl_double a)
+{
+    const cl_double magnitude = fabs(a);
+
+    return magnitude;
+}
+
+/*
+ * Sum of two single-precision complex numbers.
+ * Real and imaginary parts are each the sum of the matching parts of a and b
+ * (the imaginary part must use CIMAG(a), not CIMAG(b) twice).
+ */
+static inline FloatComplex
+operator+(FloatComplex a, FloatComplex b)
+{
+	return floatComplex(CREAL(a) + CREAL(b), CIMAG(a) + CIMAG(b));
+}
+
+/*
+ * Difference a - b of two single-precision complex numbers.
+ * Real and imaginary parts are each the difference of the matching parts of
+ * a and b (the imaginary part must use CIMAG(a), otherwise it is always 0).
+ */
+static inline FloatComplex
+operator-(FloatComplex a, FloatComplex b)
+{
+	return floatComplex(CREAL(a) - CREAL(b), CIMAG(a) - CIMAG(b));
+}
+
+/* Product of two single-precision complex numbers. */
+static inline FloatComplex
+operator*(FloatComplex a, FloatComplex b)
+{
+	const cl_float re = CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b);
+	const cl_float im = CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a);
+
+	return floatComplex(re, im);
+}
+
+/* Scales both parts of a single-precision complex number by the real b. */
+static inline FloatComplex
+operator*(FloatComplex a, cl_float b)
+{
+	const cl_float re = CREAL(a) * b;
+	const cl_float im = CIMAG(a) * b;
+
+	return floatComplex(re, im);
+}
+
+/*
+ * Quotient a / b of two single-precision complex numbers, computed as
+ * a * conj(b) divided by |b|^2. No check is made for b == 0.
+ */
+static inline FloatComplex
+operator/(FloatComplex a, FloatComplex b)
+{
+	const cl_float normSq = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b);
+	const cl_float reNum = CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b);
+	const cl_float imNum = CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b);
+
+	return floatComplex(reNum / normSq, imNum / normSq);
+}
+
+/* Divides both parts of a single-precision complex number by the real b. */
+static inline FloatComplex
+operator/(FloatComplex a, cl_float b)
+{
+	const cl_float re = CREAL(a) / b;
+	const cl_float im = CIMAG(a) / b;
+
+	return floatComplex(re, im);
+}
+
+/*
+ * Modulus of a single-precision complex number: sqrt(re^2 + im^2).
+ * An exact zero input returns 0 without calling sqrtf().
+ */
+static inline cl_float
+module(FloatComplex a)
+{
+    if ((CREAL(a) != 0.0) || (CIMAG(a) != 0.0)) {
+        return sqrtf(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a));
+    }
+
+    return 0.0;
+}
+
+/*
+ * Sum of two double-precision complex numbers.
+ * Real and imaginary parts are each the sum of the matching parts of a and b
+ * (the imaginary part must use CIMAG(a), not CIMAG(b) twice).
+ */
+static inline DoubleComplex
+operator+(DoubleComplex a, DoubleComplex b)
+{
+	return doubleComplex(CREAL(a) + CREAL(b), CIMAG(a) + CIMAG(b));
+}
+
+/*
+ * Difference a - b of two double-precision complex numbers.
+ * Real and imaginary parts are each the difference of the matching parts of
+ * a and b (the imaginary part must use CIMAG(a), otherwise it is always 0).
+ */
+static inline DoubleComplex
+operator-(DoubleComplex a, DoubleComplex b)
+{
+	return doubleComplex(CREAL(a) - CREAL(b), CIMAG(a) - CIMAG(b));
+}
+
+/* Product of two double-precision complex numbers. */
+static inline DoubleComplex
+operator*(DoubleComplex a, DoubleComplex b)
+{
+	const cl_double re = CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b);
+	const cl_double im = CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a);
+
+	return doubleComplex(re, im);
+}
+
+/* Scales both parts of a double-precision complex number by the real b. */
+static inline DoubleComplex
+operator*(DoubleComplex a, cl_double b)
+{
+	const cl_double re = CREAL(a) * b;
+	const cl_double im = CIMAG(a) * b;
+
+	return doubleComplex(re, im);
+}
+
+/*
+ * Quotient a / b of two double-precision complex numbers, computed as
+ * a * conj(b) divided by |b|^2. No check is made for b == 0.
+ */
+static inline DoubleComplex
+operator/(DoubleComplex a, DoubleComplex b)
+{
+	const cl_double normSq = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b);
+	const cl_double reNum = CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b);
+	const cl_double imNum = CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b);
+
+	return doubleComplex(reNum / normSq, imNum / normSq);
+}
+
+/* Divides both parts of a double-precision complex number by the real b. */
+static inline DoubleComplex
+operator/(DoubleComplex a, cl_double b)
+{
+	const cl_double re = CREAL(a) / b;
+	const cl_double im = CIMAG(a) / b;
+
+	return doubleComplex(re, im);
+}
+
+/*
+ * Modulus of a double-precision complex number: sqrt(re^2 + im^2).
+ * An exact zero input returns 0 without calling sqrt().
+ */
+static inline cl_double
+module(DoubleComplex a)
+{
+    if ((CREAL(a) != 0.0) || (CIMAG(a) != 0.0)) {
+        return sqrt(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a));
+    }
+
+    return 0.0;
+}
+
+// Random generator
+
+/*
+ * Random value with magnitude in [0, limit] and a random sign.
+ *
+ * The magnitude is rand()/RAND_MAX scaled by limit. If that comes out as
+ * exactly zero, a coin flip decides whether to draw it once more. The
+ * magnitude is then passed through a float cast before the sign is chosen by
+ * a final coin flip.
+ */
+template<class T>
+static T
+randomTrsv(cl_double limit)
+{
+    T magnitude = ((T)rand() / (T)(RAND_MAX));
+    magnitude = magnitude * (T)limit;
+
+    // Short-circuit keeps the coin flip conditional on a zero draw.
+    if ((magnitude == 0) && ((rand() % 2) == 1)) {
+        magnitude = ((T)rand() / (T)(RAND_MAX));
+        magnitude = magnitude * (T)limit;
+    }
+
+    T result = static_cast<float>(magnitude);
+    if ((rand() % 2) == 1) {
+        result = -result;
+    }
+    return result;
+}
+
+/*
+ * Complex variants: real and imaginary parts are drawn independently.
+ * Both draws stay as direct call arguments so the (unspecified) evaluation
+ * order chosen by the compiler is not altered.
+ */
+template<>
+__template_static FloatComplex
+randomTrsv<FloatComplex>(cl_double limit)
+{
+    return floatComplex(randomTrsv<cl_float>(limit), randomTrsv<cl_float>(limit));
+}
+
+template<>
+__template_static DoubleComplex
+randomTrsv<DoubleComplex>(cl_double limit)
+{
+    return doubleComplex(randomTrsv<cl_double>(limit), randomTrsv<cl_double>(limit));
+}
+
+/*
+ * Random value whose magnitude is shifted away from zero by `left`.
+ *
+ * Draws randomTrsv<T>(right - left), then moves the result further from zero
+ * by `left`: negative draws have `left` subtracted, all others have it added.
+ */
+template<typename T>
+static T
+randomTrsv(cl_double left, cl_double right)
+{
+    const T shift = static_cast<T>(left);
+    T value = randomTrsv<T>(right - left);
+
+    if (value < 0) {
+        value -= shift;
+        return value;
+    }
+    value += shift;
+    return value;
+}
+
+/*
+ * Complex variants: each component is drawn independently with the same
+ * (left, right) arguments. The draws stay as direct call arguments so their
+ * evaluation order is left to the compiler, as before.
+ */
+template<>
+__template_static FloatComplex
+randomTrsv<FloatComplex>(cl_double left, cl_double right)
+{
+    return floatComplex(randomTrsv<cl_float>(left, right),
+        randomTrsv<cl_float>(left, right));
+}
+
+template<>
+__template_static DoubleComplex
+randomTrsv<DoubleComplex>(cl_double left, cl_double right)
+{
+    return doubleComplex(randomTrsv<cl_double>(left, right),
+        randomTrsv<cl_double>(left, right));
+}
+
+
+/*
+ * Random whole-number value with magnitude below `limit` and a random sign.
+ *
+ * `limit` is truncated to an unsigned integer range; a range of 0 yields 0
+ * without consuming any rand() values. Otherwise the magnitude is
+ * rand() % range (passed through a float cast) and a second rand() call
+ * decides the sign.
+ */
+template<class T>
+static T
+random(cl_double limit)
+{
+    const cl_ulong range = static_cast<cl_ulong>(limit);
+
+    if (range == 0) {
+        return 0;
+    }
+
+    T value = static_cast<float>(rand() % range);
+    if ((rand() % 2) == 1) {
+        value = -value;
+    }
+    return value;
+}
+
+/*
+ * Complex variants: real and imaginary parts are drawn independently.
+ * Both draws stay as direct call arguments so the (unspecified) evaluation
+ * order chosen by the compiler is not altered.
+ */
+template<>
+__template_static FloatComplex
+random<FloatComplex>(cl_double limit)
+{
+    return floatComplex(random<cl_float>(limit), random<cl_float>(limit));
+}
+
+template<>
+__template_static DoubleComplex
+random<DoubleComplex>(cl_double limit)
+{
+    return doubleComplex(random<cl_double>(limit), random<cl_double>(limit));
+}
+
+/*
+ * Random value whose magnitude is shifted away from zero by `left`.
+ *
+ * Draws random<T>(right - left), then moves the result further from zero by
+ * `left`: negative draws have `left` subtracted, all others have it added.
+ */
+template<typename T>
+static T
+random(cl_double left, cl_double right)
+{
+    const T shift = static_cast<T>(left);
+    T value = random<T>(right - left);
+
+    if (value < 0) {
+        value -= shift;
+        return value;
+    }
+    value += shift;
+    return value;
+}
+
+/*
+ * Complex variants: each component is drawn independently with the same
+ * (left, right) arguments. The draws stay as direct call arguments so their
+ * evaluation order is left to the compiler, as before.
+ */
+template<>
+__template_static FloatComplex
+random<FloatComplex>(cl_double left, cl_double right)
+{
+    return floatComplex(random<cl_float>(left, right),
+        random<cl_float>(left, right));
+}
+
+template<>
+__template_static DoubleComplex
+random<DoubleComplex>(cl_double left, cl_double right)
+{
+    return doubleComplex(random<cl_double>(left, right),
+        random<cl_double>(left, right));
+}
+
+// Type-dependent constants
+
+/* Zero value of element type T (0.0 converted to T). */
+template<class T>
+static T
+ZERO()
+{
+    const T zero = static_cast<T>(0.0);
+
+    return zero;
+}
+
+/* Single-precision complex zero: both components are 0. */
+template<>
+__template_static FloatComplex
+ZERO<FloatComplex>()
+{
+    const FloatComplex zero = floatComplex(0.0, 0.0);
+
+    return zero;
+}
+
+/* Double-precision complex zero: both components are 0. */
+template<>
+__template_static DoubleComplex
+ZERO<DoubleComplex>()
+{
+    const DoubleComplex zero = doubleComplex(0.0, 0.0);
+
+    return zero;
+}
+
+
+/* Unit value of element type T (1.0 converted to T). */
+template<class T>
+static T
+ONE()
+{
+    const T one = static_cast<T>(1.0);
+
+    return one;
+}
+
+/* Single-precision complex one: real part 1, imaginary part 0. */
+template<>
+__template_static FloatComplex
+ONE<FloatComplex>()
+{
+    const FloatComplex one = floatComplex(1.0, 0.0);
+
+    return one;
+}
+
+/* Double-precision complex one: real part 1, imaginary part 0. */
+template<>
+__template_static DoubleComplex
+ONE<DoubleComplex>()
+{
+    const DoubleComplex one = doubleComplex(1.0, 0.0);
+
+    return one;
+}
+
+/*
+ * NaN value of element type T. Only the specializations below are defined;
+ * the primary template is declared without a body.
+ */
+template<class T>
+static T
+FNAN();
+
+/* float: the NANF macro. */
+template<>
+__template_static float
+FNAN<float>()
+{
+    const float value = NANF;
+
+    return value;
+}
+
+/* double: the NAN macro. */
+template<>
+__template_static double
+FNAN<double>()
+{
+    const double value = NAN;
+
+    return value;
+}
+
+/* Single-precision complex: NANF in both components. */
+template<>
+__template_static FloatComplex
+FNAN<FloatComplex>()
+{
+    const FloatComplex value = floatComplex(NANF, NANF);
+
+    return value;
+}
+
+/* Double-precision complex: NAN in both components. */
+template<>
+__template_static DoubleComplex
+FNAN<DoubleComplex>()
+{
+    const DoubleComplex value = doubleComplex(NAN, NAN);
+
+    return value;
+}
+
+#endif	/* BLAS_MATH_H_ */
diff --git a/src/tests/include/blas-random.h b/src/tests/include/blas-random.h
new file mode 100644
index 0000000..85fd457
--- /dev/null
+++ b/src/tests/include/blas-random.h
@@ -0,0 +1,1236 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef BLAS_RANDOM_H_
+#define BLAS_RANDOM_H_
+
+#include <clBLAS.h>
+#include <math.h>       // sqrt()
+
+#include <blas-math.h>
+#include <test-limits.h>
+#include <matrix.h>
+#include <testDG.h>
+
+/*
+ * Fill the operands of a GEMM-like test (A is M x K, B is K x N, C is M x N)
+ * with random values.
+ *
+ * - When useAlpha is false, *alpha is overwritten with random<T>(100), or
+ *   with ONE<T>() if that draw has zero modulus.
+ * - A (and B, when non-NULL) receive random<T>(bound) with
+ *   bound = sqrt((K - 1) * UPPER_BOUND / (|alpha| * K * K)).
+ * - When useBeta is false and beta is non-NULL, *beta = random<T>(100).
+ * - When C is non-NULL, beta is dereferenced (so it must be non-NULL too) and
+ *   C receives random<T>(bound), where bound is UPPER_BOUND, reduced to
+ *   sqrt(UPPER_BOUND / (|beta| * K)) when |beta| is non-zero.
+ *
+ * Elements are stored through setElement<T>() using the given order,
+ * transposition flag and leading dimension of each matrix.
+ */
+template <typename T>
+static void
+randomGemmxMatrices(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    clblasTranspose transC,
+    size_t M,
+    size_t N,
+    size_t K,
+    bool useAlpha,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *B,
+    size_t ldb,
+    bool useBeta,
+    T *beta,
+    T *C,
+    size_t ldc)
+{
+    cl_double limit;
+
+    if (!useAlpha) {
+        *alpha = random<T>(100);
+        if (module(*alpha) == 0.0) {
+            *alpha = ONE<T>();
+        }
+    }
+
+    limit = UPPER_BOUND<T>();
+    limit = sqrt(((K - 1) * limit) / (module(*alpha) * K * K));
+
+    for (size_t row = 0; row < M; row++) {
+        for (size_t inner = 0; inner < K; inner++) {
+            setElement<T>(order, transA, row, inner, A, lda, random<T>(limit));
+        }
+    }
+
+    if (B != NULL) {
+        for (size_t inner = 0; inner < K; inner++) {
+            for (size_t col = 0; col < N; col++) {
+                setElement<T>(order, transB, inner, col, B, ldb, random<T>(limit));
+            }
+        }
+    }
+
+    if ((!useBeta) && (beta != NULL)) {
+        *beta = random<T>(100);
+    }
+
+    if (C == NULL) {
+        return;
+    }
+
+    // C is present, so beta is required to be non-NULL from here on.
+    limit = UPPER_BOUND<T>();
+    if (module(*beta) != 0.0) {
+        limit = sqrt(limit / (module(*beta) * K));
+    }
+
+    for (size_t row = 0; row < M; row++) {
+        for (size_t col = 0; col < N; col++) {
+            setElement<T>(order, transC, row, col, C, ldc, random<T>(limit));
+        }
+    }
+}
+
+/*
+ * Random GEMM operands with an untransposed C: forwards every argument to
+ * randomGemmxMatrices<T>() with transC fixed to clblasNoTrans.
+ */
+template <typename T>
+static void
+randomGemmMatrices(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    bool useAlpha,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *B,
+    size_t ldb,
+    bool useBeta,
+    T *beta,
+    T *C,
+    size_t ldc)
+{
+    const clblasTranspose transC = clblasNoTrans;
+
+    randomGemmxMatrices<T>(
+        order, transA, transB, transC,
+        M, N, K,
+        useAlpha, alpha,
+        A, lda,
+        B, ldb,
+        useBeta, beta,
+        C, ldc);
+}
+
+/*
+ * Random operands for a TRMM test.
+ *
+ * A and B are first filled through randomGemmMatrices<T>() (no beta, no C):
+ * for clblasLeft, A takes the first-operand slot with K = M; for clblasRight,
+ * B takes the first-operand slot and A the second with K = N. The square
+ * matrix A therefore has dimension limA = M (left) or N (right).
+ *
+ * Afterwards every element of A that the routine under test must not read is
+ * overwritten with FNAN<T>(): the strictly lower part for clblasUpper, the
+ * strictly upper part for clblasLower, and the diagonal when diag is
+ * clblasUnit.
+ */
+template <typename T>
+static void
+randomTrmmMatrices(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *B,
+    size_t ldb)
+{
+    size_t limA = 0;        /* Dimension of A: M or N */
+
+    if (side == clblasLeft) {
+        randomGemmMatrices<T>(order, clblasNoTrans, clblasNoTrans, M, N, M,
+            useAlpha, alpha, A, lda, B, ldb, false, NULL, NULL, 0);
+        limA = M;
+    }
+    else if (side == clblasRight) {
+        randomGemmMatrices<T>(order, clblasNoTrans, clblasNoTrans, M, N, N,
+            useAlpha, alpha, B, ldb, A, lda, false, NULL, NULL, 0);
+        limA = N;
+    }
+
+    // Poison the triangle that must not be accessed.
+    if (uplo == clblasUpper) {
+        for (size_t row = 0; row < limA; row++) {
+            for (size_t col = 0; col < row; col++) {
+                setElement<T>(order, clblasNoTrans, row, col, A, lda, FNAN<T>());
+            }
+        }
+    }
+    else if (uplo == clblasLower) {
+        for (size_t row = 0; row < limA; row++) {
+            for (size_t col = row + 1; col < limA; col++) {
+                setElement<T>(order, clblasNoTrans, row, col, A, lda, FNAN<T>());
+            }
+        }
+    }
+
+    // A unit diagonal is implied, so the stored diagonal is poisoned as well.
+    if (diag == clblasUnit) {
+        for (size_t d = 0; d < limA; d++) {
+            setElement<T>(order, clblasNoTrans, d, d, A, lda, FNAN<T>());
+        }
+    }
+}
+
+/*
+ * Random operands for a TRSM test: a triangular matrix A of dimension limA
+ * (M for clblasLeft, N otherwise) and an M x N matrix B.
+ *
+ * Steps, in order (the order of random<T>() calls is part of the behaviour):
+ *  1. Diagonal of A: ONE<T>() for clblasUnit (temporarily, see step 4);
+ *     for clblasNonUnit a first value `max` with modulus >= 1 and further
+ *     values drawn with random<T>(|max/100|, |max|).
+ *  2. Off-diagonal of A: elements in the `uplo` triangle are random values
+ *     limited by a per-row budget `sum` that starts at the diagonal modulus;
+ *     all other off-diagonal elements are FNAN<T>().
+ *  3. B: random values scaled by the matching diagonal element of A, with a
+ *     budget that starts at TRSM_LIMIT_B<T>(); the element of smallest
+ *     modulus is tracked in `min`.
+ *  4. For clblasUnit the diagonal of A is overwritten with FNAN<T>().
+ *  5. *alpha is set to ONE<T>() when useAlpha is false, possibly replaced by
+ *     a random value when |min| > |alpha|, and B is divided by *alpha when
+ *     |alpha| != 1.
+ */
+template <typename T>
+static void
+randomTrsmMatrices(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *B,
+    size_t ldb)
+{
+    size_t limA, i, j;
+    T min, max, x, y;
+    cl_double modMin, modMax, sum;
+
+    min = ZERO<T>();
+    max = ZERO<T>();
+
+    if (side == clblasLeft) {
+        limA = M;
+    }
+    else {
+        limA = N;
+    }
+
+    /*
+     * Generate max(|a_{ii}|). Determine min(|a_{ii}|).
+     * Generate a_{ii} which are constrained by min/max.
+     */
+    switch (diag) {
+    case clblasUnit:
+        for (i = 0; i < limA; i++) {
+            // must not be accessed; holds ONE here and is replaced by NaN
+            // after B has been generated
+            setElement<T>(order, clblasNoTrans, i, i, A, lda, ONE<T>());
+        }
+        break;
+    case clblasNonUnit:
+        /* Do not allow zeros on A's main diagonal */
+        do {
+            max = random<T>(TRSM_LIMIT_A<T>());
+        } while (module(max) < 1);
+        modMax = module(max);
+        min = max / 100;
+        modMin = module(min);
+        setElement<T>(order, clblasNoTrans, 0, 0, A, lda, max);
+        for (i = 1; i < limA; i++) {
+            x = random<T>(modMin, modMax);
+            if (module(x) == 0) {
+                x = max;
+            }
+            setElement<T>(order, clblasNoTrans, i, i, A, lda, x);
+        }
+        break;
+    }
+
+    /* Generate a_{ij} for all j <> i. */
+    for (i = 0; i < limA; i++) {
+        // Per-row budget: starts at the modulus of the diagonal element.
+        if (diag == clblasUnit) {
+            sum = module(ONE<T>());
+        }
+        else {
+            sum = module(getElement<T>(order, clblasNoTrans, i, i, A, lda));
+        }
+
+        for (j = 0; j < limA; j++) {
+            if (j == i) {
+                continue;
+            }
+
+            if (((uplo == clblasUpper) && (j > i)) ||
+                ((uplo == clblasLower) && (j < i))) {
+                // useful element
+                if (sum >= 1.0) {
+                    x = random<T>(sum / sqrt((double)limA - j));
+                    sum -= module(x);
+                }
+                else {
+                    x = ZERO<T>();
+                }
+            }
+            else {
+                // must not be accessed
+                x = FNAN<T>();
+            }
+
+            setElement<T>(order, clblasNoTrans, i, j, A, lda, x);
+        }
+    }
+
+    /* Generate matrix B. */
+    switch (side) {
+    case clblasLeft:
+        // Column by column; each column gets a fresh budget.
+        for (j = 0; j < N; j++) {
+            sum = TRSM_LIMIT_B<T>();
+            for (i = 0; i < M; i++) {
+                x = getElement<T>(order, clblasNoTrans, i, i, A, lda);
+                y = ZERO<T>();
+                if (sum >= 0.0) {
+                    y = random<T>(sum * module(x) / sqrt((double)M - i));
+                    sum -= module(y) / module(x);
+                }
+                setElement<T>(order, clblasNoTrans, i, j, B, ldb, y);
+                if ((i == 0) && (j == 0)) {
+                    min = y;
+                }
+                else if (module(y) < module(min)) {
+                    min = y;
+                }
+            }
+        }
+        break;
+    case clblasRight:
+        // Row by row; each row gets a fresh budget.
+        for (i = 0; i < M; i++) {
+            sum = TRSM_LIMIT_B<T>();
+            for (j = 0; j < N; j++) {
+                x = getElement<T>(order, clblasNoTrans, j, j, A, lda);
+                y = ZERO<T>();
+                if (sum >= 0.0) {
+                    y = random<T>(sum * module(x) / sqrt((double)N - j));
+                    sum -= module(y) / module(x);
+                }
+                setElement<T>(order, clblasNoTrans, i, j, B, ldb, y);
+                if ((i == 0) && (j == 0)) {
+                    min = y;
+                }
+                else if (module(y) < module(min)) {
+                    min = y;
+                }
+            }
+        }
+        break;
+    }
+    if (diag == clblasUnit) {
+        for (i = 0; i < limA; i++) {
+            // must not be accessed
+            setElement<T>(order, clblasNoTrans, i, i, A, lda, FNAN<T>());
+        }
+    }
+
+    /* Calculate alpha and adjust B accordingly */
+    if (!useAlpha) {
+        *alpha = ONE<T>();
+    }
+    if (module(min) > module(*alpha)) {
+        /* FIXME: What exactly next three lines do? */
+        *alpha = random<T>(module(min) - 2);
+        *alpha = *alpha + ONE<T>();
+        *alpha = *alpha + ONE<T>();
+
+        if (module(*alpha) < 1.0) {
+            *alpha = ONE<T>();
+        }
+    }
+    if (module(*alpha) != 1.0) {
+        for (i = 0; i < M; i++) {
+            for (j = 0; j < N; j++) {
+                x = getElement<T>(order, clblasNoTrans, i, j, B, ldb);
+                x = x / *alpha;
+                setElement<T>(order, clblasNoTrans, i, j, B, ldb, x);
+            }
+        }
+    }
+}
+
+/*
+ * Random operands for a TRSV test: an N x N triangular matrix A and a
+ * vector X.
+ *
+ * A is written through setElement<T>() when lda > 0 and through
+ * setElementPacked<T>() (with N in place of lda) otherwise.
+ *
+ *  - Diagonal: ONE<T>() for clblasUnit; for clblasNonUnit a first value `max`
+ *    with modulus >= maxDiag and further values drawn with
+ *    randomTrsv<T>(|max/100|, |max|), replaced by `max` when their modulus is
+ *    below 1.
+ *  - Off-diagonal: randomTrsv<T>(sum/N) inside the `uplo` triangle, where sum
+ *    is the modulus of the row's diagonal element; FNAN<T>() elsewhere.
+ *  - X: element i is randomTrsv<T>(|a_ii| / N), stored at index i*|incx|.
+ *
+ * NOTE(review): `min` is tracked while X is generated but never used
+ * afterwards, and the initial `sum = TRSM_LIMIT_B<T>()` before the X loop is
+ * overwritten on the first iteration.
+ */
+template <typename T>
+static void
+randomTrsvMatrices(
+    clblasOrder order,
+	clblasUplo uplo,
+    clblasDiag diag,
+    size_t N,
+    T *A,
+    size_t lda,
+    T *X,
+    int incx)
+{
+	size_t i, j;
+    T min, max, x, y;
+    cl_double modMin, modMax, sum, maxDiag;
+
+    min = ZERO<T>();
+    max = ZERO<T>();
+	incx = abs(incx);
+    maxDiag = 1.0;
+
+    cl_double bound;
+    bound = (UPPER_BOUND<T>()/(N));
+
+    switch (diag) {
+    case clblasUnit:
+        for (i = 0; i < N; i++) {
+            // must not be accessed
+            if(lda > 0)
+            {
+            setElement<T>(order, clblasNoTrans, i, i, A, lda, ONE/*FNAN*/<T>());
+        }
+            else //Packed case
+            {
+                setElementPacked<T>(order, clblasNoTrans, uplo, i, i, A, N, ONE/*FNAN*/<T>());
+            }
+        }
+        break;
+    case clblasNonUnit:
+        /* Do not allow zeros on A's main diagonal: require the first diagonal
+         * value to have modulus at least maxDiag, which is N/4 (or bound/4
+         * when N/4 exceeds bound), but never less than 1. */
+        maxDiag = ((N/4) > bound) ? (bound/4) : (N/4);
+        maxDiag = (1 > (maxDiag)) ? 1 : maxDiag;
+        do {
+            max = randomTrsv<T>(bound);
+        } while ((module(max) < (maxDiag)));
+        modMax = module(max);
+        min = max / 100;
+        modMin = module(min);
+        if(lda > 0)
+        {
+        setElement<T>(order, clblasNoTrans, 0, 0, A, lda, max);
+        }
+        else //Packed Case
+        {
+            setElementPacked<T>(order, clblasNoTrans, uplo, 0, 0, A, N, max);
+        }
+        //printf("Diagonals %d ", max);
+        for (i = 1; i < N; i++) {
+            x = randomTrsv<T>(modMin, modMax);
+            if (module(x) < 1) {
+                x = max;
+            }
+            //printf("%d ", x);
+            /*if(module(x) < 1)
+            {
+                printf("WARNING: Diagonal less than one\n");
+            }*/
+            if(lda > 0)
+            {
+            setElement<T>(order, clblasNoTrans, i, i, A, lda, x);
+        }
+            else
+            {
+                setElementPacked<T>(order, clblasNoTrans, uplo, i, i, A, N, x);
+            }
+        }
+       // printf("\n");
+        break;
+    }
+
+    /* Generate a_{ij} for all j <> i. */
+    for (i = 0; i < N; i++) {
+
+        // sum = modulus of this row's diagonal element (1 for unit diagonal)
+        if (diag == clblasUnit) {
+            sum = module(ONE<T>());
+        }
+        else {
+            T temp;
+            if(lda > 0)
+            {
+                temp = getElement<T>(order, clblasNoTrans, i, i, A, lda);
+        }
+            else
+            {
+                temp = getElementPacked<T>(order, clblasNoTrans, uplo, i, i, A, N);
+            }
+            sum = module(temp);
+        }
+
+        for (j = 0; j < N; j++) {
+            if (j == i) {
+                continue;
+            }
+
+            if (((uplo == clblasUpper) && (j > i)) ||
+                ((uplo == clblasLower) && (j < i)))
+            {
+                x = randomTrsv<T>(sum/N);
+                }
+                else {
+                // must not be accessed
+                x = FNAN<T>();
+            }
+            if(lda > 0)
+            {
+            setElement<T>(order, clblasNoTrans, i, j, A, lda, x);
+        }
+            else //Packed Case.
+            {
+                setElementPacked<T>(order, clblasNoTrans, uplo, i, j, A, N, x);
+    }
+        }
+    }
+
+    /* Generate vector X. */
+    sum = TRSM_LIMIT_B<T>();
+    for (i = 0; i < N; i++) {
+        if(lda > 0)
+        {
+        x = getElement<T>(order, clblasNoTrans, i, i, A, lda);
+        }
+        else //Packed Case.
+        {
+            x = getElementPacked<T>(order, clblasNoTrans, uplo, i, i, A, N);
+        }
+        sum = module(x);
+        y = randomTrsv<T>(sum/N);
+        // X is addressed as a one-column, column-major matrix of height
+        // 1 + (N-1)*|incx|, so element i lands at index i*|incx|.
+        setElement<T>(clblasColumnMajor, clblasNoTrans, (i * abs(incx)), 0, X, (1 + (N-1)*abs(incx)), y);
+        if (i == 0) {
+            min = y;
+        }
+        else if (module(y) < module(min)) {
+            min = y;
+        }
+    }
+}
+
+/*
+ * Random operands for a SYR-style test: scalar alpha, N x N matrix A and
+ * vector X.
+ *
+ * - When useAlpha is false, *alpha is overwritten with random<T>(100), or
+ *   ONE<T>() if that draw has zero modulus. If |alpha| exceeds
+ *   sqrt(UPPER_BOUND)/2 it is redrawn below that value.
+ * - `bound` is then derived from UPPER_BOUND and alpha (see the equation
+ *   comment below) and used as the limit for every random<T>() draw.
+ * - A is filled over the full N x N index range, through setElement<T>()
+ *   when lda is non-zero and setElementPacked<T>() otherwise.
+ * - X, when non-NULL, is filled over 1 + (N-1)*|incx| consecutive entries.
+ *
+ * NOTE(review): the second term of the final bound uses `*alpha` itself
+ * while every other term uses module(*alpha); for a negative alpha this
+ * enlarges the bound. The redraw of alpha is also not re-checked for zero
+ * before it is used as a divisor. Confirm whether both are intended.
+ */
+template <typename T>
+static void
+randomSyrMatrices(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    bool useAlpha,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *X,
+	int incx
+    )
+{
+    size_t i, j;
+	size_t lengthX;
+    cl_double bound;
+
+    if (!useAlpha) {
+        *alpha = random<T>(100);
+        if (module(*alpha) == 0.0) {
+            *alpha = ONE<T>();
+        }
+    }
+	#ifdef DEBUG_SYR
+	printf("ALPHA in randomSyrMatrices %f\n", *alpha);
+	#endif
+
+	// bound is calculated by solving the equation (alpha*x^2 + x - UPPER_BOUND) < 0
+
+	bound = UPPER_BOUND<T>();
+
+	if(module(*alpha) > (sqrt(bound) / (2.0)))
+		*alpha = random<T>((sqrt(bound) / (2.0)));
+
+	#ifdef DEBUG_SYR
+	printf("ALPHA in randomSyrMatrices after check %f bound for alpha is %f\n", *alpha, (sqrt(bound) / (2.0)));
+	#endif
+
+	bound = bound / module(*alpha);
+
+    // Positive root of the quadratic: sqrt(1/(4*alpha^2) + U/alpha) - 1/(2*alpha)
+    bound = sqrt( ((((1.0) / module(*alpha)) / (4.0)) / module(*alpha)) + bound) - ((1.0) / ((2.0) * (*alpha)));
+
+	#ifdef DEBUG_SYR
+	printf("BOUND : %f alpha %f \n", bound, *alpha);
+	#endif
+
+     // lda == 0 selects the packed storage path
+     if( lda )
+    {
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++) {
+            setElement<T>(order, clblasNoTrans, i, j, A, lda, random<T>(bound));
+        }
+    }
+    } else {
+        for (i = 0; i < N; i++) {
+            for (j = 0; j < N; j++) {
+                setElementPacked<T>(order, clblasNoTrans, uplo, i, j, A, N, random<T>(bound));
+            }
+        }
+    }
+
+
+	lengthX = 1 + ((N - 1) * abs(incx));
+    if (X != NULL) {
+        for (i = 0; i < lengthX; i++) {
+			X[i] = random<T>(bound);
+        }
+    }
+}
+
+/*
+ * Random operands for a SYR2-style test: scalar alpha, N x N matrix A and
+ * vectors X and Y.
+ *
+ * - When useAlpha is false, *alpha is overwritten with random<T>(100), or
+ *   ONE<T>() if that draw has zero modulus. If |alpha| exceeds
+ *   sqrt(UPPER_BOUND)/4 it is redrawn below that value.
+ * - `bound` is then derived from UPPER_BOUND and alpha (see the equation
+ *   comment below) and used as the limit for every random<T>() draw.
+ * - A is filled over the full N x N index range, through setElement<T>()
+ *   when lda is non-zero and setElementPacked<T>() otherwise.
+ * - X and Y, when non-NULL, are filled over 1 + (N-1)*|inc| consecutive
+ *   entries each.
+ *
+ * NOTE(review): the second term of the final bound uses `*alpha` itself
+ * while every other term uses module(*alpha) -- confirm this is intended.
+ * The DEBUG_SYR2 message also prints sqrt(bound)/2 although the check above
+ * it uses sqrt(bound)/4.
+ */
+template <typename T>
+static void
+randomSyr2Matrices(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    bool useAlpha,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *X,
+	int incx,
+	T *Y,
+	int incy
+    )
+{
+    size_t i, j;
+	size_t lengthX;
+    size_t lengthY;
+	cl_double bound;
+
+    if (!useAlpha) {
+        *alpha = random<T>(100);
+        if (module(*alpha) == 0.0) {
+            *alpha = ONE<T>();
+        }
+    }
+	#ifdef DEBUG_SYR2
+	printf("ALPHA in randomSyr2Matrices %f\n", *alpha);
+	#endif
+
+	// bound is calculated by solving the equation (2*alpha*x^2 + x - UPPER_BOUND) < 0
+
+	bound = UPPER_BOUND<T>();
+
+	if(module(*alpha) > (sqrt(bound) / (4.0)))
+		*alpha = random<T>((sqrt(bound) / (4.0)));
+
+	#ifdef DEBUG_SYR2
+	printf("ALPHA in randomSyrMatrices after check %f bound for alpha is %f\n", *alpha, (sqrt(bound) / (2.0)));
+	#endif
+
+	bound = bound / ( 2 * module(*alpha));
+
+    // Positive root of the quadratic: sqrt(1/(16*alpha^2) + U/(2*alpha)) - 1/(4*alpha)
+    bound = sqrt( ((((1.0) / module(*alpha)) / (16.0)) / module(*alpha)) + bound) - ((1.0) / ((4.0) * (*alpha)));
+
+	#ifdef DEBUG_SYR2
+	printf("BOUND : %f alpha %f \n", bound, *alpha);
+	#endif
+
+    // lda == 0 selects the packed storage path
+    if( lda )
+    {
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++) {
+            setElement<T>(order, clblasNoTrans, i, j, A, lda, random<T>(bound));
+        }
+    }
+    } else {
+        for (i = 0; i < N; i++) {
+            for (j = 0; j < N; j++) {
+                setElementPacked<T>(order, clblasNoTrans, uplo, i, j, A, N, random<T>(bound));
+            }
+        }
+    }
+
+	lengthX = 1 + ((N - 1) * abs(incx));
+    if (X != NULL) {
+        for (i = 0; i < lengthX; i++) {
+			X[i] = random<T>(bound);
+        }
+    }
+	lengthY = 1 + (N - 1) * abs(incy);
+	if (Y != NULL) {
+		for (i = 0; i < lengthY; i++) {
+			Y[i] = random<T>(bound);
+		}
+	}
+}
+
+/*
+ * Random operands for a HEMV-style test: complex scalars alpha and beta,
+ * N x N matrix A and vectors X and Y. T must be a complex type, since the
+ * code reads and assigns CREAL()/CIMAG() of *alpha and *beta.
+ *
+ * - When useAlpha / useBeta is false, the scalar is overwritten with
+ *   random<T>(100); a zero real part is replaced by 1.
+ * - If either component of a scalar exceeds UPPER_BOUND in magnitude the
+ *   scalar is redrawn; a zero real part is again replaced by 1.
+ * - fAlpha / fBeta are the larger of |real| and |imag| of each scalar and
+ *   feed the final `bound` used for every random<T>() draw.
+ * - A is filled over the full N x N index range, through setElement<T>()
+ *   when lda is non-zero and setElementPacked<T>() otherwise.
+ * - X and Y, when non-NULL, are filled over 1 + (N-1)*|inc| consecutive
+ *   entries each.
+ *
+ * NOTE(review): beta is dereferenced unconditionally, so it must not be
+ * NULL. The DEBUG_HEMV messages still name randomSyr2Matrices /
+ * randomSyrMatrices.
+ */
+template <typename T>
+static void
+randomHemvMatrices(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    bool useAlpha,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *X,
+    int incx,
+	bool useBeta,
+	T *beta,
+    T *Y,
+    int incy
+    )
+{
+    size_t i, j;
+    size_t lengthX;
+    size_t lengthY;
+    cl_double bound;
+	cl_double fAlpha, fBeta;
+
+    if (!useAlpha) {
+        *alpha = random<T>(100);
+        if (module(CREAL(*alpha)) == 0.0) {
+            CREAL(*alpha) = 1.0;
+        }
+    }
+
+	if (!useBeta) {
+        *beta = random<T>(100);
+        if (module(CREAL(*beta)) == 0.0) {
+            CREAL(*beta) = 1.0;
+        }
+    }
+
+    #ifdef DEBUG_HEMV
+    printf("ALPHA in randomSyr2Matrices %f.%f\n", CREAL(*alpha), CIMAG(*alpha));
+    printf("BETA in randomSyr2Matrices %f.%f\n", CREAL(*beta), CIMAG(*beta));
+    #endif
+
+    // bound is calculated by solving the equation (2*alpha*x^2 + x - UPPER_BOUND) < 0
+
+    bound = UPPER_BOUND<T>();
+
+    if((module(CREAL(*alpha)) > bound) || (module(CIMAG(*alpha)) > bound))
+        *alpha = random<T>((sqrt(bound) / ((2.0) * N)));
+	if (module(CREAL(*alpha)) == 0.0) {
+            CREAL(*alpha) = 1.0;
+    }
+
+	if((module(CREAL(*beta)) > bound) || (module(CIMAG(*beta)) > bound))
+        *beta = random<T>((sqrt(bound)));
+	if (module(CREAL(*beta)) == 0.0) {
+            CREAL(*beta) = 1.0;
+    }
+
+    #ifdef DEBUG_HEMV
+    printf("ALPHA in randomSyrMatrices after check %f.%f bound for alpha is %f\n", CREAL(*alpha), CIMAG(*alpha), (sqrt(bound) / ((2.0) * N)));
+    #endif
+
+	// Largest component magnitude of each scalar
+	fAlpha = (module(CREAL(*alpha)) > module(CIMAG(*alpha))) ? module(CREAL(*alpha)) : module(CIMAG(*alpha));
+	fBeta  = (module(CREAL(*beta)) > module(CIMAG(*beta))) ? module(CREAL(*beta)) : module(CIMAG(*beta));
+
+    bound = bound / (fAlpha * N);
+
+    bound = sqrt( ((((((fBeta * fBeta)) / fAlpha) / (4.0)) / fAlpha) / (N * N)) + bound) - ((fBeta) / ((2.0) * (fAlpha) * N));
+
+    #ifdef DEBUG_HEMV
+    printf("BOUND : %f \n", bound);
+    #endif
+
+    // lda == 0 selects the packed storage path
+    if( lda )
+    {
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++) {
+            setElement<T>(order, clblasNoTrans, i, j, A, lda, random<T>(bound));
+        }
+    }
+    } else {
+        for (i = 0; i < N; i++) {
+            for (j = 0; j < N; j++) {
+                setElementPacked<T>(order, clblasNoTrans, uplo, i, j, A, N, random<T>(bound));
+            }
+        }
+    }
+
+    lengthX = 1 + ((N - 1) * abs(incx));
+    if (X != NULL) {
+        for (i = 0; i < lengthX; i++) {
+            X[i] = random<T>(bound);
+        }
+    }
+    lengthY = 1 + (N - 1) * abs(incy);
+    if (Y != NULL) {
+        for (i = 0; i < lengthY; i++) {
+            Y[i] = random<T>(bound);
+        }
+    }
+}
+
+/*
+ * Fill vector X, and Y when it is non-NULL, with random values.
+ *
+ * Every entry is random<T>(bound) with
+ * bound = sqrt(UPPER_BOUND / q) / 2, where q is N when considerN is set and
+ * 1 otherwise. Each vector is filled over 1 + (N-1)*|inc| consecutive
+ * entries, X first.
+ */
+template <typename T>
+static void randomVectors(
+    size_t N,
+    T *X,
+    int incx,
+    T *Y = NULL,
+    int incy = 0,
+    bool considerN=false
+    )
+{
+    cl_double divisor = 1.0;
+    if (considerN) {
+        divisor = N;
+    }
+    // sqrt for the alpha factor and 2 for addition
+    const cl_double limit = sqrt( UPPER_BOUND<T>()/divisor ) / 2;
+
+    const int lengthX = 1 + ((N - 1) * abs(incx));
+    for (int idx = 0; idx < lengthX; idx++) {
+        X[idx] = random<T>(limit);
+    }
+
+    if (Y == NULL) {
+        return;
+    }
+
+    const int lengthY = 1 + ((N - 1) * abs(incy));
+    for (int idx = 0; idx < lengthY; idx++) {
+        Y[idx] = random<T>(limit);
+    }
+}
+
+// testDG
+/* Assign random<T>(bound) to each of the first vectorLength entries of p. */
+template <typename T>
+static void
+setElementWithRandomData(T *p, int vectorLength, cl_double bound)
+{
+	int component = 0;
+
+	while (component < vectorLength) {
+		p[component] = random<T>(bound);
+		++component;
+	}
+}
+
+/*
+ * Store the value one in the element at p: p[0] becomes 1 and, for
+ * two-component elements (vectorLength == 2), p[1] becomes 0.
+ */
+template <typename T>
+static void
+setElementWithUnity(T *p, int vectorLength)
+{
+	const bool hasSecondComponent = (vectorLength == 2);
+
+	p[0] = (T)1.0;
+	if (hasSecondComponent) {
+		p[1] = 0.0f;
+	}
+}
+
+
+/* Set each of the first vectorLength entries of p to zero. */
+template <typename T>
+static void
+setElementWithZero(T *p, int vectorLength)
+{
+	int component = 0;
+
+	while (component < vectorLength) {
+		p[component] = (T)0.0;
+		++component;
+	}
+}
+
+
+
+/*
+ * Write the diagonal of a matrix.
+ *
+ * `unity` selects the value: 1 stores unity (setElementWithUnity), 0 stores
+ * zero (setElementWithZero), anything else stores random data limited by
+ * `bound` (setElementWithRandomData).
+ *
+ * With PACKED_MATRIX set in creationFlags the diagonal element address comes
+ * from the RMUPacked / RMLPacked macros, chosen by the UPPER_HALF_ONLY and
+ * ROW_MAJOR_ORDER flags. Otherwise the matrix is dense and diagonal element
+ * i lives at (i*lda + i) * vectorLength, with rows (row-major) or cols
+ * (column-major) diagonal elements written.
+ * NOTE(review): RMUPacked / RMLPacked are defined elsewhere; presumably they
+ * expand to pointers into `data` -- confirm.
+ */
+template <typename T>
+static void
+setDiagonalUnityOrNonUnity(int unity, T* data, size_t rows, size_t cols, size_t lda, int vectorLength, int creationFlags, cl_double bound)
+{
+
+	if (creationFlags & PACKED_MATRIX)
+	{
+
+		// Rows = Cols for PACKED Matrix
+		for(size_t i=0;i< rows;i++)
+		{
+			// Nested conditional: unity==1 -> unity, unity==0 -> zero, else random
+			if (creationFlags & UPPER_HALF_ONLY)
+    			{
+				(unity==1)? setElementWithUnity( ((creationFlags & ROW_MAJOR_ORDER))?RMUPacked(i,i):RMLPacked(i,i), vectorLength):
+                      			(unity == 0)? setElementWithZero( ((creationFlags & ROW_MAJOR_ORDER))?RMUPacked(i,i):RMLPacked(i,i), vectorLength):
+                      				setElementWithRandomData( ((creationFlags & ROW_MAJOR_ORDER))?RMUPacked(i,i):RMLPacked(i,i), vectorLength, bound);
+    			}
+    			else
+    			{
+          			(unity==1)? setElementWithUnity( (creationFlags & ROW_MAJOR_ORDER)?RMLPacked(i,i):RMUPacked(i,i), vectorLength):
+                      			(unity==0)? setElementWithZero( (creationFlags & ROW_MAJOR_ORDER)?RMLPacked(i,i):RMUPacked(i,i), vectorLength):
+                      				setElementWithRandomData( (creationFlags & ROW_MAJOR_ORDER)?RMLPacked(i,i):RMUPacked(i,i) , vectorLength, bound);
+    			}
+		}
+	}
+	else
+	{
+		// Row Major - rows x lda
+		// Col major - lda x cols
+		size_t firstdimension;
+		T *p;
+
+		if (creationFlags & ROW_MAJOR_ORDER)
+		{
+			firstdimension = rows;
+		} else {
+			firstdimension = cols;
+		}
+
+		for(size_t i=0; i<firstdimension; i++)
+		{
+			// Start of line i, then step i elements along it to the diagonal
+			p = (T *)data + (i*lda)*vectorLength;
+			p += i*vectorLength;
+
+			if (unity == 0)
+			{
+				setElementWithZero(p, vectorLength);
+			}
+			else if (unity == 1)
+			{
+				setElementWithUnity(p, vectorLength);
+			}
+			else
+			{
+				setElementWithRandomData(p, vectorLength, bound);
+			}
+		}
+	}
+}
+
+/*
+ * Fill one strict triangle of a matrix (the diagonal is never written) with
+ * random data limited by `bound`.
+ *
+ * Packed storage (PACKED_MATRIX set): uplo == 'L' fills the elements with
+ * j < i; any other uplo value fills the elements with j > i. Addresses come
+ * from the RMLPacked / RMUPacked macros, chosen by ROW_MAJOR_ORDER.
+ *
+ * Dense storage: uplo must be exactly 'U' or 'L', otherwise -1 is thrown.
+ * For column-major data the triangle is flipped and rows/cols are exchanged,
+ * so the loop can always walk the buffer line by line with stride lda.
+ * NOTE(review): the packed branch does not validate uplo, unlike the dense
+ * branch -- confirm whether that is intended.
+ */
+template <typename T>
+static void
+setTriangularMatrixWithRandomData(char uplo, T* data, size_t rows, size_t cols, size_t lda, int vectorLength, int creationFlags, cl_double bound)
+{
+
+	// Packed Matrix
+	if (creationFlags & PACKED_MATRIX)
+	{
+		if (uplo == 'L')
+		{
+			for( size_t i=0; i < rows; i++)
+			{
+				for( size_t j=0; j < i; j++) // Don't touch diagonals
+				{
+					//setRandom( (flags & ROW_MAJOR) ? RMLPacked(i,j) : CMLPacked(i,j));
+					setElementWithRandomData( (creationFlags & ROW_MAJOR_ORDER) ? RMLPacked(i,j) : RMUPacked(j,i), vectorLength, bound);
+				}
+			}
+		}
+		else
+		{
+			for( size_t i=0; i < rows; i++)
+			{
+				for( size_t j=(i+1); j < cols; j++) // Don't touch diagonals
+				{
+					//printf("(i,j) -- (%d,%d) : Index : %d\n", i, j, ((i*((2*rows) + 1 - i))/2 + (j -i)));
+					setElementWithRandomData( (creationFlags & ROW_MAJOR_ORDER) ? RMUPacked(i,j) : RMLPacked(j,i), vectorLength, bound);
+				}
+			}
+		}
+	}
+	else
+	{
+		// Row Major - rows x lda
+		// Col major - lda x cols
+		size_t firstdimension, seconddimension;
+		T *p;
+
+		if ((uplo != 'U') && (uplo != 'L'))
+		{
+			throw -1;
+		}
+
+		if (creationFlags & ROW_MAJOR_ORDER)
+		{
+			firstdimension = rows;
+			seconddimension = cols;
+		} else {
+			// Column-major: walk columns as lines and flip the triangle
+			firstdimension = cols;
+			seconddimension = rows;
+			if (uplo == 'U')
+			{
+				uplo = 'L';
+			} else {
+				uplo = 'U';
+			}
+		}
+
+		for(size_t i=0; i<firstdimension; i++)
+		{
+			size_t start, end;
+
+			p = (T *)data + (i* lda)*vectorLength;
+
+			// Fill the row
+			// ('u' cannot occur here: uplo was validated as 'U' or 'L' above)
+			if ((uplo == 'U') || (uplo == 'u'))
+			{
+				start = i+1;
+				end = seconddimension;
+			} else {
+				start = 0;
+				end = i;
+			}
+			for(size_t j=start; j<end; j++) // Don't populate the diagonal
+			{
+				setElementWithRandomData(p + j*vectorLength, vectorLength, bound);
+			}
+		}
+	}
+}
+
+// Copies (LTOU, UTOL) or exchanges (SWAP) each lower-triangle element and its mirrored (j,i) counterpart in 'data', in place.
+// Elements are vectorLength scalars of T and lines are lda elements apart; an unrecognised op throws int -1 once an off-diagonal element is visited.
+template <typename T>
+static void
+doTriangleOperation(TRIANGLE_OPERATIONS op, T* data, size_t rows, size_t cols, size_t lda, int vectorLength, int creationFlags )
+{
+        size_t firstdimension, seconddimension;
+        T *p1, *p2;
+		size_t start, end;
+
+        if (creationFlags & ROW_MAJOR_ORDER)
+        {
+            firstdimension = rows;
+            seconddimension = cols;
+        } else {
+            firstdimension = cols;
+            seconddimension = rows;
+        }
+
+		for(size_t i=0; i<firstdimension; i++)
+		{
+			//
+			// Get the correct Lower Triangle offsets for ROW
+			// and COL major matrices: positions before the diagonal of
+			// line i when row-major, positions after it when column-major.
+			if (creationFlags & ROW_MAJOR_ORDER)
+			{
+				start =0; end = i;
+			} else {
+				start =i+1; end = seconddimension;
+			}
+
+			for(size_t j=start; j<end; j++)
+			{
+				p1 = (T *)data + i*lda*vectorLength + j*vectorLength; // LT Address
+				p2 = (T *)data + j*lda*vectorLength + i*vectorLength; // UT Address
+				switch(op)
+				{
+				case LTOU:
+					for(int k=0; k<vectorLength; k++)
+					{
+						p2[k] = p1[k];
+					}
+					break;
+				case UTOL:
+					for(int k=0; k<vectorLength; k++)
+					{
+						p1[k] = p2[k];
+					}
+					break;
+				case SWAP:
+					for(int k=0; k<vectorLength; k++)
+					{
+						T temp;
+						// Save the lower-triangle value first so it survives the overwrite below.
+						temp = p1[k];
+						p1[k] = p2[k];
+						p2[k] = temp;
+					}
+					break;
+				default:
+					throw -1;
+				} // end switch
+			}
+		}
+	}
+
+
+// Populates a rows x cols matrix (lines lda elements apart, vectorLength scalars of T per element), forwarding 'bound' to the random-element helpers.
+// Handles floats and doubles only; the complex populate<> specialisations pass their buffer as float/double with vectorLength 2.
+// Default creationFlags is 0 (no flags), which selects the column-major paths.
+template <typename T>
+static void
+doPopulate(T* data, size_t rows, size_t cols, size_t lda, int vectorLength, cl_double bound, int creationFlags = 0)
+{
+    bool triangularMatrix = ((creationFlags & LOWER_HALF_ONLY) ||
+							(creationFlags & UPPER_HALF_ONLY));
+
+
+	// Non-Square Matrix: every element is randomised; only the order and diagonal flags are consulted on this path
+	if( rows != cols)
+	{
+		// Row-Major
+		if (creationFlags & ROW_MAJOR_ORDER)
+		{
+			for( size_t i=0; i < rows; i++)
+			{
+				for(size_t j=0; j < cols; j++)
+				{
+
+					T *p = (T *)data + i* lda*vectorLength + j*vectorLength;
+					setElementWithRandomData(p, vectorLength , bound);
+					// A diagonal element is then overwritten when a unit or zero diagonal was requested.
+					if ( i == j)
+					{
+						if (creationFlags & UNIT_DIAGONAL)
+						{
+							setElementWithUnity(p, vectorLength);
+						} else if (creationFlags & ZERO_DIAGONAL)
+						{
+							setElementWithZero(p, vectorLength);
+						}
+					}
+				}
+			}
+		}
+		else // Col-Major
+		{
+			for( size_t i=0; i < rows; i++)
+			{
+				for(size_t j=0; j < cols; j++)
+				{
+					T *p = (T *)data + j* lda*vectorLength + i*vectorLength;
+					setElementWithRandomData(p, vectorLength, bound);
+					if ( i == j)
+					{
+						if (creationFlags & UNIT_DIAGONAL)
+						{
+							setElementWithUnity(p, vectorLength);
+						} else if (creationFlags & ZERO_DIAGONAL)
+						{
+							setElementWithZero(p, vectorLength);
+						}
+					}
+
+				}
+			}
+		}
+	}
+
+	else if ( creationFlags & PACKED_MATRIX ) // SQUARE and PACKED
+	{
+		if (triangularMatrix)
+		{
+			if (creationFlags & UPPER_HALF_ONLY)
+				setTriangularMatrixWithRandomData('U', data, rows, cols, lda, vectorLength, creationFlags, bound);
+			if (creationFlags & LOWER_HALF_ONLY)
+				{
+					setTriangularMatrixWithRandomData('L', data, rows, cols, lda, vectorLength, creationFlags, bound);
+				}
+		}
+		else
+		{
+			// FIXME: throw -1;  (packed without UPPER/LOWER_HALF_ONLY: no off-diagonal element is written, only the diagonal step below runs)
+		}
+		// NOTE(review): setDiagonalUnity/Zero/Random are not defined in this function; presumably macros operating on the locals here -- confirm.
+		if (creationFlags & UNIT_DIAGONAL)
+		{
+			setDiagonalUnity();
+		} else if (creationFlags & ZERO_DIAGONAL)
+		{
+			setDiagonalZero();
+		} else
+		{
+			setDiagonalRandom();
+		}
+
+
+	} else // SQUARE
+	{
+		if (triangularMatrix)
+		{
+			if (creationFlags & UPPER_HALF_ONLY)
+				setTriangularMatrixWithRandomData('U', data, rows, cols, lda, vectorLength, creationFlags, bound);
+			if (creationFlags & LOWER_HALF_ONLY)
+				setTriangularMatrixWithRandomData('L', data, rows, cols, lda, vectorLength, creationFlags, bound);
+		} else { // full matrix: random lower triangle, then either mirror it (SYMMETRIC_MATRIX) or randomise the upper triangle as well
+			setTriangularMatrixWithRandomData('L', data, rows, cols, lda, vectorLength, creationFlags, bound);
+			if (creationFlags & SYMMETRIC_MATRIX)
+			{
+				doTriangleOperation(LTOU, data, rows, cols, lda, vectorLength, creationFlags);
+			} else {
+				setTriangularMatrixWithRandomData('U', data, rows, cols, lda, vectorLength, creationFlags, bound);
+			}
+		}
+		if (creationFlags & UNIT_DIAGONAL)
+		{
+			setDiagonalUnity();
+		} else if (creationFlags & ZERO_DIAGONAL)
+		{
+			setDiagonalZero();
+		} else
+		{
+			setDiagonalRandom();
+		}
+
+	}
+}
+
+// Populates a matrix of real scalars (one scalar per element): derives the 'bound'
+// argument from the BLAS routine under test and the larger dimension, then calls doPopulate().
+template <typename T>
+static void populate(T* data, size_t rows, size_t cols, size_t lda, BlasRoutineID BlasFn, int creationFlags = 0)
+{
+	const cl_double ceiling = UPPER_BOUND<T>();
+	const cl_double largestDim = (cl_double)std::max( rows, cols);
+	const cl_double scaled = ((largestDim - 1)* ceiling) / (largestDim * largestDim);
+	cl_double limit = ceiling;	// passed on unchanged when the routine is not recognised
+
+	switch( BlasFn )
+	{
+		case CLBLAS_TRMV:
+			limit = sqrt( scaled );
+			break;
+
+		// Taking cube root because of Alpha factor- (alpha*X*Y)
+		case CLBLAS_SYMM: case CLBLAS_HER:  case CLBLAS_HER2:
+		case CLBLAS_HEMM: case CLBLAS_HERK: case CLBLAS_GER:
+			limit = pow( scaled, ((double)1/3) );
+			break;
+
+		default:
+			::std::cerr << "Invalid function ID sent to populate!" << ::std::endl;
+	}
+	doPopulate( data, rows, cols, lda, 1, limit, creationFlags);
+}
+
+// FloatComplex specialisation: derives the 'bound' argument per routine (TRMV takes half the
+// square root) and hands the buffer to doPopulate() as floats, two scalars per element.
+template<>
+__template_static void populate<FloatComplex>(FloatComplex* data, size_t rows, size_t cols, size_t lda, BlasRoutineID BlasFn, int creationFlags)
+{
+	const cl_double ceiling = UPPER_BOUND<FloatComplex>();
+	const cl_double largestDim = (cl_double)std::max( rows, cols);
+	const cl_double scaled = ((largestDim - 1)* ceiling) / (largestDim * largestDim);
+	cl_double limit = ceiling;	// passed on unchanged when the routine is not recognised
+
+	switch( BlasFn )
+	{
+		case CLBLAS_TRMV:
+			limit = sqrt( scaled ) / 2;
+			break;
+
+		// Taking cube root because of Alpha factor- (alpha*X*Y)
+		case CLBLAS_SYMM: case CLBLAS_HER:  case CLBLAS_HER2:
+		case CLBLAS_HEMM: case CLBLAS_HERK: case CLBLAS_GER:
+			limit = pow( scaled, ((double)1/3) );
+			break;
+
+		default:
+			::std::cerr << "Invalid function ID sent to populate!" << ::std::endl;
+	}
+	doPopulate( (float*)data, rows, cols, lda, 2, limit, creationFlags);
+}
+
+// DoubleComplex specialisation: derives the 'bound' argument per routine (TRMV takes half the
+// square root) and hands the buffer to doPopulate() as doubles, two scalars per element.
+template<>
+__template_static void populate<DoubleComplex>(DoubleComplex* data, size_t rows, size_t cols, size_t lda,  BlasRoutineID BlasFn, int creationFlags )
+{
+	const cl_double ceiling = UPPER_BOUND<DoubleComplex>();
+	const cl_double largestDim = (cl_double)std::max( rows, cols);
+	const cl_double scaled = ((largestDim - 1)* ceiling) / (largestDim * largestDim);
+	cl_double limit = ceiling;	// passed on unchanged when the routine is not recognised
+
+	switch( BlasFn )
+	{
+		case CLBLAS_TRMV:
+			limit = sqrt( scaled ) / 2;
+			break;
+
+		// Taking cube root because of Alpha factor- (alpha*X*Y)
+		case CLBLAS_SYMM: case CLBLAS_HER:  case CLBLAS_HER2: case CLBLAS_GER:
+		case CLBLAS_HEMM: case CLBLAS_HERK: case CLBLAS_SYR:
+			limit = pow( scaled, ((double)1/3) );
+			break;
+
+		default:
+			::std::cerr << "Invalid function ID sent to populate!" << ::std::endl;
+			break;
+	}
+	doPopulate( (double*)data, rows, cols, lda, 2, limit, creationFlags);
+}
+
+// Generic (real scalar) case: the element itself, converted to double.
+template <typename T> static double maxVal( T elem )
+{
+    return static_cast<double>(elem);
+}
+
+// FloatComplex case: the larger of CREAL(elem) and CIMAG(elem), compared as signed values.
+template <> __template_static double maxVal<FloatComplex>( FloatComplex elem )
+{
+    return static_cast<cl_double>( std::max( CREAL(elem), CIMAG(elem) ) );
+}
+
+// DoubleComplex case: the larger of CREAL(elem) and CIMAG(elem), compared as signed values.
+template <> __template_static double maxVal<DoubleComplex>( DoubleComplex elem )
+{
+    return static_cast<cl_double>( std::max( CREAL(elem), CIMAG(elem) ) );
+}
+
+
+#endif  // BLAS_RANDOM_H_
diff --git a/src/tests/include/blas-wrapper.h b/src/tests/include/blas-wrapper.h
new file mode 100644
index 0000000..04c2c10
--- /dev/null
+++ b/src/tests/include/blas-wrapper.h
@@ -0,0 +1,1987 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef BLAS_WRAPPER_H_
+#define BLAS_WRAPPER_H_
+
+#include <clBLAS.h>
+
+namespace clMath {
+
+class blas {
+public:
+
+    // GEMV wrappers
+    static void
+    gemv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        float alpha,
+        const float *A,
+        size_t lda,
+        const float *X,
+        int incx,
+        float beta,
+        float *Y,
+        int incy);
+
+    static void
+    gemv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        double alpha,
+        const double *A,
+        size_t lda,
+        const double *X,
+        int incx,
+        double beta,
+        double *Y,
+        int incy);
+
+    static void
+    gemv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const FloatComplex *A,
+        size_t lda,
+        const FloatComplex *X,
+        int incx,
+        FloatComplex beta,
+        FloatComplex *Y,
+        int incy);
+
+    static void
+    gemv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const DoubleComplex *A,
+        size_t lda,
+        const DoubleComplex *X,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex *Y,
+        int incy);
+
+    // SYMV wrappers
+    static void
+    symv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        const float *A,
+        size_t lda,
+        const float *X,
+        int incx,
+        float beta,
+        float *Y,
+        int incy);
+
+    static void
+    symv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double alpha,
+        const double *A,
+        size_t lda,
+        const double *X,
+        int incx,
+        double beta,
+        double *Y,
+        int incy);
+
+    // GEMM wrappers
+    static void
+    gemm(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        float alpha,
+        const float *A,
+        size_t lda,
+        const float *B,
+        size_t ldb,
+        float beta,
+        float *C,
+        size_t ldc);
+
+    static void
+    gemm(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        double alpha,
+        const double *A,
+        size_t lda,
+        const double *B,
+        size_t ldb,
+        double beta,
+        double *C,
+        size_t ldc);
+
+    static void
+    gemm(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        const FloatComplex *A,
+        size_t lda,
+        const FloatComplex *B,
+        size_t ldb,
+        FloatComplex beta,
+        FloatComplex *C,
+        size_t ldc);
+
+    static void
+    gemm(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        const DoubleComplex *A,
+        size_t lda,
+        const DoubleComplex *B,
+        size_t ldb,
+        DoubleComplex beta,
+        DoubleComplex *C,
+        size_t ldc);
+
+    // TRMM wrappers
+    static void
+    trmm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        float alpha,
+        const float *A,
+        size_t lda,
+        float *B,
+        size_t ldb);
+
+    static void
+    trmm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        double alpha,
+        const double *A,
+        size_t lda,
+        double *B,
+        size_t ldb);
+
+    static void
+    trmm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const FloatComplex *A,
+        size_t lda,
+        FloatComplex *B,
+        size_t ldb);
+
+    static void
+    trmm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const DoubleComplex *A,
+        size_t lda,
+        DoubleComplex *B,
+        size_t ldb);
+
+    // TRSM wrappers
+    static void
+    trsm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        float alpha,
+        const float *A,
+        size_t lda,
+        float *B,
+        size_t ldb);
+
+    static void
+    trsm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        double alpha,
+        const double *A,
+        size_t lda,
+        double *B,
+        size_t ldb);
+
+    static void
+    trsm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const FloatComplex *A,
+        size_t lda,
+        FloatComplex *B,
+        size_t ldb);
+
+    static void
+    trsm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const DoubleComplex *A,
+        size_t lda,
+        DoubleComplex *B,
+        size_t ldb);
+
+    // SYR2K wrappers
+    static void
+    syr2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        float alpha,
+        const float *A,
+        size_t lda,
+        const float *B,
+        size_t ldb,
+        float beta,
+        float *C,
+        size_t ldc);
+
+    static void
+    syr2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        double alpha,
+        const double *A,
+        size_t lda,
+        const double *B,
+        size_t ldb,
+        double beta,
+        double *C,
+        size_t ldc);
+
+    static void
+    syr2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        const FloatComplex *A,
+        size_t lda,
+        const FloatComplex *B,
+        size_t ldb,
+        FloatComplex beta,
+        FloatComplex *C,
+        size_t ldc);
+
+    static void
+    syr2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        const DoubleComplex *A,
+        size_t lda,
+        const DoubleComplex *B,
+        size_t ldb,
+        DoubleComplex beta,
+        DoubleComplex *C,
+        size_t ldc);
+
+    // SYRK wrappers
+    static void
+    syrk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        float alpha,
+        const float *A,
+        size_t lda,
+        float beta,
+        float *C,
+        size_t ldc);
+
+    static void
+    syrk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        double alpha,
+        const double *A,
+        size_t lda,
+        double beta,
+        double *C,
+        size_t ldc);
+
+    static void
+    syrk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        const FloatComplex *A,
+        size_t lda,
+        FloatComplex beta,
+        FloatComplex *C,
+        size_t ldc);
+
+    static void
+    syrk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        const DoubleComplex *A,
+        size_t lda,
+        DoubleComplex beta,
+        DoubleComplex *C,
+        size_t ldc);
+
+	static void
+    trmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+	    clblasDiag diag,
+        size_t N,
+        float *A,
+	    size_t offa,
+        size_t lda,
+        float *X,
+	    size_t offx,
+        int incx);
+
+	static void
+    trmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx);
+
+	static void
+    trmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+	static void
+    trmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *A,
+	    size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+	    size_t offx,
+        int incx);
+
+    //TPMV
+   static void
+    tpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *AP,
+        size_t offa,
+        float *X,
+        size_t offx,
+        int incx);
+
+    static void
+    tpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *AP,
+        size_t offa,
+        double *X,
+        size_t offx,
+        int incx);
+
+    static void
+    tpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *AP,
+        size_t offa,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+    static void
+    tpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *AP,
+        size_t offa,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+
+
+	 static void
+    trsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *A,
+	    size_t offa,
+        size_t lda,
+        float *X,
+	    size_t offx,
+        int incx);
+
+        static void
+    trsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *A,
+	    size_t offa,
+        size_t lda,
+        double *X,
+	    size_t offx,
+        int incx);
+
+        static void
+    trsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *A,
+	    size_t offa,
+        size_t lda,
+        FloatComplex *X,
+	    size_t offx,
+        int incx);
+
+        static void
+     trsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *A,
+	    size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+	    size_t offx,
+        int incx);
+
+static void
+    tpsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        float *A,
+        size_t offa,
+        float *X,
+        size_t offx,
+        int incx);
+
+static void
+    tpsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        double *A,
+        size_t offa,
+        double *X,
+        size_t offx,
+        int incx);
+
+static void
+    tpsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        FloatComplex *A,
+        size_t offa,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+static void
+    tpsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t N,
+        DoubleComplex *A,
+        size_t offa,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+static void
+	symm(
+		clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        float alpha,
+        float* A,
+	    size_t offa,
+        size_t lda,
+        float* B,
+	    size_t offb,
+        size_t ldb,
+        float beta,
+        float* C,
+	    size_t offc,
+        size_t ldc);
+
+static void
+    symm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        double alpha,
+        double* A,
+	    size_t offa,
+        size_t lda,
+        double* B,
+	    size_t offb,
+        size_t ldb,
+        double beta,
+        double* C,
+	    size_t offc,
+        size_t ldc);
+
+static void
+    symm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* A,
+	    size_t offa,
+        size_t lda,
+        FloatComplex* B,
+	    size_t offb,
+        size_t ldb,
+        FloatComplex  beta,
+        FloatComplex* C,
+	    size_t offc,
+        size_t ldc);
+
+static void
+    symm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* A,
+	    size_t offa,
+        size_t lda,
+        DoubleComplex* B,
+	    size_t offb,
+        size_t ldb,
+        DoubleComplex beta,
+        DoubleComplex* C,
+	    size_t offc,
+        size_t ldc);
+
+
+static void
+   ger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        float alpha,
+        float* x,
+        size_t offx,
+        int incx,
+        float* y,
+        size_t offy,
+        int incy,
+        float* A ,
+        size_t offa,
+        size_t lda);
+
+static void
+    ger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        double alpha,
+        double* x,
+        size_t offx,
+        int incx,
+        double* y,
+        size_t offy,
+        int incy,
+        double* A,
+        size_t offa,
+        size_t lda);
+
+static void
+   ger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+         FloatComplex* x,
+        size_t offx,
+        int incx,
+        FloatComplex* y,
+        size_t offy,
+        int incy,
+        FloatComplex* A ,
+        size_t offa,
+        size_t lda);
+
+static void
+    ger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* x,
+        size_t offx,
+        int incx,
+        DoubleComplex* y,
+        size_t offy,
+        int incy,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda);
+
+static void
+   gerc(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* x,
+        size_t offx,
+        int incx,
+        FloatComplex* y,
+        size_t offy,
+        int incy,
+        FloatComplex* A ,
+        size_t offa,
+        size_t lda);
+
+static void
+    gerc(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* x,
+        size_t offx,
+        int incx,
+        DoubleComplex* y,
+        size_t offy,
+        int incy,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda);
+
+
+//HER wrappers
+
+static void
+   her(
+        clblasOrder order,
+	    clblasUplo uplo,
+        size_t N,
+        float alpha,
+        FloatComplex* x,
+        size_t offx,
+        int incx,
+        FloatComplex* A ,
+        size_t offa,
+        size_t lda);
+
+static void
+    her(
+        clblasOrder order,
+	    clblasUplo uplo,
+        size_t N,
+        double alpha,
+        DoubleComplex* x,
+        size_t offx,
+        int incx,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda);
+
+// SYR wrappers
+static void
+    syr(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        float* X,
+        size_t offx,
+        int incx,
+        float* A,
+        size_t offa,
+        size_t lda);
+
+	static void
+    syr(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double Alpha,
+        double* X,
+        size_t offx,
+        int incx,
+        double* A,
+        size_t offa,
+        size_t lda);
+
+//SPR
+
+static void
+    spr(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        float* X,
+        size_t offx,
+        int incx,
+        float* AP,
+        size_t offa);
+
+	static void
+    spr(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double Alpha,
+        double* X,
+        size_t offx,
+        int incx,
+        double* AP,
+        size_t offa);
+
+
+	static void
+    syr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        float* X,
+        size_t offx,
+        int incx,
+        float* Y,
+        size_t offy,
+        int incy,
+        float* A,
+        size_t offa,
+        size_t lda);
+
+	static void
+    syr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double Alpha,
+        double* X,
+        size_t offx,
+        int incx,
+        double* Y,
+        size_t offy,
+        int incy,
+		double* A,
+        size_t offa,
+        size_t lda);
+
+//HER2
+ static void
+    her2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex* Y,
+        size_t offy,
+        int incy,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda);
+
+    static void
+    her2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda);
+
+
+static void
+    hemv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        FloatComplex* Y,
+        size_t offy,
+        int incy);
+
+static void
+    hemv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy);
+
+//HEMM
+static void
+    hemm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* A,
+        size_t offa,
+        size_t lda,
+        FloatComplex* B,
+        size_t offb,
+        size_t ldb,
+        FloatComplex  beta,
+        FloatComplex* C,
+        size_t offc,
+        size_t ldc);
+
+static void
+    hemm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex* B,
+        size_t offb,
+        size_t ldb,
+        DoubleComplex beta,
+        DoubleComplex* C,
+        size_t offc,
+        size_t ldc);
+
+// HERK wrappers
+static void
+    herk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        float alpha,
+        const FloatComplex *A,
+        size_t lda,
+        float beta,
+        FloatComplex *C,
+        size_t ldc);
+
+static void
+    herk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        double alpha,
+        const DoubleComplex *A,
+        size_t lda,
+        double beta,
+        DoubleComplex *C,
+        size_t ldc);
+
+// SPMV wrappers
+    static void
+    spmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        const float *A,
+        size_t offa,
+        const float *X,
+        size_t offx,
+        int incx,
+        float beta,
+        float *Y,
+        size_t offy,
+        int incy);
+
+    static void
+    spmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double alpha,
+        const double *A,
+        size_t offa,
+        const double *X,
+        size_t offx,
+        int incx,
+        double beta,
+        double *Y,
+        size_t offy,
+        int incy);
+
+static void
+    hpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* A,
+        size_t offa,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        FloatComplex* Y,
+        size_t offy,
+        int incy);
+
+static void
+    hpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* A,
+        size_t offa,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy);
+
+// HPR wrappers: float alpha pairs with FloatComplex x/AP, double alpha pairs with DoubleComplex x/AP
+static void  // hpr, float alpha / FloatComplex data
+   hpr(
+        clblasOrder order,
+	    clblasUplo uplo,
+        size_t N,
+        float alpha,
+        FloatComplex* x,
+        size_t offx,
+        int incx,
+        FloatComplex* AP ,
+        size_t offa);
+
+static void  // hpr, double alpha / DoubleComplex data
+    hpr(
+        clblasOrder order,
+	    clblasUplo uplo,
+        size_t N,
+        double alpha,
+        DoubleComplex* x,
+        size_t offx,
+        int incx,
+        DoubleComplex* AP,
+        size_t offa);
+
+static void  // spr2, float overload: X, Y and AP each come with their own offset
+    spr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        float* X,
+        size_t offx,
+        int incx,
+        float* Y,
+        size_t offy,
+        int incy,
+        float* AP,
+        size_t offa);
+
+	static void  // spr2, double overload (the scalar is spelled "Alpha" in this declaration)
+    spr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double Alpha,
+        double* X,
+        size_t offx,
+        int incx,
+        double* Y,
+        size_t offy,
+        int incy,
+		double* AP,
+        size_t offa);
+
+ static void  // hpr2, FloatComplex overload: same parameter layout as spr2 with a FloatComplex alpha
+    hpr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex* Y,
+        size_t offy,
+        int incy,
+        FloatComplex* AP,
+        size_t offa);
+
+    static void  // hpr2, DoubleComplex overload
+    hpr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy,
+        DoubleComplex* AP,
+        size_t offa);
+
+    // GBMV wrappers: float, double, FloatComplex and DoubleComplex overloads; KL and KU are passed as size_t
+    static void  // gbmv, float overload
+    gbmv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        size_t KL,
+        size_t KU,
+        float alpha,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx,
+        float beta,
+        float *Y,
+        size_t offy,
+        int incy);
+
+    static void  // gbmv, double overload
+    gbmv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        size_t KL,
+        size_t KU,
+        double alpha,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx,
+        double beta,
+        double *Y,
+        size_t offy,
+        int incy);
+
+    static void  // gbmv, FloatComplex overload
+    gbmv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        size_t KL,
+        size_t KU,
+        FloatComplex alpha,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+
+	static void  // gbmv, DoubleComplex overload
+    gbmv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        size_t KL,
+        size_t KU,
+        DoubleComplex alpha,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+// TBMV wrappers: one overload per element type of A and X; there are no scalar arguments
+
+static void  // tbmv, float overload
+    tbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx);
+
+static void  // tbmv, double overload
+    tbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx);
+
+static void  // tbmv, FloatComplex overload
+    tbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+static void  // tbmv, DoubleComplex overload
+    tbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+// SBMV wrappers: float and double overloads; alpha and beta share the element type of A, X and Y
+
+static void  // sbmv, float overload
+    sbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        float alpha,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx,
+        float beta,
+        float *Y,
+        size_t offy,
+        int incy);
+
+static void  // sbmv, double overload
+    sbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        double alpha,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx,
+        double beta,
+        double *Y,
+        size_t offy,
+        int incy);
+
+// HBMV wrappers: FloatComplex and DoubleComplex overloads with the same parameter layout as sbmv
+static void  // hbmv, FloatComplex overload
+    hbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+
+static void  // hbmv, DoubleComplex overload
+    hbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+// TBSV wrappers: same parameter list as tbmv, one overload per element type of A and X
+
+static void  // tbsv, float overload
+    tbsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        float *A,
+        size_t offa,
+        size_t lda,
+        float *X,
+        size_t offx,
+        int incx);
+
+static void  // tbsv, double overload
+    tbsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        double *A,
+        size_t offa,
+        size_t lda,
+        double *X,
+        size_t offx,
+        int incx);
+
+static void  // tbsv, FloatComplex overload
+    tbsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+static void  // tbsv, DoubleComplex overload
+    tbsv(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+static void  // her2k, FloatComplex overload: FloatComplex alpha but float beta; A and B are const
+    her2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        const FloatComplex *A,
+        size_t offa,
+        size_t lda,
+        const FloatComplex *B,
+        size_t offb,
+        size_t ldb,
+        float beta,
+        FloatComplex *C,
+        size_t offc,
+        size_t ldc);
+
+static void  // her2k, DoubleComplex overload: DoubleComplex alpha but double beta
+    her2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        const DoubleComplex *A,
+        size_t offa,
+        size_t lda,
+        const DoubleComplex *B,
+        size_t offb,
+        size_t ldb,
+        double beta,
+        DoubleComplex *C,
+        size_t offc,
+        size_t ldc);
+
+// COPY wrappers: one overload per element type; both X and Y are declared as non-const pointers
+
+static void  // copy, float overload
+    copy(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx,
+        float *Y,
+        size_t offy,
+        int incy);
+
+static void  // copy, double overload
+    copy(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx,
+        double *Y,
+        size_t offy,
+        int incy);
+
+static void  // copy, FloatComplex overload
+    copy(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+static void  // copy, DoubleComplex overload
+    copy(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+// DOT wrappers: the return type matches the element type of X and Y in every overload
+
+static float  // dot, float overload
+    dot(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx,
+        float *Y,
+        size_t offy,
+        int incy);
+
+static double  // dot, double overload
+    dot(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx,
+        double *Y,
+        size_t offy,
+        int incy);
+
+static FloatComplex  // dot, FloatComplex overload
+    dot(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+
+static DoubleComplex  // dot, DoubleComplex overload
+    dot(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+// ASUM wrappers: FloatComplex input returns float and DoubleComplex input returns double
+
+static float  // asum, float input
+    asum(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx);
+
+static double  // asum, double input
+    asum(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx);
+
+static float  // asum, FloatComplex input; result is a float
+    asum(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+static double  // asum, DoubleComplex input; result is a double
+    asum(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+
+static FloatComplex  // dotc, FloatComplex overload; same parameter list as the FloatComplex dot
+    dotc(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx,
+        FloatComplex *Y,
+        size_t offy,
+        int incy);
+
+static DoubleComplex  // dotc, DoubleComplex overload
+    dotc(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx,
+        DoubleComplex *Y,
+        size_t offy,
+        int incy);
+
+
+ // SWAP wrappers: the offsets for X and Y are named offa/offb here rather than offx/offy
+    static void  // swap, float overload
+    swap(
+        size_t N,
+        float *X,
+        size_t offa,
+		int incx,
+        float *Y,
+		size_t offb,
+        int incy);
+
+	static void  // swap, double overload
+    swap(
+        size_t N,
+        double *X,
+        size_t offa,
+		int incx,
+        double *Y,
+		size_t offb,
+        int incy);
+
+	static void  // swap, FloatComplex overload
+    swap(
+        size_t N,
+        FloatComplex *X,
+        size_t offa,
+		int incx,
+        FloatComplex *Y,
+		size_t offb,
+        int incy);
+
+ 	static void  // swap, DoubleComplex overload
+    swap(
+        size_t N,
+        DoubleComplex *X,
+        size_t offa,
+		int incx,
+        DoubleComplex *Y,
+		size_t offb,
+        int incy);
+
+// SCAL wrappers: every overload takes a leading is_css_zds flag (presumably selecting CSSCAL/ZDSCAL; confirm in the implementation)
+static void scal(  // float overload
+        bool is_css_zds,
+        size_t N,
+        float alpha,
+        float *X,
+        size_t offx,
+        int incx);
+
+static void scal(  // double overload
+        bool is_css_zds,
+        size_t N,
+        double alpha,
+        double *X,
+        size_t offx,
+        int incx);
+
+static void scal(  // FloatComplex overload: alpha is FloatComplex as well
+        bool is_css_zds,
+        size_t N,
+        FloatComplex alpha,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+static void scal(  // DoubleComplex overload: alpha is DoubleComplex as well
+        bool is_css_zds,
+        size_t N,
+        DoubleComplex alpha,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+// AXPY wrappers: X is const-qualified and Y is not; the offsets are named offBX and offCY
+static void  // axpy, float overload
+	axpy(
+		size_t N,
+        float alpha,
+		const float * X,
+		size_t offBX,
+		int incx,
+		float *Y,
+		size_t offCY,
+		int incy);
+
+ static void  // axpy, double overload
+	axpy(
+		size_t N,
+        double alpha,
+		const double *X,
+		size_t offBX,
+		int incx,
+		double *Y,
+		size_t offCY,
+		int incy);
+
+ static void  // axpy, FloatComplex overload
+	axpy(
+		size_t N,
+        FloatComplex alpha,
+		const FloatComplex *X,
+		size_t offBX,
+		int incx,
+		FloatComplex *Y,
+		size_t offCY,
+		int incy);
+
+ static void  // axpy, DoubleComplex overload
+	axpy(
+		size_t N,
+        DoubleComplex alpha,
+		const DoubleComplex *X,
+		size_t offBX,
+		int incx,
+		DoubleComplex *Y,
+		size_t offCY,
+		int incy);
+
+static void rotmg(  // float overload: only Y1 is const; every pointer is paired with its own offset
+        float* D1,
+        size_t offD1,
+        float* D2,
+        size_t offD2,
+        float* X1,
+        size_t offX1,
+        const float* Y1,
+        size_t offY1,
+        float* PARAM,
+        size_t offParam);
+
+static void rotmg(  // double overload
+        double* D1,
+        size_t offD1,
+        double* D2,
+        size_t offD2,
+        double* X1,
+        size_t offX1,
+        const double* Y1,
+        size_t offY1,
+        double* PARAM,
+        size_t offParam);
+
+static void rotm(  // float overload: PARAM is a non-const float pointer with its own offParam
+        size_t N,
+        float* X,
+        size_t offx,
+        int incx,
+        float* Y,
+        size_t offy,
+        int incy,
+        float* PARAM,
+        size_t offParam);
+
+static void rotm(  // double overload
+        size_t N,
+        double* X,
+        size_t offx,
+        int incx,
+        double* Y,
+        size_t offy,
+        int incy,
+        double* PARAM,
+        size_t offParam);
+
+static void rotg(  // float overload: SA, SB, C and S are all float pointers
+        float* SA,
+        size_t offSA,
+        float* SB,
+        size_t offSB,
+        float* C,
+        size_t offC,
+        float* S,
+        size_t offS);
+
+static void rotg(  // double overload
+        double* SA,
+        size_t offSA,
+        double* SB,
+        size_t offSB,
+        double* C,
+        size_t offC,
+        double* S,
+        size_t offS);
+
+static void rotg(  // FloatComplex overload: C stays a float pointer while SA, SB and S are FloatComplex
+        FloatComplex* SA,
+        size_t offSA,
+        FloatComplex* SB,
+        size_t offSB,
+        float* C,
+        size_t offC,
+        FloatComplex* S,
+        size_t offS);
+
+static void rotg(  // DoubleComplex overload: C stays a double pointer while SA, SB and S are DoubleComplex
+        DoubleComplex* SA,
+        size_t offSA,
+        DoubleComplex* SB,
+        size_t offSB,
+        double* C,
+        size_t offC,
+        DoubleComplex* S,
+        size_t offS);
+
+static void rot(  // float overload: C and S are passed by value
+        size_t N,
+        float* X,
+        size_t offx,
+        int incx,
+        float* Y,
+        size_t offy,
+        int incy,
+        float C,
+        float S);
+
+static void rot(  // double overload
+        size_t N,
+        double* X,
+        size_t offx,
+        int incx,
+        double* Y,
+        size_t offy,
+        int incy,
+        double C,
+        double S);
+
+static void rot(  // FloatComplex overload: C and S are declared FloatComplex here, not float
+        size_t N,
+        FloatComplex* X,
+        size_t offx,
+        int incx,
+        FloatComplex* Y,
+        size_t offy,
+        int incy,
+        FloatComplex C,
+        FloatComplex S);
+
+static void rot(  // DoubleComplex overload: C and S are declared DoubleComplex here, not double
+        size_t N,
+        DoubleComplex* X,
+        size_t offx,
+        int incx,
+        DoubleComplex* Y,
+        size_t offy,
+        int incy,
+        DoubleComplex C,
+        DoubleComplex S);
+
+static int  // iamax, float overload; every overload returns int
+    iamax(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx);
+
+static int  // iamax, double overload
+    iamax(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx);
+
+static int  // iamax, FloatComplex overload
+    iamax(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+static int  // iamax, DoubleComplex overload
+    iamax(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+static float  // nrm2, float input
+    nrm2(
+        size_t N,
+        float *X,
+        size_t offx,
+        int incx);
+
+static double  // nrm2, double input
+    nrm2(
+        size_t N,
+        double *X,
+        size_t offx,
+        int incx);
+
+static float  // nrm2, FloatComplex input; result is a float
+    nrm2(
+        size_t N,
+        FloatComplex *X,
+        size_t offx,
+        int incx);
+
+static double  // nrm2, DoubleComplex input; result is a double
+    nrm2(
+        size_t N,
+        DoubleComplex *X,
+        size_t offx,
+        int incx);
+
+
+};// class blas
+
+}   // namespace clMath;
+
+#endif  // BLAS_WRAPPER_H_
diff --git a/src/tests/include/clBLAS-wrapper.h b/src/tests/include/clBLAS-wrapper.h
new file mode 100644
index 0000000..9da0114
--- /dev/null
+++ b/src/tests/include/clBLAS-wrapper.h
@@ -0,0 +1,2235 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef CLBLAS_WRAPPER_H_
+#define CLBLAS_WRAPPER_H_
+
+#include <clBLAS.h>
+#include <cltypes.h>
+
+namespace clMath {
+
+class clblas {
+public:
+    // GEMV wrappers: overloads differ only in the type of alpha and beta; A, X and Y are cl_mem handles
+    static clblasStatus  // gemv, float alpha/beta
+    gemv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        float alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        float beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemv, double alpha/beta
+    gemv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        double alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        double beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemv, FloatComplex alpha/beta
+    gemv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemv, DoubleComplex alpha/beta
+    gemv(
+        clblasOrder order,
+        clblasTranspose transA,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    // SYMV wrappers: float and double overloads; A and X are const cl_mem, Y is a plain cl_mem
+    static clblasStatus  // symv, float alpha/beta
+    symv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        float alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        float beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // symv, double alpha/beta
+    symv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        double alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        double beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    // GEMM wrappers: overloads differ only in the type of alpha and beta; A, B and C are cl_mem handles
+    static clblasStatus  // gemm, float alpha/beta
+    gemm(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        float alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        float beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemm, double alpha/beta
+    gemm(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        double alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        double beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemm, FloatComplex alpha/beta
+    gemm(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        FloatComplex beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemm, DoubleComplex alpha/beta
+    gemm(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        DoubleComplex beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemm2, float alpha/beta; parameter list is identical to gemm (how it differs is not visible in this header)
+    gemm2(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        float alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        float beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemm2, double alpha/beta
+    gemm2(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        double alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        double beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemm2, FloatComplex alpha/beta
+    gemm2(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        FloatComplex beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // gemm2, DoubleComplex alpha/beta
+    gemm2(
+        clblasOrder order,
+        clblasTranspose transA,
+        clblasTranspose transB,
+        size_t M,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        DoubleComplex beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    // TRMM wrappers: overloads differ only in the type of alpha; A is a const cl_mem, B is a plain cl_mem
+    static clblasStatus  // trmm, float alpha
+    trmm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        float alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        cl_mem B,
+        size_t offB,
+        size_t ldb,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // trmm, double alpha
+    trmm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        double alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        cl_mem B,
+        size_t offB,
+        size_t ldb,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // trmm, FloatComplex alpha
+    trmm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        cl_mem B,
+        size_t offB,
+        size_t ldb,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // trmm, DoubleComplex alpha
+    trmm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        cl_mem B,
+        size_t offB,
+        size_t ldb,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    // TRSM wrappers: same parameter list as trmm; overloads differ only in the type of alpha
+    static clblasStatus  // trsm, float alpha
+    trsm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        float alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        cl_mem B,
+        size_t offB,
+        size_t ldb,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // trsm, double alpha
+    trsm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        double alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        cl_mem B,
+        size_t offB,
+        size_t ldb,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // trsm, FloatComplex alpha
+    trsm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        cl_mem B,
+        size_t offB,
+        size_t ldb,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // trsm, DoubleComplex alpha
+    trsm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        clblasDiag diag,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        cl_mem B,
+        size_t offB,
+        size_t ldb,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    // SYR2K wrappers: a single transAB argument covers both A and B; overloads differ in the type of alpha and beta
+    static clblasStatus  // syr2k, float alpha/beta
+    syr2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transAB,
+        size_t N,
+        size_t K,
+        float alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        float beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // syr2k, double alpha/beta
+    syr2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transAB,
+        size_t N,
+        size_t K,
+        double alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        double beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // syr2k, FloatComplex alpha/beta
+    syr2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transAB,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        FloatComplex beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // syr2k, DoubleComplex alpha/beta
+    syr2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transAB,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        const cl_mem B,
+        size_t offB,
+        size_t ldb,
+        DoubleComplex beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    // SYRK wrappers
+    static clblasStatus  // float alpha/beta overload; presumably BLAS SYRK (symmetric rank-k update) -- TODO confirm
+    syrk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        float alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        float beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // double alpha/beta overload; presumably BLAS SYRK (symmetric rank-k update) -- TODO confirm
+    syrk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        double alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        double beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // FloatComplex alpha/beta overload; presumably BLAS SYRK (symmetric rank-k update) -- TODO confirm
+    syrk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        FloatComplex beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // DoubleComplex alpha/beta overload; presumably BLAS SYRK (symmetric rank-k update) -- TODO confirm
+    syrk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        DoubleComplex beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+    static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS TRMV (triangular matrix-vector product) -- TODO confirm
+    trmv(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        const cl_mem A,
+        size_t offa,  // was misspelled "ffa"; renamed to match the offa parameter of trsv/tpsv
+        size_t lda,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem scratchBuff,  // caller-supplied buffer; presumably temporary workspace -- confirm required size
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+	static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS TRSV (triangular solve) -- TODO confirm
+    trsv(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS TPSV (packed triangular solve) -- TODO confirm
+    tpsv(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        const cl_mem A,
+        size_t offa,  // no lda parameter follows, unlike trsv
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+	static clblasStatus  // float alpha/beta overload; presumably BLAS SYMM (symmetric matrix-matrix product) -- TODO confirm
+	symm(
+		clblasOrder order,
+	    clblasSide side,
+    	clblasUplo uplo,
+	    size_t M,
+	    size_t N,
+	    float alpha,
+	    const cl_mem A,
+	    size_t offa,
+	    size_t lda,
+	    const cl_mem B,
+	    size_t offb,
+	    size_t ldb,
+	    float beta,
+    	cl_mem C,
+	    size_t offc,
+	    size_t ldc,
+	    cl_uint numCommandQueues,
+	    cl_command_queue *commandQueues,
+	    cl_uint numEventsInWaitList,
+	    const cl_event *eventWaitList,
+    	cl_event *events);
+
+	static clblasStatus  // double alpha/beta overload; presumably BLAS SYMM (symmetric matrix-matrix product) -- TODO confirm
+    symm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        double alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem B,
+        size_t offb,
+        size_t ldb,
+        double beta,
+        cl_mem C,
+        size_t offc,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+	static clblasStatus  // FloatComplex alpha/beta overload; presumably BLAS SYMM (symmetric matrix-matrix product) -- TODO confirm
+    symm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem B,
+        size_t offb,
+        size_t ldb,
+        FloatComplex beta,
+        cl_mem C,
+        size_t offc,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+	static clblasStatus  // DoubleComplex alpha/beta overload; presumably BLAS SYMM (symmetric matrix-matrix product) -- TODO confirm
+    symm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem B,
+        size_t offb,
+        size_t ldb,
+        DoubleComplex beta,
+        cl_mem C,
+        size_t offc,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+	static clblasStatus  // float alpha overload; presumably BLAS SYR (symmetric rank-1 update) -- TODO confirm
+	syr(
+		clblasOrder order,
+		clblasUplo uplo,
+		size_t N,
+		float alpha,
+		const cl_mem X,
+		size_t offx,
+		int incx,
+		cl_mem A,
+		size_t offa,
+		size_t lda,
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+static clblasStatus  // double alpha overload; presumably BLAS SYR (symmetric rank-1 update) -- TODO confirm
+	syr(
+		clblasOrder order,
+		clblasUplo uplo,
+		size_t N,
+		double alpha,
+		const cl_mem X,
+		size_t offx,
+		int incx,
+		cl_mem A,
+		size_t offa,
+		size_t lda,
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+
+static clblasStatus  // float alpha overload; presumably BLAS GER (general rank-1 update) -- TODO confirm
+        ger(
+            clblasOrder order,
+            size_t M,
+            size_t N,
+            float alpha,
+            const cl_mem X,
+            size_t offx,
+            int incx,
+            const cl_mem Y,
+            size_t offy,
+            int incy,
+            cl_mem A,
+            size_t offa,
+            size_t lda,
+            cl_uint numCommandQueues,
+            cl_command_queue *commandQueues,
+            cl_uint numEventsInWaitList,
+            const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // double alpha overload; presumably BLAS GER (general rank-1 update) -- TODO confirm
+    ger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        double alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+	cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+static clblasStatus  // FloatComplex alpha overload; presumably the unconjugated complex rank-1 update (BLAS GERU) -- TODO confirm
+    ger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+static clblasStatus  // DoubleComplex alpha overload; presumably the unconjugated complex rank-1 update (BLAS GERU) -- TODO confirm
+    ger(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // FloatComplex alpha overload; presumably BLAS GERC (conjugated rank-1 update) -- TODO confirm
+    gerc(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+static clblasStatus  // DoubleComplex alpha overload; presumably BLAS GERC (conjugated rank-1 update) -- TODO confirm
+    gerc(
+        clblasOrder order,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // float alpha overload; presumably BLAS HER (Hermitian rank-1 update) -- TODO confirm
+    her(
+        clblasOrder order,
+	clblasUplo uplo,
+        size_t N,
+        float alpha,  // real scalar; it is the only argument whose type differs between the two her overloads
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+static clblasStatus  // double alpha overload; presumably BLAS HER (Hermitian rank-1 update) -- TODO confirm
+    her(
+        clblasOrder order,
+	clblasUplo uplo,
+        size_t N,
+        double alpha,  // real scalar; it is the only argument whose type differs between the two her overloads
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+static clblasStatus  // float alpha overload; presumably BLAS SYR2 (symmetric rank-2 update) -- TODO confirm
+	syr2(
+		clblasOrder order,
+		clblasUplo uplo,
+		size_t N,
+		float alpha,
+		const cl_mem X,
+		size_t offx,
+		int incx,
+		const cl_mem Y,
+		size_t offy,
+		int incy,
+		cl_mem A,
+		size_t offa,
+		size_t lda,
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+static clblasStatus  // double alpha overload; presumably BLAS SYR2 (symmetric rank-2 update) -- TODO confirm
+	syr2(
+		clblasOrder order,
+		clblasUplo uplo,
+		size_t N,
+		double alpha,
+		const cl_mem X,
+		size_t offx,
+		int incx,
+		const cl_mem Y,
+		size_t offy,
+		int incy,
+		cl_mem A,
+		size_t offa,
+		size_t lda,
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+//HER2 wrappers
+ static clblasStatus  // FloatComplex alpha overload; presumably BLAS HER2 (Hermitian rank-2 update) -- TODO confirm
+    her2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // DoubleComplex alpha overload; presumably BLAS HER2 (Hermitian rank-2 update) -- TODO confirm
+    her2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // FloatComplex alpha/beta overload; presumably BLAS HEMV (Hermitian matrix-vector product) -- TODO confirm
+        hemv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offa,
+            size_t lda,
+            const cl_mem X,
+            size_t offx,
+            int incx,
+        FloatComplex beta,
+            cl_mem Y,
+        size_t offy,
+            int incy,
+        cl_uint numCommandQueues,
+            cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+            const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // DoubleComplex alpha/beta overload; presumably BLAS HEMV (Hermitian matrix-vector product) -- TODO confirm
+    hemv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+//HEMM
+static clblasStatus  // FloatComplex alpha/beta overload; presumably BLAS HEMM (Hermitian matrix-matrix product) -- TODO confirm
+    hemm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem B,
+        size_t offb,
+        size_t ldb,
+        FloatComplex beta,
+        cl_mem C,
+        size_t offc,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // DoubleComplex alpha/beta overload; presumably BLAS HEMM (Hermitian matrix-matrix product) -- TODO confirm
+    hemm(
+        clblasOrder order,
+        clblasSide side,
+        clblasUplo uplo,
+        size_t M,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem B,
+        size_t offb,
+        size_t ldb,
+        DoubleComplex beta,
+        cl_mem C,
+        size_t offc,
+        size_t ldc,
+        cl_uint numCommandQueues,
+	cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+// HERK wrappers
+ static clblasStatus  // float alpha/beta overload; presumably BLAS HERK (Hermitian rank-k update) -- TODO confirm
+    herk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        float alpha,  // both scalars are declared real in this routine
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        float beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ static clblasStatus  // double alpha/beta overload; presumably BLAS HERK (Hermitian rank-k update) -- TODO confirm
+    herk(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        double alpha,  // both scalars are declared real in this routine
+        const cl_mem A,
+        size_t offA,
+        size_t lda,
+        double beta,
+        cl_mem C,
+        size_t offC,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+// TPMV wrappers
+static  clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS TPMV (packed triangular matrix-vector product) -- TODO confirm
+	tpmv(
+		DataType type,  // presumably selects the element precision -- confirm in the definition
+		clblasOrder order,
+		clblasUplo uplo,
+		clblasTranspose trans,
+		clblasDiag diag,
+		size_t N,
+		const cl_mem AP,
+		size_t offa,
+		cl_mem X,
+		size_t offx,
+		int incx,
+		cl_mem scratchBuff,  // caller-supplied buffer; presumably temporary workspace -- confirm required size
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueues,
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+// SPMV wrappers
+    static clblasStatus  // cl_float alpha/beta overload; presumably BLAS SPMV (packed symmetric matrix-vector product) -- TODO confirm
+    spmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        cl_float alpha,
+        const cl_mem AP,
+        size_t offa,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_float beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+    static clblasStatus  // cl_double alpha/beta overload; presumably BLAS SPMV (packed symmetric matrix-vector product) -- TODO confirm
+    spmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        cl_double alpha,
+        const cl_mem AP,
+        size_t offa,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_double beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+// HPMV wrappers
+static clblasStatus  // FloatComplex alpha/beta overload; presumably BLAS HPMV (packed Hermitian matrix-vector product) -- TODO confirm
+    hpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem AP,
+        size_t offa,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        FloatComplex beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // DoubleComplex alpha/beta overload; presumably BLAS HPMV (packed Hermitian matrix-vector product) -- TODO confirm
+    hpmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem AP,
+        size_t offa,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        DoubleComplex beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+// SPR wrappers
+static clblasStatus  // float alpha overload; presumably BLAS SPR (packed symmetric rank-1 update) -- TODO confirm
+	spr(
+		clblasOrder order,
+		clblasUplo uplo,
+		size_t N,
+		float alpha,
+		const cl_mem X,
+		size_t offx,
+		int incx,
+		cl_mem AP,
+		size_t offa,
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+static clblasStatus  // double alpha overload; presumably BLAS SPR (packed symmetric rank-1 update) -- TODO confirm
+	spr(
+		clblasOrder order,
+		clblasUplo uplo,
+		size_t N,
+		double alpha,
+		const cl_mem X,
+		size_t offx,
+		int incx,
+		cl_mem AP,
+		size_t offa,
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+// HPR wrappers
+static clblasStatus  // float alpha overload; presumably BLAS HPR (packed Hermitian rank-1 update) -- TODO confirm
+    hpr(
+        clblasOrder order,
+	    clblasUplo uplo,
+        size_t N,
+        float alpha,  // real scalar; it is the only argument whose type differs between the two hpr overloads
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem AP,
+        size_t offa,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+static clblasStatus  // double alpha overload; presumably BLAS HPR (packed Hermitian rank-1 update) -- TODO confirm
+    hpr(
+        clblasOrder order,
+	    clblasUplo uplo,
+        size_t N,
+        double alpha,  // real scalar; it is the only argument whose type differs between the two hpr overloads
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem AP,
+        size_t offa,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+// SPR2 wrappers
+static clblasStatus  // float alpha overload; presumably BLAS SPR2 (packed symmetric rank-2 update) -- TODO confirm
+	spr2(
+		clblasOrder order,
+		clblasUplo uplo,
+		size_t N,
+		float alpha,
+		const cl_mem X,
+		size_t offx,
+		int incx,
+		const cl_mem Y,
+		size_t offy,
+		int incy,
+		cl_mem AP,
+		size_t offa,
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+static clblasStatus  // double alpha overload; presumably BLAS SPR2 (packed symmetric rank-2 update) -- TODO confirm
+	spr2(
+		clblasOrder order,
+		clblasUplo uplo,
+		size_t N,
+		double alpha,
+		const cl_mem X,
+		size_t offx,
+		int incx,
+		const cl_mem Y,
+		size_t offy,
+		int incy,
+		cl_mem AP,
+		size_t offa,
+		cl_uint numCommandQueues,
+		cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+		cl_uint numEventsInWaitList,
+		const cl_event *eventWaitList,
+		cl_event *events);
+
+//HPR2 wrappers
+ static clblasStatus  // FloatComplex alpha overload; presumably BLAS HPR2 (packed Hermitian rank-2 update) -- TODO confirm
+    hpr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        FloatComplex alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem AP,
+        size_t offa,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // DoubleComplex alpha overload; presumably BLAS HPR2 (packed Hermitian rank-2 update) -- TODO confirm
+    hpr2(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        DoubleComplex alpha,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        const cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem AP,
+        size_t offa,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueue,  // NOTE(review): singular name here; most sibling declarations spell it commandQueues
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // cl_float alpha/beta overload; presumably BLAS GBMV (general banded matrix-vector product) -- TODO confirm
+    gbmv(
+        clblasOrder order,
+        clblasTranspose trans,
+        size_t M,
+        size_t N,
+        size_t KL,  // presumably the sub-diagonal count -- confirm
+        size_t KU,  // presumably the super-diagonal count -- confirm
+        cl_float alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_float beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // cl_double alpha/beta overload; presumably BLAS GBMV (general banded matrix-vector product) -- TODO confirm
+    gbmv(
+        clblasOrder order,
+        clblasTranspose trans,
+        size_t M,
+        size_t N,
+        size_t KL,  // presumably the sub-diagonal count -- confirm
+        size_t KU,  // presumably the super-diagonal count -- confirm
+        cl_double alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_double beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // cl_float2 alpha/beta overload; presumably BLAS GBMV (general banded matrix-vector product) -- TODO confirm
+    gbmv(
+        clblasOrder order,
+        clblasTranspose trans,
+        size_t M,
+        size_t N,
+        size_t KL,  // presumably the sub-diagonal count -- confirm
+        size_t KU,  // presumably the super-diagonal count -- confirm
+        cl_float2 alpha,  // NOTE(review): spelled cl_float2 here but FloatComplex in e.g. the ger overloads -- presumably the same type; confirm
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_float2 beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // cl_double2 alpha/beta overload; presumably BLAS GBMV (general banded matrix-vector product) -- TODO confirm
+    gbmv(
+        clblasOrder order,
+        clblasTranspose trans,
+        size_t M,
+        size_t N,
+        size_t KL,  // presumably the sub-diagonal count -- confirm
+        size_t KU,  // presumably the super-diagonal count -- confirm
+        cl_double2 alpha,  // NOTE(review): spelled cl_double2 here but DoubleComplex in e.g. the ger overloads -- presumably the same type; confirm
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_double2 beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS TBMV (banded triangular matrix-vector product) -- TODO confirm
+    tbmv(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem scratchBuff,  // caller-supplied buffer; presumably temporary workspace -- confirm required size
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+//SBMV
+
+static clblasStatus  // cl_float alpha/beta overload; presumably BLAS SBMV (symmetric banded matrix-vector product) -- TODO confirm
+    sbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        cl_float alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_float beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // cl_double alpha/beta overload; presumably BLAS SBMV (symmetric banded matrix-vector product) -- TODO confirm
+    sbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        cl_double alpha,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_double beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+//HBMV
+
+static clblasStatus  // cl_float2 alpha/beta overload; presumably BLAS HBMV (Hermitian banded matrix-vector product) -- TODO confirm
+    hbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        cl_float2 alpha,  // NOTE(review): spelled cl_float2 here but FloatComplex in e.g. the hemv overloads -- presumably the same type; confirm
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_float2 beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // cl_double2 alpha/beta overload; presumably BLAS HBMV (Hermitian banded matrix-vector product) -- TODO confirm
+    hbmv(
+        clblasOrder order,
+        clblasUplo uplo,
+        size_t N,
+        size_t K,
+        cl_double2 alpha,  // NOTE(review): spelled cl_double2 here but DoubleComplex in e.g. the hemv overloads -- presumably the same type; confirm
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem X,
+        size_t offx,
+        int incx,
+        cl_double2 beta,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+//TBSV
+
+static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS TBSV (banded triangular solve) -- TODO confirm
+    tbsv(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose trans,
+        clblasDiag diag,
+        size_t N,
+        size_t K,
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        // NOTE(review): a "cl_mem scratchBuff," parameter is commented out here although tbmv declares one -- confirm this is intentional
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus  // cl_float2 alpha / cl_float beta overload; presumably BLAS HER2K (Hermitian rank-2k update) -- TODO confirm
+    her2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        cl_float2 alpha,  // two-component scalar, unlike beta below
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem B,
+        size_t offb,
+        size_t ldb,
+        cl_float beta,  // single-component scalar
+        cl_mem C,
+        size_t offc,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ static clblasStatus  // cl_double2 alpha / cl_double beta overload; presumably BLAS HER2K (Hermitian rank-2k update) -- TODO confirm
+    her2k(
+        clblasOrder order,
+        clblasUplo uplo,
+        clblasTranspose transA,
+        size_t N,
+        size_t K,
+        cl_double2 alpha,  // two-component scalar, unlike beta below
+        const cl_mem A,
+        size_t offa,
+        size_t lda,
+        const cl_mem B,
+        size_t offb,
+        size_t ldb,
+        cl_double beta,  // single-component scalar
+        cl_mem C,
+        size_t offc,
+        size_t ldc,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ static clblasStatus  // cl_float alpha overload; presumably BLAS SCAL (vector scaling) -- TODO confirm
+    scal(
+        bool is_css_zds,  // NOTE(review): presumably picks the complex-vector/real-scalar variants (csscal/zdscal) -- confirm in the definition
+        size_t N,
+        cl_float alpha,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ static clblasStatus  // cl_double alpha overload; presumably BLAS SCAL (vector scaling) -- TODO confirm
+    scal(
+        bool is_css_zds,  // NOTE(review): presumably picks the complex-vector/real-scalar variants (csscal/zdscal) -- confirm in the definition
+        size_t N,
+        cl_double alpha,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+ static clblasStatus  // FloatComplex alpha overload; presumably BLAS SCAL (vector scaling) -- TODO confirm
+    scal(
+        bool is_css_zds,  // NOTE(review): presumably picks the complex-vector/real-scalar variants (csscal/zdscal) -- confirm in the definition
+        size_t N,
+        FloatComplex alpha,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ static clblasStatus  // DoubleComplex alpha overload; presumably BLAS SCAL (vector scaling) -- TODO confirm
+    scal(
+        bool is_css_zds,  // NOTE(review): presumably picks the complex-vector/real-scalar variants (csscal/zdscal) -- confirm in the definition
+        size_t N,
+        DoubleComplex alpha,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ //swap calls
+ static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS SWAP (exchange of two vectors) -- TODO confirm
+	swap(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+		size_t N,
+		cl_mem X,
+		size_t offBX,  // NOTE(review): offset for X; named offBX here where most siblings use offx
+		int incx,
+		cl_mem Y,
+		size_t offCY,  // NOTE(review): offset for Y; named offCY here where most siblings use offy
+		int incy,
+		cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+//copy
+    static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS COPY (vector copy) -- TODO confirm
+    copy(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        size_t N,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        // NOTE(review): a "cl_mem scratchBuff," parameter is commented out here -- confirm it is not needed, then drop this line
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+//DOT
+
+static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS DOT (dot product) -- TODO confirm
+    dot(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        size_t N,
+        cl_mem dotProduct,  // result is passed as a cl_mem plus offset rather than returned by value
+        size_t offDP,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem scratchBuff,  // caller-supplied buffer; presumably temporary workspace -- confirm required size
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+//ASUM
+static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS ASUM (sum of absolute values) -- TODO confirm
+    asum(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        size_t N,
+        cl_mem asum,  // NOTE(review): parameter shares the function's name; result is passed as a cl_mem plus offset
+        size_t offAsum,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem scratchBuff,  // caller-supplied buffer; presumably temporary workspace -- confirm required size
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+//DOTC
+static clblasStatus  // single declaration keyed by a DataType tag; presumably BLAS DOTC (conjugated dot product) -- TODO confirm
+    dotc(
+        DataType type,  // presumably selects the element precision -- confirm in the definition
+        size_t N,
+        cl_mem dotProduct,  // result is passed as a cl_mem plus offset rather than returned by value
+        size_t offDP,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem scratchBuff,  // caller-supplied buffer; presumably temporary workspace -- confirm required size
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ //axpy calls
+ static clblasStatus
+	axpy(
+		size_t N,
+        cl_float alpha,
+		cl_mem X,
+		size_t offBX,
+		int incx,
+		cl_mem Y,
+		size_t offCY,
+		int incy,
+		cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ static clblasStatus
+	axpy(
+		size_t N,
+        cl_double alpha,
+		cl_mem X,
+		size_t offBX,
+		int incx,
+		cl_mem Y,
+		size_t offCY,
+		int incy,
+		cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ static clblasStatus
+	axpy(
+		size_t N,
+        FloatComplex alpha,
+		cl_mem X,
+		size_t offBX,
+		int incx,
+		cl_mem Y,
+		size_t offCY,
+		int incy,
+		cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ static clblasStatus
+	axpy(
+		size_t N,
+        DoubleComplex alpha,
+		cl_mem X,
+		size_t offBX,
+		int incx,
+		cl_mem Y,
+		size_t offCY,
+		int incy,
+		cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus
+    rotmg(
+        DataType type,
+        cl_mem D1,
+        size_t offD1,
+        cl_mem D2,
+        size_t offD2,
+        cl_mem X1,
+        size_t offX1,
+        cl_mem Y1,
+        size_t offY1,
+        cl_mem PARAM,
+        size_t offParam,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus
+    rotm(
+        DataType type,
+        size_t N,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_mem PARAM,
+        size_t offParam,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus
+    rotg(
+        DataType type,
+        cl_mem SA,
+        size_t offSA,
+        cl_mem SB,
+        size_t offSB,
+        cl_mem C,
+        size_t offC,
+        cl_mem S,
+        size_t offS,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus
+    rot(
+        size_t N,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_float C,
+        cl_float S,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus
+    rot(
+        size_t N,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        cl_double C,
+        cl_double S,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus
+    rot(
+        size_t N,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        FloatComplex C,
+        FloatComplex S,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+static clblasStatus
+    rot(
+        size_t N,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem Y,
+        size_t offy,
+        int incy,
+        DoubleComplex C,
+        DoubleComplex S,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+//AMAX
+static clblasStatus
+    iamax(
+        DataType type,
+        size_t N,
+        cl_mem iMax,
+        size_t offiMax,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem scratchBuff,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+
+static clblasStatus
+    nrm2(
+        DataType type,
+        size_t N,
+        cl_mem NRM2,
+        size_t offNRM2,
+        cl_mem X,
+        size_t offx,
+        int incx,
+        cl_mem scratchBuff,
+        cl_uint numCommandQueues,
+        cl_command_queue *commandQueues,
+        cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList,
+        cl_event *events);
+
+ }; // clblas class
+}   // namespace clMath
+
+#endif  // CLBLAS_WRAPPER_H_
+
diff --git a/src/tests/include/cmdline.h b/src/tests/include/cmdline.h
new file mode 100644
index 0000000..addb929
--- /dev/null
+++ b/src/tests/include/cmdline.h
@@ -0,0 +1,105 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef CMDLINE_H_
+#define CMDLINE_H_
+
+#include <clBLAS.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct ComplexLong {   /* complex value held as two integers */
+    long re;      /* real part */
+    long imag;    /* imaginary part */
+} ComplexLong;
+
+// Flags showing whether an option was set through the command line; each flag is a distinct bit.
+typedef enum SetoptFlags {
+    NO_FLAGS  = 0,              // empty mask
+    SET_SEED  = (1 << 0),
+    SET_ALPHA = (1 << 1),
+    SET_BETA  = (1 << 2),
+    SET_M     = (1 << 3),
+    SET_N     = (1 << 4),
+    SET_K     = (1 << 5),
+    SET_USE_IMAGES = (1 << 6),
+    SET_DEVICE_TYPE = (1 << 7),
+    SET_INCX  = (1 << 8),
+    SET_INCY  = (1 << 9),
+    SET_NUM_COMMAND_QUEUES = (1 << 10)
+} SetoptFlags;
+
+typedef struct TestParams {    // parameter bundle passed to parseBlasCmdLineArgs() and parseEnv()
+    clblasOrder order;
+    clblasTranspose transA;
+    clblasTranspose transB;
+    clblasTranspose transC;
+    size_t M;
+    size_t N;
+    size_t K;
+    size_t KL;
+    size_t KU;
+    int incx;
+    int incy;
+    size_t offA;
+    size_t offBX;
+    size_t offCY;
+    size_t rowsA;
+    size_t columnsA;
+    size_t rowsB;
+    size_t columnsB;
+    size_t rowsC;
+    size_t columnsC;
+	size_t offa;
+	size_t offb;
+	size_t offc;
+    // alpha value set through the command line, kept as integer re/imag parts
+    ComplexLong alpha;
+    size_t lda;
+    size_t ldb;
+    // beta value set through the command line, kept as integer re/imag parts
+    ComplexLong beta;
+    size_t ldc;
+    clblasSide side;
+    clblasUplo uplo;
+    clblasDiag diag;
+    unsigned int seed;
+    int useImages;
+    cl_device_type devType;
+    const char*    devName;
+    cl_uint numCommandQueues;
+    SetoptFlags optFlags;      // SetoptFlags bits
+} TestParams;
+
+int
+parseBlasCmdLineArgs(
+    int argc,
+    char *argv[],
+    TestParams *params);
+
+void
+printUsage(const char *appName);
+
+void parseEnv(TestParams *params);
+
+#ifdef __cplusplus
+}       /* extern "C" { */
+#endif
+
+#endif  /* CMDLINE_H_ */
diff --git a/src/tests/include/common.h b/src/tests/include/common.h
new file mode 100644
index 0000000..edbdb43
--- /dev/null
+++ b/src/tests/include/common.h
@@ -0,0 +1,697 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef COMMON_H_
+#define COMMON_H_
+
+#if defined (_MSC_VER)
+#define __template_static static
+#else   /* _MSC_VER */
+#define __template_static
+#endif  /* !_MSC_VER */
+
+#define MAX(a, b)  (((a) > (b)) ? (a) : (b))  /* args parenthesized so operator expressions expand correctly; the selected arg is still evaluated twice */
+
+#include <clBLAS.h>
+#include <cmdline.h>
+#include <math.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	typedef enum BlasRoutineID {
+		CLBLAS_GEMV,
+		CLBLAS_SYMV,
+		CLBLAS_GEMM,
+		CLBLAS_GEMM2,
+		CLBLAS_GEMM_TAIL,
+		CLBLAS_TRMM,
+		CLBLAS_TRSM,
+		CLBLAS_SYRK,
+		CLBLAS_SYR2K,
+		CLBLAS_TRMV,
+        CLBLAS_TPMV,
+		CLBLAS_TRSV,
+		CLBLAS_TRSV_GEMV,	// Kludge: needed because the current "gemv" doesn't support complex types
+		CLBLAS_SYMM,
+		CLBLAS_GER,
+		CLBLAS_SYR,
+		CLBLAS_HER,
+		CLBLAS_HER2,
+		CLBLAS_HEMM,
+		CLBLAS_HERK,
+		CLBLAS_SWAP,
+		CLBLAS_COPY,
+		CLBLAS_DOT,
+		CLBLAS_SCAL,
+        CLBLAS_AXPY,
+		CLBLAS_ROTG,
+		CLBLAS_ROTM,
+		CLBLAS_ROT,
+		CLBLAS_ROTMG,
+		CLBLAS_NRM2,
+        CLBLAS_ASUM,
+        CLBLAS_iAMAX,
+
+		/* Must stay the last entry: its value equals the number of IDs above */
+		BLAS_FUNCTIONS_NUMBER
+	} BlasRoutineID;
+
+	typedef enum BlasFunction {
+    FN_SGEMV,
+    FN_DGEMV,
+    FN_CGEMV,
+    FN_ZGEMV,
+
+    FN_SSYMV,
+    FN_DSYMV,
+
+    FN_SSPMV,
+    FN_DSPMV,
+
+    FN_SGEMM,
+    FN_DGEMM,
+    FN_CGEMM,
+    FN_ZGEMM,
+
+    FN_SGEMM_2,
+    FN_DGEMM_2,
+    FN_CGEMM_2,
+    FN_ZGEMM_2,
+
+    FN_STRMM,
+    FN_DTRMM,
+    FN_CTRMM,
+    FN_ZTRMM,
+
+    FN_STRSM,
+    FN_DTRSM,
+    FN_CTRSM,
+    FN_ZTRSM,
+
+    FN_SSYR2K,
+    FN_DSYR2K,
+    FN_CSYR2K,
+    FN_ZSYR2K,
+
+    FN_SSYRK,
+    FN_DSYRK,
+    FN_CSYRK,
+    FN_ZSYRK,
+
+    FN_STRMV,
+    FN_DTRMV,
+    FN_CTRMV,
+    FN_ZTRMV,
+
+    FN_STPMV,
+    FN_DTPMV,
+    FN_CTPMV,
+    FN_ZTPMV,
+
+    FN_STRSV,
+    FN_DTRSV,
+    FN_CTRSV,
+    FN_ZTRSV,
+
+    FN_STPSV,
+    FN_DTPSV,
+    FN_CTPSV,
+    FN_ZTPSV,
+
+    FN_SSYMM,
+    FN_DSYMM,
+    FN_CSYMM,
+    FN_ZSYMM,
+
+	FN_SSYR,
+	FN_DSYR,
+
+    FN_SSPR,
+	FN_DSPR,
+
+    FN_SGER,
+    FN_DGER,
+    FN_CGERU,
+    FN_ZGERU,
+    FN_CGERC,
+    FN_ZGERC,
+
+    FN_CHER,
+    FN_ZHER,
+	FN_CHER2,
+	FN_ZHER2,
+
+    FN_CHPR,
+    FN_ZHPR,
+	FN_CHPR2,
+	FN_ZHPR2,
+
+	FN_SSYR2,
+	FN_DSYR2,
+
+    FN_SSPR2,
+	FN_DSPR2,
+
+	FN_CHEMV,
+	FN_ZHEMV,
+
+    FN_CHPMV,
+	FN_ZHPMV,
+
+	FN_CHEMM,
+	FN_ZHEMM,
+
+	FN_CHERK,
+	FN_ZHERK,
+
+	FN_SGBMV,
+	FN_DGBMV,
+	FN_CGBMV,
+	FN_ZGBMV,
+
+	FN_STBMV,
+	FN_DTBMV,
+	FN_CTBMV,
+	FN_ZTBMV,
+
+	FN_SSBMV,
+	FN_DSBMV,
+
+	FN_CHBMV,
+	FN_ZHBMV,
+
+	FN_STBSV,
+	FN_DTBSV,
+	FN_CTBSV,
+	FN_ZTBSV,
+
+	FN_CHER2K,
+	FN_ZHER2K,
+
+    FN_SCOPY,
+    FN_DCOPY,
+    FN_CCOPY,
+    FN_ZCOPY,
+
+    FN_SSWAP,
+    FN_DSWAP,
+    FN_CSWAP,
+    FN_ZSWAP,
+
+    FN_SDOT,
+    FN_DDOT,
+    FN_CDOTU,
+    FN_ZDOTU,
+    FN_CDOTC,
+    FN_ZDOTC,
+
+    FN_SSCAL,
+    FN_DSCAL,
+    FN_CSCAL,
+    FN_ZSCAL,
+    FN_CSSCAL,
+    FN_ZDSCAL,
+
+    FN_SAXPY,
+    FN_DAXPY,
+    FN_CAXPY,
+    FN_ZAXPY,
+
+    FN_SROTG,
+    FN_DROTG,
+    FN_CROTG,
+    FN_ZROTG,
+
+    FN_SROTM,
+    FN_DROTM,
+
+	FN_SROT,
+    FN_DROT,
+	FN_CSROT,
+    FN_ZDROT,
+
+    FN_SROTMG,
+    FN_DROTMG,
+
+    FN_SNRM2,
+    FN_DNRM2,
+    FN_SCNRM2,
+    FN_DZNRM2,
+
+    FN_SASUM,
+    FN_DASUM,
+    FN_SCASUM,
+    FN_DZASUM,
+
+    FN_iSAMAX,
+    FN_iDAMAX,
+    FN_iCAMAX,
+    FN_iZAMAX,
+
+    BLAS_FUNCTION_END
+} BlasFunctionID;
+
+cl_context
+getQueueContext(cl_command_queue commandQueue, cl_int *error);
+
+cl_int
+waitForSuccessfulFinish(
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues,
+    cl_event *events);
+
+cl_int
+flushAll(
+    cl_uint numCommandQueues,
+    cl_command_queue *commandQueues);
+
+const char* orderStr(clblasOrder order);
+const char* sideStr(clblasSide side);
+const char* uploStr(clblasUplo uplo);
+const char* transStr(clblasTranspose trans);
+const char* diagStr(clblasDiag diag);
+
+char encodeTranspose(clblasTranspose value);
+char encodeUplo(clblasUplo value);
+char encodeDiag(clblasDiag value);
+char encodeSide(clblasSide value);
+
+int functionBlasLevel(BlasFunctionID funct);
+
+size_t trsmBlockSize(void);
+
+#ifdef __cplusplus
+}       // extern "C"
+#endif
+
+#ifdef __cplusplus
+
+template <typename T>
+static T
+convertMultiplier(ComplexLong arg)
+{
+    return static_cast<T>(arg.re);  // generic case: only the real part is used
+}
+
+template<>
+__template_static FloatComplex
+convertMultiplier(ComplexLong arg)
+{
+    return floatComplex(    // FloatComplex case: both parts are cast to float
+        static_cast<float>(arg.re), static_cast<float>(arg.imag));
+}
+
+template<>
+__template_static DoubleComplex
+convertMultiplier(ComplexLong arg)
+{
+    return doubleComplex(arg.re, arg.imag);  // DoubleComplex case: both parts passed to doubleComplex() without an explicit cast
+}
+
+template <typename T>
+static cl_double returnMax(T arg)
+{
+    return static_cast<cl_double>(fabs(arg));  // generic case: fabs of the argument, converted to cl_double
+}
+
+ template<>
+__template_static cl_double returnMax<FloatComplex> (FloatComplex arg)
+{
+    return static_cast<cl_double>( MAX( fabs(CREAL(arg)), fabs(CIMAG(arg)) ) );  // larger of |CREAL(arg)| and |CIMAG(arg)|
+}
+
+ template<>
+__template_static cl_double returnMax<DoubleComplex> (DoubleComplex arg)
+{
+    return static_cast<cl_double>( MAX( fabs(CREAL(arg)), fabs(CIMAG(arg)) ) );  // larger of |CREAL(arg)| and |CIMAG(arg)|
+}
+
+// xGEMM
+void
+printTestParams(
+    clblasOrder order,
+    clblasTranspose transA,
+    clblasTranspose transB,
+    size_t M,
+    size_t N,
+    size_t K,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    size_t offB,
+    size_t ldb,
+    bool useBeta,
+    ComplexLong beta,
+    size_t offC,
+    size_t ldc);
+
+// xTRMM, xTRSM
+void
+printTestParams(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    size_t offB,
+    size_t ldb);
+
+//xTRMV, xTRSV
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    size_t lda,
+    int incx,
+    size_t offa,
+    size_t offx);
+
+//xTPMV
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    int incx,
+    size_t offa,
+    size_t offx);
+
+//xSYR xHER
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    double alpha,
+    size_t offx,
+    int incx,
+    size_t offa,
+    size_t lda);
+
+
+//xHER2
+void
+printTestParams(
+        clblasOrder order,
+        clblasUplo  uplo,
+        size_t N,
+        bool useAlpha,
+        cl_float2 alpha,
+        size_t offx,
+        int incx,
+        size_t offy,
+        int incy,
+        size_t offa,
+        size_t lda);
+
+//xCOPY , xSWAP
+void
+printTestParams(
+        size_t N,
+        size_t offx,
+        int incx,
+        size_t offy,
+        int incy);
+
+//xSyr2
+void
+printTestParams(
+	clblasOrder order,
+	clblasUplo  uplo,
+	size_t N,
+	double alpha,
+	size_t offx,
+	int incx,
+	size_t offy,
+	int incy,
+	size_t offa,
+	size_t lda);
+
+//HEMV
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo  uplo,
+    size_t N,
+    ComplexLong alpha,
+    size_t offa,
+    size_t lda,
+    size_t offx,
+    int incx,
+    ComplexLong beta,
+    size_t offy,
+    int incy);
+
+//xSymm,
+void
+printTestParams(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    bool useBeta,
+    ComplexLong beta,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    size_t offa,
+    size_t offb,
+    size_t offc );
+
+//xHEMM
+void
+printTestParams(
+    clblasOrder order,
+    clblasSide side,
+    clblasUplo uplo,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    cl_float2 alpha,
+    bool useBeta,
+    cl_float2 beta,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    size_t offa,
+    size_t offb,
+    size_t offc );
+
+
+//xGER , xGERC
+
+void
+printTestParams(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t lda,
+    int incx,
+    int incy,
+    size_t offa,
+    size_t offx,
+    size_t offy );
+
+// xGEMV
+void
+printTestParams(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    int incx,
+    bool useBeta,
+    ComplexLong beta,
+    int incy);
+
+// xGBMV
+void
+printTestParams(
+    clblasOrder order,
+    clblasTranspose transA,
+    size_t M,
+    size_t N,
+    size_t KL,
+    size_t KU,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    size_t offx,
+    int incx,
+    ComplexLong beta,
+    size_t offy,
+    int incy);
+
+//xHBMV/xSBMV
+
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    size_t K,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    size_t offx,
+    int incx,
+    ComplexLong beta,
+    size_t offy,
+    int incy);
+
+
+// xTBMV
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    clblasDiag diag,
+    size_t N,
+    size_t KLU,
+    size_t offA,
+    size_t lda,
+    size_t offx,
+    int incx,
+    size_t offy,
+    int incy);
+
+// xSYMV
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    int incx,
+    bool useBeta,
+    ComplexLong beta,
+    int incy);
+
+// xSYR2K
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    size_t offB,
+    size_t ldb,
+    bool useBeta,
+    ComplexLong beta,
+    size_t offC,
+    size_t ldc);
+
+// xSYRK
+void
+printTestParams(
+    clblasOrder order,
+    clblasUplo uplo,
+    clblasTranspose transA,
+    size_t N,
+    size_t K,
+    bool useAlpha,
+    ComplexLong alpha,
+    size_t offA,
+    size_t lda,
+    bool useBeta,
+    ComplexLong beta,
+    size_t offC,
+    size_t ldc);
+
+// xSCAL
+void
+printTestParams(
+    size_t N,
+    ComplexLong alpha,
+    size_t offx,
+    int incx);
+
+// xAXPY
+void
+printTestParams(
+    size_t N,
+    ComplexLong alpha,
+    size_t offx,
+    int incx,
+    size_t offy,
+    int incy);
+
+// For ROT
+void
+printTestParams(
+    size_t N,
+    size_t offx,
+    int incx,
+	size_t offy,
+	int incy,
+	ComplexLong alpha,
+	ComplexLong beta);
+
+// xROTG, check if other ROTs can use this too
+void
+printTestParams(size_t offSA, size_t offSB, size_t offC, size_t offS);
+
+// xROTM
+void
+printTestParams(size_t N, size_t offx, int incx, size_t offy, int incy, size_t offParam, ComplexLong sflagParam);
+
+//xROTMG  NOTE(review): offsets are int here but size_t in the xROTG/xROTM overloads above - confirm this is intended
+void
+printTestParams(int offX, int offY, int offD1, int offD2, int offParam, ComplexLong sflagParam);
+
+// xNRM2, AMAX and ASUM
+void
+printTestParams(
+    size_t N,
+    size_t offx,
+    int incx);
+
+#endif  // __cplusplus
+
+#endif  /* COMMON_H_ */
diff --git a/src/tests/include/copy.h b/src/tests/include/copy.h
new file mode 100644
index 0000000..5c26a22
--- /dev/null
+++ b/src/tests/include/copy.h
@@ -0,0 +1,83 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+class COPY : public TestWithParam<
+    ::std::tr1::tuple<
+        int,    // [0] N
+        int,    // [1] incx, should be greater than 0
+        int,    // [2] incy
+        int,    // [3] offx
+        int,    // [4] offy
+        int     // [5] numCommandQueues
+    > > {
+public:
+    // Stores the fixture's current values in *params; no other field is written.
+    void getParams(TestParams *params) {
+        TestParams &out = *params;
+        out.N = N;
+        out.incx = incx;
+        out.incy = incy;
+        out.offBX = offx;
+        out.offCY = offy;
+        out.numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Loads the tuple values, then lets BlasBase override the queue count and N.
+    virtual void SetUp() {
+        using ::std::tr1::get;
+        N = get<0>(GetParam());
+        incx = get<1>(GetParam());
+        incy = get<2>(GetParam());
+        offx = get<3>(GetParam());
+        offy = get<4>(GetParam());
+        numCommandQueues = get<5>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // Log the effective parameters of this test case.
+        printTestParams(N, offx, incx, offy, incy);
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N;
+    int incx;
+    int incy;
+    size_t offx, offy;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX;    // not referenced inside this class
+
+    bool useNumCommandQueues;   // true when BlasBase supplies the queue count
+    cl_uint numCommandQueues;
+};
+
+
diff --git a/src/tests/include/dot.h b/src/tests/include/dot.h
new file mode 100644
index 0000000..7b15528
--- /dev/null
+++ b/src/tests/include/dot.h
@@ -0,0 +1,88 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+class DOT : public TestWithParam<
+    ::std::tr1::tuple<
+        int,    // [0] N
+        int,    // [1] incx, should be greater than 0
+        int,    // [2] incy
+        int,    // [3] offx
+        int,    // [4] offy
+        int,    // [5] offa -- for offDP
+        int     // [6] numCommandQueues
+    > > {
+public:
+    // Stores the fixture's current values in *params; no other field is written.
+    void getParams(TestParams *params) {
+        TestParams &out = *params;
+        out.N = N;
+        out.incx = incx;
+        out.incy = incy;
+        out.offBX = offx;
+        out.offCY = offy;
+        out.offa = offDP;
+        out.numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Loads the tuple values, then lets BlasBase override the queue count and N.
+    virtual void SetUp() {
+        using ::std::tr1::get;
+
+        N = get<0>(GetParam());
+        incx = get<1>(GetParam());
+        incy = get<2>(GetParam());
+        offx = get<3>(GetParam());
+        offy = get<4>(GetParam());
+        offDP = get<5>(GetParam());
+        numCommandQueues = get<6>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // Log the effective parameters of this test case.
+        printTestParams(N, offx, incx, offy, incy);
+        ::std::cerr << "offDP = " << offDP << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N;
+    int incx;
+    int incy;
+    size_t offx, offy, offDP;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX;    // not referenced inside this class
+
+    bool useNumCommandQueues;   // true when BlasBase supplies the queue count
+    cl_uint numCommandQueues;
+};
+
+
diff --git a/src/tests/include/dotc.h b/src/tests/include/dotc.h
new file mode 100644
index 0000000..815ad1c
--- /dev/null
+++ b/src/tests/include/dotc.h
@@ -0,0 +1,88 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+class DOTC : public TestWithParam<
+    ::std::tr1::tuple<
+        int,    // [0] N
+        int,    // [1] incx, should be greater than 0
+        int,    // [2] incy
+        int,    // [3] offx
+        int,    // [4] offy
+        int,    // [5] offa -- for offDP
+        int     // [6] numCommandQueues
+    > > {
+public:
+    // Stores the fixture's current values in *params; no other field is written.
+    void getParams(TestParams *params) {
+        TestParams &out = *params;
+        out.N = N;
+        out.incx = incx;
+        out.incy = incy;
+        out.offBX = offx;
+        out.offCY = offy;
+        out.offa = offDP;
+        out.numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Loads the tuple values, then lets BlasBase override the queue count and N.
+    virtual void SetUp() {
+        using ::std::tr1::get;
+
+        N = get<0>(GetParam());
+        incx = get<1>(GetParam());
+        incy = get<2>(GetParam());
+        offx = get<3>(GetParam());
+        offy = get<4>(GetParam());
+        offDP = get<5>(GetParam());
+        numCommandQueues = get<6>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // Log the effective parameters of this test case.
+        printTestParams(N, offx, incx, offy, incy);
+        ::std::cerr << "offDP = " << offDP << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N;
+    int incx;
+    int incy;
+    size_t offx, offy, offDP;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX;    // not referenced inside this class
+
+    bool useNumCommandQueues;   // true when BlasBase supplies the queue count
+    cl_uint numCommandQueues;
+};
+
+
diff --git a/src/tests/include/gbmv.h b/src/tests/include/gbmv.h
new file mode 100644
index 0000000..f26dbd7
--- /dev/null
+++ b/src/tests/include/gbmv.h
@@ -0,0 +1,183 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef GBMV_H_
+#define GBMV_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <blas-random.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+class GBMV : public TestWithParam<      // value-parameterized fixture for the GBMV tests
+    ::std::tr1::tuple<
+        clblasOrder,        // order
+        clblasTranspose,    // transA
+        int,                // M
+        int,                // N
+        int,                // KL
+        int,                // KU
+        ExtraTestSizes,     // offA/offBX/offCY, strideA.ld, strideBX.inc, strideCY.inc
+        ComplexLong,        // Alpha
+        ComplexLong,        // Beta
+        int                 // numCommandQueues
+        > > {
+public:
+    // Exports the fixture's current settings into *params.
+    void getParams(TestParams *params)
+    {
+        // Clear first so every field not assigned below reads as zero.
+        memset(params, 0, sizeof(TestParams));
+        params->order = order;
+        params->transA = transA;
+        params->seed = seed;
+        params->M = M;
+        params->N = N;
+        params->KL = KL;
+        params->KU = KU;
+        params->lda = lda;
+        params->incx = incx;
+        params->incy = incy;
+        params->offA = offA;
+        params->offa = offA;
+        params->offBX = offx;
+        params->offCY = offy;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Unpacks the test tuple, applies BlasBase overrides and normalizes KL, KU and lda.
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        transA = ::std::tr1::get<1>(GetParam());
+        M = ::std::tr1::get<2>(GetParam());
+        N = ::std::tr1::get<3>(GetParam());
+        KL = ::std::tr1::get<4>(GetParam());
+        KU = ::std::tr1::get<5>(GetParam());
+        const ExtraTestSizes &extra = ::std::tr1::get<6>(GetParam());
+        offA = extra.offA;
+        offx = extra.offBX;
+        offy = extra.offCY;
+        lda = extra.strideA.ld;
+        incx = extra.strideBX.inc;
+        incy = extra.strideCY.inc;
+        paramAlpha = ::std::tr1::get<7>(GetParam());
+        paramBeta  = ::std::tr1::get<8>(GetParam());
+        numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // '%' by a zero dimension is undefined, so an empty dimension gets a zero band width.
+        KL = (M != 0) ? (KL % M) : 0;
+        KU = (N != 0) ? (KU % N) : 0;
+        lda = ::std::max(lda, (KL+KU+1));   // lda is raised to at least KL+KU+1
+
+        printTestParams(order, transA, M, N, KL, KU, paramAlpha, offA,
+                            lda, offx, incx, paramBeta, offy, incy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasTranspose transA;
+    size_t M, N, KL, KU;
+    size_t lda;
+    int incx, incy;
+    size_t offA, offx, offy;
+    unsigned int seed;
+
+    ComplexLong paramAlpha, paramBeta;
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;   // true when BlasBase supplies the queue count
+    cl_uint numCommandQueues;
+};
+
+// Prepares random GBMV test operands: replaces *alpha / *beta when they exceed sqrt(bound)/2,
+// then fills A, X and Y (each only when non-NULL) with random<T>() values limited by 'bound'.
+template <typename T>
+static void
+randomGbmvMatrices(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    T *alpha,
+    T *beta,
+    T *A,
+    size_t lda,
+    T *X, int incx,
+    T *Y, int incy)
+{
+    size_t i, lenX, lenY, lenA;
+    cl_double bound, maxAB, maxMN;
+
+    // bound is calculated by solving the equation (alpha*x^2 + x - UPPER_BOUND) < 0
+    bound = UPPER_BOUND<T>();
+    if (module(maxVal(*alpha)) > (sqrt(bound) / (2.0)))
+        *alpha = random<T>((sqrt(bound) / (2.0)));
+    if (module(maxVal(*beta)) > (sqrt(bound) / (2.0)))
+        *beta = random<T>((sqrt(bound) / (2.0)));
+
+    maxAB = module( ::std::max(maxVal(*alpha), maxVal(*beta)) );   // Take max of alpha & beta
+    maxMN = (cl_double)::std::max( M, N );
+    // A zero factor (e.g. alpha and beta both zero, or M == N == 0) counts as 1 to avoid dividing by zero.
+    if (maxAB == 0.0) maxAB = 1.0;
+    if (maxMN == 0.0) maxMN = 1.0;
+    bound = sqrt( bound / (maxAB*maxMN) );           // (maxAB * N * bound^2 + maxAB * bound - UPPER_BOUND) < 0
+
+    lenA = ((order == clblasRowMajor)? M: N) * lda;
+    if (A != NULL) {    // same NULL tolerance as X and Y below
+        for (i = 0; i < lenA; i++) {
+            A[i] = random<T>(bound);
+        }
+    }
+
+    // X is sized from N and Y from M without transposition, and the reverse otherwise.
+    // An empty dimension gives length 0; "(dim - 1)" would wrap around in size_t.
+    const size_t dimX = (trans == clblasNoTrans) ? N : M;
+    const size_t dimY = (trans == clblasNoTrans) ? M : N;
+    lenX = (dimX == 0) ? 0 : 1 + ((dimX - 1) * abs(incx));
+    lenY = (dimY == 0) ? 0 : 1 + ((dimY - 1) * abs(incy));
+
+    if (X != NULL) {
+        for (i = 0; i < lenX; i++) {
+            X[i] = random<T>(bound);
+        }
+    }
+    if (Y != NULL) {
+        for (i = 0; i < lenY; i++) {
+            Y[i] = random<T>(bound);
+        }
+    }
+}
+
+#endif  // GBMV_H_
diff --git a/src/tests/include/gemm-2.h b/src/tests/include/gemm-2.h
new file mode 100644
index 0000000..a4103e7
--- /dev/null
+++ b/src/tests/include/gemm-2.h
@@ -0,0 +1,181 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef GEMM_2_H_
+#define GEMM_2_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+// Parameterized gtest fixture for the GEMM2 tests: SetUp() decodes the test
+// tuple into members and getParams() copies those members into a TestParams.
+class GEMM2 : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,         // order
+        clblasTranspose,     // transA
+        clblasTranspose,     // transB
+        int,                    // M
+        int,                    // N
+        int,                    // K
+        ExtraTestSizes,         // offA/offBX/offCY and strideA/strideBX/strideCY
+        int                     // numCommandQueues
+        > > {
+public:
+    // Copies the values prepared by SetUp() into *params. The structure is zeroed
+    // first, as GEMM::getParams() does, so unassigned fields are not indeterminate.
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->transA = transA;
+        params->transB = transB;
+        params->seed = seed;
+        params->M = M;
+        params->N = N;
+        params->K = K;
+        params->offA = offA;
+        params->offBX = offB;
+        params->offCY = offC;
+        params->lda = lda;
+        params->ldb = ldb;
+        params->ldc = ldc;
+        params->rowsA = rowsA;
+        params->columnsA = columnsA;
+        params->rowsB = rowsB;
+        params->columnsB = columnsB;
+        params->rowsC = rowsC;
+        params->columnsC = columnsC;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Decodes the tuple, applies BlasBase overrides, then pads each matrix to its ld.
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        transA = ::std::tr1::get<1>(GetParam());
+        transB = ::std::tr1::get<2>(GetParam());
+        M = ::std::tr1::get<3>(GetParam());
+        N = ::std::tr1::get<4>(GetParam());
+        K = ::std::tr1::get<5>(GetParam());
+        ExtraTestSizes extra = ::std::tr1::get<6>(GetParam());
+        offA = extra.offA;
+        offB = extra.offBX;
+        offC = extra.offCY;
+        lda = extra.strideA.ld;
+        ldb = extra.strideBX.ld;
+        ldc = extra.strideCY.ld;
+        numCommandQueues = ::std::tr1::get<7>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // Give the multipliers a defined value; they are replaced below only
+        // when BlasBase reports an explicit alpha / beta.
+        paramAlpha.re = paramAlpha.imag = 0;
+        paramBeta.re = paramBeta.imag = 0;
+        useAlpha = base->useAlpha();
+        if (useAlpha) { paramAlpha = base->alpha(); }
+        useBeta = base->useBeta();
+        if (useBeta) { paramBeta = base->beta(); }
+        if (base->useM()) { M = base->M(); }
+        if (base->useN()) { N = base->N(); }
+        if (base->useK()) { K = base->K(); }
+
+        if (transA == clblasNoTrans) {
+            rowsA = M;
+            columnsA = K;
+        }
+        else {
+            rowsA = K;
+            columnsA = M;
+        }
+        if (transB == clblasNoTrans) {
+            rowsB = K;
+            columnsB = N;
+        }
+        else {
+            rowsB = N;
+            columnsB = K;
+        }
+        rowsC = M;
+        columnsC = N;
+
+        // Raise each ld to at least the extent it strides over, then record the padded extent.
+        switch (order) {
+        case clblasRowMajor:
+            lda = ::std::max(lda, columnsA);
+            columnsA = lda;
+            ldb = ::std::max(ldb, columnsB);
+            columnsB = ldb;
+            ldc = ::std::max(ldc, columnsC);
+            columnsC = ldc;
+            break;
+        case clblasColumnMajor:
+            lda = ::std::max(lda, rowsA);
+            rowsA = lda;
+            ldb = ::std::max(ldb, rowsB);
+            rowsB = ldb;
+            ldc = ::std::max(ldc, rowsC);
+            rowsC = ldc;
+            break;
+        }
+
+        printTestParams(order, transA, transB, M, N, K, useAlpha,
+                        base->alpha(), offA, lda, offB, ldb, useBeta,
+                        base->beta(), offC, ldc);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasTranspose transA;
+    clblasTranspose transB;
+    size_t M, N, K;
+    size_t offA, offB, offC;
+    size_t lda, ldb, ldc;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;
+    ComplexLong paramAlpha, paramBeta;
+
+    size_t rowsA, columnsA;
+    size_t rowsB, columnsB;
+    size_t rowsC, columnsC;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageB;    // not assigned anywhere in this class
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // GEMM_2_H_
diff --git a/src/tests/include/gemm.h b/src/tests/include/gemm.h
new file mode 100644
index 0000000..c580d45
--- /dev/null
+++ b/src/tests/include/gemm.h
@@ -0,0 +1,183 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef GEMM_H_
+#define GEMM_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+// Parameterized gtest fixture for the GEMM tests: SetUp() decodes the test
+// tuple into members and getParams() copies those members into a TestParams.
+class GEMM : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,         // order
+        clblasTranspose,     // transA
+        clblasTranspose,     // transB
+        int,                    // M
+        int,                    // N
+        int,                    // K
+        ExtraTestSizes,         // offA/offBX/offCY and strideA/strideBX/strideCY
+        int                     // numCommandQueues
+        > > {
+public:
+    // Zeroes *params, then copies the values prepared by SetUp() into it.
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->transA = transA;
+        params->transB = transB;
+        params->seed = seed;
+        params->M = M;
+        params->N = N;
+        params->K = K;
+        params->offA = offA;
+        params->offBX = offB;
+        params->offCY = offC;
+        params->lda = lda;
+        params->ldb = ldb;
+        params->ldc = ldc;
+        params->rowsA = rowsA;
+        params->columnsA = columnsA;
+        params->rowsB = rowsB;
+        params->columnsB = columnsB;
+        params->rowsC = rowsC;
+        params->columnsC = columnsC;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Decodes the test tuple, applies the BlasBase overrides, then pads each
+    // matrix out to its leading dimension and prints the resulting parameters.
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        transA = ::std::tr1::get<1>(GetParam());
+        transB = ::std::tr1::get<2>(GetParam());
+        M = ::std::tr1::get<3>(GetParam());
+        N = ::std::tr1::get<4>(GetParam());
+        K = ::std::tr1::get<5>(GetParam());
+        ExtraTestSizes extra = ::std::tr1::get<6>(GetParam());
+        offA = extra.offA;
+        offB = extra.offBX;
+        offC = extra.offCY;
+        lda = extra.strideA.ld;
+        ldb = extra.strideBX.ld;
+        ldc = extra.strideCY.ld;
+        numCommandQueues = ::std::tr1::get<7>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // Give the multipliers a defined value so getParams() never copies an
+        // indeterminate one; replaced below when BlasBase reports alpha / beta.
+        paramAlpha.re = paramAlpha.imag = 0;
+        paramBeta.re = paramBeta.imag = 0;
+        useAlpha = base->useAlpha();
+        if (useAlpha) { paramAlpha = base->alpha(); }
+        useBeta = base->useBeta();
+        if (useBeta) { paramBeta = base->beta(); }
+        // Explicit sizes from BlasBase take precedence over the tuple values.
+        if (base->useM()) { M = base->M(); }
+        if (base->useN()) { N = base->N(); }
+        if (base->useK()) { K = base->K(); }
+
+        if (transA == clblasNoTrans) {
+            rowsA = M;
+            columnsA = K;
+        }
+        else {
+            rowsA = K;
+            columnsA = M;
+        }
+        if (transB == clblasNoTrans) {
+            rowsB = K;
+            columnsB = N;
+        }
+        else {
+            rowsB = N;
+            columnsB = K;
+        }
+        rowsC = M;
+        columnsC = N;
+
+        // Raise each leading dimension to at least the extent it strides over
+        // (columns for row-major, rows for column-major), then record the padded extent.
+        switch (order) {
+        case clblasRowMajor:
+            lda = ::std::max(lda, columnsA);
+            columnsA = lda;
+            ldb = ::std::max(ldb, columnsB);
+            columnsB = ldb;
+            ldc = ::std::max(ldc, columnsC);
+            columnsC = ldc;
+            break;
+        case clblasColumnMajor:
+            lda = ::std::max(lda, rowsA);
+            rowsA = lda;
+            ldb = ::std::max(ldb, rowsB);
+            rowsB = ldb;
+            ldc = ::std::max(ldc, rowsC);
+            rowsC = ldc;
+            break;
+        }
+
+        printTestParams(order, transA, transB, M, N, K, useAlpha,
+                        base->alpha(), offA, lda, offB, ldb, useBeta,
+                        base->beta(), offC, ldc);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasTranspose transA;
+    clblasTranspose transB;
+    size_t M, N, K;
+    size_t offA, offB, offC;
+    size_t lda, ldb, ldc;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;
+    ComplexLong paramAlpha, paramBeta;
+
+    size_t rowsA, columnsA;
+    size_t rowsB, columnsB;
+    size_t rowsC, columnsC;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageB;    // not assigned anywhere in this class
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // GEMM_H_
diff --git a/src/tests/include/gemv.h b/src/tests/include/gemv.h
new file mode 100644
index 0000000..77f1c8e
--- /dev/null
+++ b/src/tests/include/gemv.h
@@ -0,0 +1,257 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef GEMV_H_
+#define GEMV_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+// Parameterized gtest fixture for the GEMV tests: SetUp() decodes the test
+// tuple into members and getParams() copies those members into a TestParams.
+class GEMV : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasTranspose, // transA
+        int,                // M
+        int,                // N
+        ExtraTestSizes,     // offA, strideA.ld, strideBX.inc, strideCY.inc
+        int                 // numCommandQueues
+        > > {
+public:
+    // Zeroes *params, then copies the values prepared by SetUp() into it.
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->transA = transA;
+        params->transB = transB;
+        params->transC = transC;
+        params->seed = seed;
+        params->M = M;
+        params->N = N;
+        params->K = L;
+        params->lda = lda;
+        params->ldb = ldb;
+        params->ldc = ldc;
+        params->rowsA = rowsA;
+        params->rowsB = rowsB;
+        params->rowsC = rowsC;
+        params->columnsA = columnsA;
+        params->columnsB = columnsB;
+        params->columnsC = columnsC;
+        params->incx = incx;
+        params->incy = incy;
+        params->offA = offA;
+        params->offBX = offx;
+        params->offCY = offy;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Decodes the tuple, applies BlasBase overrides and lays x / y out inside B / C.
+    virtual void SetUp()
+    {
+        ExtraTestSizes extra;
+        size_t lenX, lenY;
+        bool seqX, seqY;
+
+        order = ::std::tr1::get<0>(GetParam());
+        transA = ::std::tr1::get<1>(GetParam());
+        M = ::std::tr1::get<2>(GetParam());
+        N = ::std::tr1::get<3>(GetParam());
+        extra = ::std::tr1::get<4>(GetParam());
+        offA = extra.offA;
+        lda = extra.strideA.ld;
+        incx = extra.strideBX.inc;
+        incy = extra.strideCY.inc;
+        numCommandQueues = ::std::tr1::get<5>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // Give the multipliers a defined value so getParams() never copies an
+        // indeterminate one; replaced below when BlasBase reports alpha / beta.
+        paramAlpha.re = paramAlpha.imag = 0;
+        paramBeta.re = paramBeta.imag = 0;
+        useAlpha = base->useAlpha();
+        if (useAlpha) { paramAlpha = base->alpha(); }
+        useBeta = base->useBeta();
+        if (useBeta) { paramBeta = base->beta(); }
+        if (base->useM()) { M = base->M(); }
+        if (base->useN()) { N = base->N(); }
+        if (base->useIncX()) { incx = base->incX(); }
+        if (base->useIncY()) { incy = base->incY(); }
+
+        // ldb/ldc and offx/offy are derived below rather than read from the tuple.
+        ldb = ldc = 0;
+        offx = offy = 0;
+
+        L = (M + N) / 2; //It doesn't matter, can be any value
+
+        // seqX / seqY flag the vectors whose module(inc) equals 1.
+        seqX = module(incx) == 1;
+        seqY = module(incy) == 1;
+
+        if (transA == clblasNoTrans) {
+            lenX = N;
+            lenY = M;
+        }
+        else {
+            lenX = M;
+            lenY = N;
+        }
+
+        rowsA = M;
+        columnsA = N;
+
+        switch (order) {
+        case clblasRowMajor:
+
+            lda = ::std::max(lda, columnsA);
+            columnsA = lda;
+
+            if (seqX) {
+                //x is a middle row in row major matrix
+                rowsB = L;
+                columnsB = lenX;
+                ldb = ::std::max(ldb, columnsB);
+                transB = clblasTrans;
+                offx = (rowsB / 2) * ldb;
+            }
+            else {
+                //x is a middle column in row major matrix
+                rowsB = lenX;
+                columnsB = L;
+                ldb = ::std::max((size_t)module(incx), columnsB);
+                transB = clblasNoTrans;
+                offx = columnsB / 2;
+            }
+            columnsB = ldb;
+
+            if (seqY) {
+                //y is a middle row in row major matrix
+                rowsC = L;
+                columnsC = lenY;
+                ldc = ::std::max(ldc, columnsC);
+                transC = clblasTrans;
+                offy = (rowsC / 2) * ldc;
+            }
+            else {
+                //y is a middle column in row major matrix
+                rowsC = lenY;
+                columnsC = L;
+                ldc = ::std::max((size_t)module(incy), columnsC);
+                transC = clblasNoTrans;
+                offy = columnsC / 2;
+            }
+            columnsC = ldc;
+            break;
+        case clblasColumnMajor:
+
+            lda = ::std::max(lda, rowsA);
+            rowsA = lda;
+
+            if (seqX) {
+                //x is a middle column in column major matrix
+                rowsB = lenX;
+                columnsB = L;
+                ldb = ::std::max(ldb, rowsB);
+                transB = clblasNoTrans;
+                offx = (columnsB / 2) * ldb;
+            }
+            else {
+                //x is a middle row in column major matrix
+                rowsB = L;
+                columnsB = lenX;
+                ldb = ::std::max((size_t)module(incx), rowsB);
+                transB = clblasTrans;
+                offx = rowsB / 2;
+            }
+            rowsB = ldb;
+
+            if (seqY) {
+                //y is a middle column in column major matrix
+                rowsC = lenY;
+                columnsC = L;
+                ldc = ::std::max(ldc, rowsC);
+                transC = clblasNoTrans;
+                offy = (columnsC / 2) * ldc;
+            }
+            else {
+                //y is a middle row in column major matrix
+                rowsC = L;
+                columnsC = lenY;
+                ldc = ::std::max((size_t)module(incy), rowsC);
+                transC = clblasTrans;
+                offy = rowsC / 2;
+            }
+            rowsC = ldc;
+            break;
+        }
+
+        // A non-sequential vector was laid out across ldb / ldc above, so the
+        // increment handed on is that leading dimension with the original sign.
+        if (!seqX) {
+            incx = incx > 0 ? (int)ldb : (int)(0-ldb);
+        }
+        if (!seqY) {
+            incy = incy > 0 ? (int)ldc : (int)(0-ldc);
+        }
+
+        printTestParams(order, transA, M, N, useAlpha, base->alpha(), offA,
+                        lda, incx, useBeta, base->beta(), incy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasTranspose transA, transB, transC;
+    size_t M, N, L;
+    size_t lda, ldb, ldc;
+    int incx, incy;
+    size_t offA, offx, offy;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;
+    ComplexLong paramAlpha, paramBeta;
+
+    size_t rowsA, rowsB, rowsC, columnsA, columnsB, columnsC;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX;    // not assigned anywhere in this class
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // GEMV_H_
diff --git a/src/tests/include/ger.h b/src/tests/include/ger.h
new file mode 100644
index 0000000..3c746dd
--- /dev/null
+++ b/src/tests/include/ger.h
@@ -0,0 +1,129 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef GER_H_
+#define GER_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+
+using ::testing::TestWithParam;
+
+// Parameterized gtest fixture for the GER tests: SetUp() decodes the test
+// tuple into members and getParams() copies those members into a TestParams.
+class GER : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,    // order
+        int,                // M
+        int,                // N
+        int,                // lda
+        int,                // incx
+        int,                // incy
+        int,                // offa  (slots 6-8 are labelled in the order SetUp() reads them;
+        int,                // offx   NOTE(review): confirm the test instantiations agree)
+        int,                // offy  //FIX_ME.. gtest not allowing to add more parameters
+        int                 // numCommandQueues
+        > > {
+public:
+    // Zeroes *params (as GEMM/GEMV/HBMV do), then copies the SetUp() values into it.
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+        params->order = order;
+        params->seed = seed;
+        params->M = M;
+        params->N = N;
+        params->lda = lda;
+        params->incx = incx;
+        params->incy = incy;
+        params->offa = offa;
+        params->offBX = offx;
+        params->offCY = offy;
+        params->rowsA = rowsA;
+        params->alpha = paramAlpha;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Decodes the tuple, applies BlasBase overrides and raises lda to the matrix extent.
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        M = ::std::tr1::get<1>(GetParam());
+        N = ::std::tr1::get<2>(GetParam());
+        lda = ::std::tr1::get<3>(GetParam());
+        incx = ::std::tr1::get<4>(GetParam());
+        incy = ::std::tr1::get<5>(GetParam());
+        offa = ::std::tr1::get<6>(GetParam());
+        offx = ::std::tr1::get<7>(GetParam());
+        offy = ::std::tr1::get<8>(GetParam());
+        numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+        ComplexLong fAlpha;
+        fAlpha.re = 3, fAlpha.imag = 4;
+        base->setAlpha(fAlpha);    // installs alpha = (3, 4) on the BlasBase instance
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // Defined default; overwritten when BlasBase reports an explicit alpha.
+        paramAlpha.re = paramAlpha.imag = 0;
+        useAlpha = base->useAlpha();
+        if (useAlpha) { paramAlpha = base->alpha(); }
+        if (base->useM()) { M = base->M(); }
+        if (base->useN()) { N = base->N(); }
+
+        rowsA = M;
+        columnsA = N;
+
+        switch (order) {
+        case clblasRowMajor:
+            lda = ::std::max(lda, columnsA);
+            break;
+        case clblasColumnMajor:
+            lda = ::std::max(lda, rowsA);
+            break;
+        }
+
+        printTestParams(order, M, N, useAlpha,
+                        base->alpha(),
+                        lda, incx, incy, offa, offx, offy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    size_t M, N;
+    size_t lda;
+    int incx, incy;
+    size_t offa, offx, offy;
+    unsigned int seed;
+    bool useAlpha;
+    ComplexLong paramAlpha;
+    size_t rowsA, columnsA;
+    ::clMath::BlasBase *base;
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // GER_H_
diff --git a/src/tests/include/gerc.h b/src/tests/include/gerc.h
new file mode 100644
index 0000000..23b09c3
--- /dev/null
+++ b/src/tests/include/gerc.h
@@ -0,0 +1,123 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef GERC_H_
+#define GERC_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+
+using ::testing::TestWithParam;
+
+// Parameterized gtest fixture for the GERC tests: SetUp() decodes the test
+// tuple into members and getParams() copies those members into a TestParams.
+class GERC : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,    // order
+        int,                // M
+        int,                // N
+        int,                // lda
+        int,                // incx
+        int,                // incy
+        int,                // offa  (slots 6-8 are labelled in the order SetUp() reads them;
+        int,                // offx   NOTE(review): confirm the test instantiations agree)
+        int,                // offy  //FIX_ME.. gtest not allowing to add more parameters
+        int                 // numCommandQueues
+        > > {
+public:
+    // Zeroes *params (as GEMM/GEMV/HBMV do), then copies the SetUp() values into it.
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+        params->order = order;
+        params->seed = seed;
+        params->M = M;
+        params->N = N;
+        params->lda = lda;
+        params->incx = incx;
+        params->incy = incy;
+        params->offa = offa;
+        params->offBX = offx;
+        params->offCY = offy;
+        params->rowsA = rowsA;
+        params->alpha = paramAlpha;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Decodes the tuple, applies BlasBase overrides and picks an lda when none is given.
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        M = ::std::tr1::get<1>(GetParam());
+        N = ::std::tr1::get<2>(GetParam());
+        lda = ::std::tr1::get<3>(GetParam());
+        incx = ::std::tr1::get<4>(GetParam());
+        incy = ::std::tr1::get<5>(GetParam());
+        offa = ::std::tr1::get<6>(GetParam());
+        offx = ::std::tr1::get<7>(GetParam());
+        offy = ::std::tr1::get<8>(GetParam());
+        numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+        ComplexLong fAlpha;
+        fAlpha.re = 3, fAlpha.imag = 4;
+        base->setAlpha(fAlpha);    // installs alpha = (3, 4) on the BlasBase instance
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // Defined default; overwritten when BlasBase reports an explicit alpha.
+        paramAlpha.re = paramAlpha.imag = 0;
+        useAlpha = base->useAlpha();
+        if (useAlpha) { paramAlpha = base->alpha(); }
+        if (base->useM()) { M = base->M(); }
+        if (base->useN()) { N = base->N(); }
+
+        rowsA = M;
+        columnsA = N;
+
+        // Only a zero lda is replaced; unlike GER, a non-zero lda is used exactly as given.
+        if (lda == 0) { lda = ::std::max(M, N); }
+
+        printTestParams(order, M, N, useAlpha,
+                        base->alpha(),
+                        lda, incx, incy, offa, offx, offy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    size_t M, N;
+    size_t lda;
+    int incx, incy;
+    size_t offa, offx, offy;
+    unsigned int seed;
+    bool useAlpha;
+    ComplexLong paramAlpha;
+    size_t rowsA, columnsA;
+    ::clMath::BlasBase *base;
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // GERC_H_
diff --git a/src/tests/include/hbmv.h b/src/tests/include/hbmv.h
new file mode 100644
index 0000000..adbd022
--- /dev/null
+++ b/src/tests/include/hbmv.h
@@ -0,0 +1,120 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef HBMV_H_
+#define HBMV_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <blas-random.h>
+#include <blas-math.h>
+#include <tbmv.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+// Parameterized gtest fixture for the HBMV tests: SetUp() decodes the test
+// tuple into members and getParams() copies those members into a TestParams.
+class HBMV : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        int,                // N
+        int,                // K
+        ExtraTestSizes,     // offA/offBX/offCY, strideA.ld, strideBX.inc, strideCY.inc
+        ComplexLong,        // Alpha
+        ComplexLong,        // Beta
+        int                 // numCommandQueues
+        > > {
+public:
+    // Zeroes *params, then copies the values prepared by SetUp() into it.
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->uplo = uplo;
+        params->seed = seed;
+        params->N = N;
+        params->K = KLU;
+        params->lda = lda;
+        params->incx = incx;
+        params->incy = incy;
+        params->offA = offA;
+        params->offa = offA;
+        params->offBX = offx;
+        params->offCY = offy;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Decodes the tuple, applies the queue-count override and normalizes K and lda.
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        N = ::std::tr1::get<2>(GetParam());
+        KLU = ::std::tr1::get<3>(GetParam());
+        ExtraTestSizes extra = ::std::tr1::get<4>(GetParam());
+        offA = extra.offA;
+        offx = extra.offBX;
+        offy = extra.offCY;
+        lda = extra.strideA.ld;
+        incx = extra.strideBX.inc;
+        incy = extra.strideCY.inc;
+        paramAlpha = ::std::tr1::get<5>(GetParam());
+        paramBeta  = ::std::tr1::get<6>(GetParam());
+        numCommandQueues = ::std::tr1::get<7>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); }
+
+        // Keep K below N; with N == 0 the modulo would divide by zero, so K becomes 0.
+        KLU = (N != 0) ? (KLU % N) : 0;
+        lda = ::std::max(lda, (KLU+1));
+
+        printTestParams(order, uplo, N, KLU, paramAlpha, offA,
+                            lda, offx, incx, paramBeta, offy, incy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t  N, KLU;
+    size_t lda;
+    int incx, incy;
+    size_t offA, offx, offy;
+    unsigned int seed;
+    ComplexLong paramAlpha, paramBeta;
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+
+
+#endif  // HBMV_H_
diff --git a/src/tests/include/hemm.h b/src/tests/include/hemm.h
new file mode 100644
index 0000000..000c897
--- /dev/null
+++ b/src/tests/include/hemm.h
@@ -0,0 +1,141 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef HEMM_H_
+#define HEMM_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/**
+ * Parameterized gtest fixture for the HEMM tests.
+ *
+ * SetUp() unpacks the test tuple into the members below, applies the
+ * overrides exposed by BlasBase and widens the leading dimensions;
+ * getParams() exports the resulting state into a TestParams structure.
+ */
+class HEMM : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,        // order
+        clblasSide,         // side
+        clblasUplo,         // uplo
+        int,                // M
+        int,                // N
+        cl_float2,          // alpha
+        cl_float2,          // beta
+        ExtraTestSizes,     // offsets and leading dimensions, bundled to get more than ten parameters in gtest
+        int                 // numCommandQueues
+        > > {
+public:
+    /**
+     * Copy the fixture state into *params.
+     * The CREAL/CIMAG components of alpha and beta are cast to long.
+     */
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->side = side;
+        params->uplo = uplo;
+        params->seed = seed;
+
+        params->M = M;
+        params->N = N;
+
+        params->offA = offA;
+        params->offBX = offb;
+        params->offCY = offc;
+
+        params->lda = lda;
+        params->ldb = ldb;
+        params->ldc = ldc;
+
+        params->alpha.re = (long)CREAL(alpha);
+        params->alpha.imag = (long)CIMAG(alpha);
+        params->beta.re = (long)CREAL(beta);
+        params->beta.imag = (long)CIMAG(beta);
+
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /** Read the test tuple, apply the BlasBase overrides and log the setup. */
+    virtual void SetUp()
+    {
+        const ExtraTestSizes sizes = ::std::tr1::get<7>(GetParam());
+
+        order = ::std::tr1::get<0>(GetParam());
+        side = ::std::tr1::get<1>(GetParam());
+        uplo = ::std::tr1::get<2>(GetParam());
+        M = ::std::tr1::get<3>(GetParam());
+        N = ::std::tr1::get<4>(GetParam());
+        alpha = ::std::tr1::get<5>(GetParam());
+        beta = ::std::tr1::get<6>(GetParam());
+        numCommandQueues = ::std::tr1::get<8>(GetParam());
+
+        offA = sizes.offA;
+        offb = sizes.offBX;
+        offc = sizes.offCY;
+        lda = sizes.strideA.ld;
+        ldb = sizes.strideBX.ld;
+        ldc = sizes.strideCY.ld;
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        // BlasBase may replace the queue count taken from the tuple.
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // BlasBase may replace the problem sizes as well.
+        if (base->useM()) {
+            M = base->M();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // lda is raised to M for clblasLeft and to N for any other side.
+        lda = ::std::max(lda, (side == clblasLeft) ? M : N);
+
+        // ldb/ldc are raised to N for row-major and to M for column-major order.
+        if (order == clblasRowMajor) {
+            ldb = ::std::max(ldb, N);
+            ldc = ::std::max(ldc, N);
+        }
+        else if (order == clblasColumnMajor) {
+            ldb = ::std::max(ldb, M);
+            ldc = ::std::max(ldc, M);
+        }
+
+        printTestParams(order, side, uplo, M, N, 1, alpha, 1, beta, lda, ldb, ldc, offA, offb, offc);
+
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasSide side;
+    clblasUplo uplo;
+    size_t M, N;
+    size_t lda, ldb, ldc;
+    size_t offA, offb, offc;
+    unsigned int seed;
+    cl_float2 alpha, beta;
+    ::clMath::BlasBase *base;
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // HEMM_H_
diff --git a/src/tests/include/hemv.h b/src/tests/include/hemv.h
new file mode 100644
index 0000000..dcdb84a
--- /dev/null
+++ b/src/tests/include/hemv.h
@@ -0,0 +1,149 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#if !defined(HEMV_PACKED)
+    #ifndef HEMV_H
+        #define HEMV_H
+    #else
+        #define DUPLICIT
+    #endif
+#endif
+
+#ifndef DUPLICIT
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/**
+ * Parameterized gtest fixture shared by the HEMV and HPMV tests.
+ *
+ * The class is named HPMV when HEMV_PACKED is defined and HEMV otherwise.
+ * In the packed variant lda is forced to 0; otherwise it is raised to at
+ * least N.
+ */
+#ifndef HEMV_PACKED
+class HEMV : public TestWithParam<
+#else
+class HPMV : public TestWithParam<
+#endif
+    ::std::tr1::tuple<
+        clblasOrder,        // order
+        clblasUplo,         // uplo
+        int,                // N
+        ComplexLong,        // Alpha
+        ComplexLong,        // Beta
+        size_t,             // offA
+        size_t,             // offx
+        size_t,             // offy
+        ExtraTestSizes,     // source of lda, incx and incy
+        int                 // numCommandQueues
+        > > {
+public:
+    /** Copy the fixture state into *params. */
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo = uplo;
+        params->seed = seed;
+        params->N = N;
+        params->lda = lda;
+        params->incx = incx;
+        params->incy = incy;
+        params->offA = offA;
+        params->offBX = offx;
+        params->offCY = offy;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /**
+     * Read the test tuple, apply the overrides exposed by BlasBase, derive
+     * lda and log the resulting setup.
+     */
+    virtual void SetUp()
+    {
+        ExtraTestSizes extra;
+
+        order      = ::std::tr1::get<0>(GetParam());
+        uplo       = ::std::tr1::get<1>(GetParam());
+        N          = ::std::tr1::get<2>(GetParam());
+        paramAlpha = ::std::tr1::get<3>(GetParam());
+        paramBeta  = ::std::tr1::get<4>(GetParam());
+        offA       = ::std::tr1::get<5>(GetParam());
+        offx       = ::std::tr1::get<6>(GetParam());
+        offy       = ::std::tr1::get<7>(GetParam());
+        extra      = ::std::tr1::get<8>(GetParam());
+        lda        = extra.strideA.ld;
+        incx       = extra.strideBX.inc;
+        incy       = extra.strideCY.inc;
+
+        numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        useAlpha = base->useAlpha();
+        if (useAlpha != 0) {
+            paramAlpha = base->alpha();
+        }
+        useBeta = base->useBeta();
+        if (useBeta != 0) {
+            paramBeta = base->beta();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+        if (base->useIncX()) {
+            incx = base->incX();
+        }
+        if (base->useIncY()) {
+            incy = base->incY();
+        }
+
+        // lda must be derived after the N override above: widening it
+        // against the tuple's N would leave lda < N whenever BlasBase
+        // supplies a larger N.
+#ifndef HEMV_PACKED
+        lda = ::std::max( lda, N );
+#else
+        lda = 0;
+#endif
+
+        printTestParams(order, uplo, N, paramAlpha, offA,
+                        lda, offx, incx, paramBeta, offy, incy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t N;
+    size_t lda;
+    int incx, incy;
+    size_t offA, offx, offy;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;
+    ComplexLong paramAlpha, paramBeta;
+
+    ::clMath::BlasBase *base;
+    // Not assigned anywhere in this fixture.
+    cl_ulong imageA, imageX, imageY;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // !DUPLICIT
+
+// DUPLICIT only marks a repeated non-packed inclusion of this header.
+// Drop it here so it cannot leak and suppress a later inclusion, e.g. the
+// HEMV_PACKED pass or another header that also tests DUPLICIT.
+#undef DUPLICIT
diff --git a/src/tests/include/her.h b/src/tests/include/her.h
new file mode 100644
index 0000000..84b405c
--- /dev/null
+++ b/src/tests/include/her.h
@@ -0,0 +1,175 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+#if !defined(HER_PACKED)
+    #ifndef HER_H
+        #define HER_H
+    #else
+        #define DUPLICIT
+    #endif
+#endif
+
+#ifndef DUPLICIT
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+/**
+ * Parameterized gtest fixture shared by the HER and HPR tests.
+ *
+ * The class is named HPR when HER_PACKED is defined and HER otherwise.
+ * In the packed variant lda is forced to 0; otherwise it is raised to at
+ * least N.
+ */
+#ifndef HER_PACKED
+class HER : public TestWithParam<
+#else
+class HPR : public TestWithParam<
+#endif
+
+    // NOTE(review): SetUp() reads element 6 into offa and element 7 into
+    // offx, so the comments below follow the code. The original comments
+    // listed the two the other way round -- confirm against the test
+    // instantiations.
+    ::std::tr1::tuple<
+        clblasOrder,        // order
+        clblasUplo,         // uplo
+        int,                // N
+        double,             // alpha
+        int,                // lda
+        int,                // incx
+        int,                // offa (as read by SetUp)
+        int,                // offx (as read by SetUp)  //FIX_ME.. gtest not allowing to add more parameters
+        int                 // numCommandQueues
+        > > {
+public:
+    /**
+     * Copy the fixture state into *params.
+     * alpha is cast to long and stored in alpha.re only.
+     */
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo = uplo;
+        // Exported for consistency with the sibling fixtures' getParams().
+        params->seed = seed;
+        params->N = N;
+        params->alpha.re = (long)alpha;
+        params->lda = lda;
+        params->incx = incx;
+        params->offa = offa;
+        params->offBX = offx;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /**
+     * Read the test tuple, apply the overrides exposed by BlasBase, derive
+     * lda and log the resulting setup.
+     */
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        N = ::std::tr1::get<2>(GetParam());
+        alpha = ::std::tr1::get<3>(GetParam());
+        lda = ::std::tr1::get<4>(GetParam());
+        incx = ::std::tr1::get<5>(GetParam());
+        offa = ::std::tr1::get<6>(GetParam());
+        offx = ::std::tr1::get<7>(GetParam());
+        numCommandQueues = ::std::tr1::get<8>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // lda must be derived after the N override above: widening it
+        // against the tuple's N would leave lda < N whenever BlasBase
+        // supplies a larger N.
+#ifndef HER_PACKED
+        lda = ::std::max( lda, N );
+#else
+        lda = 0;
+#endif
+
+        printTestParams(order, uplo, N, alpha,
+                        offx, incx, offa, lda );
+
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t  N;
+    size_t lda;
+    int incx;
+    size_t offa, offx;
+    unsigned int seed;
+    double  alpha;
+    // The three members below are not assigned anywhere in this fixture.
+    ComplexLong paramAlpha;
+    size_t rowsA, columnsA;
+    ::clMath::BlasBase *base;
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#ifndef RANDOM_HER
+#define RANDOM_HER
+
+/**
+ * Fill the inputs of a HER/HPR test with bounded random values.
+ *
+ * @param order  storage order forwarded to setElement()/setElementPacked()
+ * @param uplo   forwarded to setElementPacked(); unused when lda != 0
+ * @param N      upper limit of both matrix loop indices
+ * @param alpha  in/out; replaced by random<T>(sqrt(UPPER_BOUND<T>()) / 2)
+ *               when module(CREAL(*alpha)) exceeds that limit
+ * @param A      matrix storage, written through setElement() when
+ *               lda != 0 and through setElementPacked() when lda == 0
+ * @param lda    leading dimension; 0 selects the packed path
+ * @param X      vector storage; may be NULL, in which case it is skipped
+ * @param incx   increment of X; only its absolute value is used
+ */
+template <typename T>
+static void
+randomHerMatrices(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *X,
+    int incx
+    )
+{
+    size_t i, j;
+    size_t lengthX;
+    cl_double bound, max;
+
+    // bound is calculated by solving the equation (alpha*x^2 + x - UPPER_BOUND) < 0
+    bound = UPPER_BOUND<T>();
+    if(module(CREAL(*alpha)) > (sqrt(bound) / (2.0)))
+        *alpha = random<T>((sqrt(bound) / (2.0)));
+
+    // NOTE(review): a zero alpha makes max == 0 and the divisions below
+    // produce a non-finite bound -- confirm callers never pass alpha == 0.
+    max = module(CREAL(*alpha));
+    bound = bound / max / 2.0;
+    bound = sqrt( ((((1.0) / max) / (4.0)) / max) + bound) - ((1.0) / ((2.0) * max));
+
+    if( lda )
+    {
+        for (i = 0; i < N; i++) {
+            for (j = 0; j < N; j++) {
+                setElement<T>(order, clblasNoTrans, i, j, A, lda, random<T>(bound));
+            }
+        }
+    } else {
+        for (i = 0; i < N; i++) {
+            for (j = 0; j < N; j++) {
+                setElementPacked<T>(order, clblasNoTrans, uplo, i, j, A, N, random<T>(bound));
+            }
+        }
+    }
+
+    // N is unsigned: for N == 0 the expression (N - 1) wraps around and,
+    // with |incx| > 1, yields a huge length that overruns X.
+    lengthX = (N == 0) ? 0 : 1 + ((N - 1) * abs(incx));
+    if (X != NULL) {
+        for (i = 0; i < lengthX; i++) {
+            X[i] = random<T>(bound);
+        }
+    }
+}
+#endif // RANDOM_HER
+
+#endif  // !DUPLICIT
+
+// DUPLICIT only marks a repeated non-packed inclusion of this header.
+// Drop it here so it cannot leak and suppress a later inclusion, e.g. the
+// HER_PACKED pass or another header that also tests DUPLICIT.
+#undef DUPLICIT
diff --git a/src/tests/include/her2.h b/src/tests/include/her2.h
new file mode 100644
index 0000000..f64cb30
--- /dev/null
+++ b/src/tests/include/her2.h
@@ -0,0 +1,195 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#if !defined(HER2_PACKED)
+    #ifndef HER2_H
+        #define HER2_H
+    #else
+        #define DUPLICIT
+    #endif
+#endif
+
+#ifndef DUPLICIT
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+/**
+ * Parameterized gtest fixture shared by the HER2 and HPR2 tests.
+ *
+ * The class is named HPR2 when HER2_PACKED is defined and HER2 otherwise.
+ * In the packed variant lda is forced to 0; otherwise it is raised to at
+ * least N. incy is not part of the tuple and always mirrors incx.
+ */
+#ifndef HER2_PACKED
+class HER2 : public TestWithParam<
+#else
+class HPR2 : public TestWithParam<
+#endif
+
+    ::std::tr1::tuple<
+        clblasOrder,        // order
+        clblasUplo,         // uplo
+        int,                // N
+        cl_float2,          // alpha
+        int,                // offx
+        int,                // incx, should be greater than 0
+        int,                // offy
+        //int,              // incy, should be greater than 0.
+                            // Since the tuple does not allow more than 10 arguments we assume incy = incx.
+        int,                // offa
+        int,                // lda, 0 - undefined
+        int                 // numCommandQueues
+        > > {
+public:
+    /**
+     * Copy the fixture state into *params.
+     * The CREAL/CIMAG components of alpha are cast to long, so the value
+     * that is passed on is not the same as what is set in the test case.
+     */
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo  = uplo;
+        params->seed  = seed;
+        params->N     = N;
+        params->alpha.re = (long)(CREAL(alpha));
+        params->alpha.imag = (long)(CIMAG(alpha));
+        params->offBX  = offx;
+        params->incx  = incx;
+        params->offCY  = offy;
+        params->incy  = incy;
+        params->offa  = offa;
+        params->lda   = lda;
+
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /**
+     * Read the test tuple, apply the overrides exposed by BlasBase, derive
+     * lda and log the resulting setup.
+     */
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        uplo  = ::std::tr1::get<1>(GetParam());
+        N     = ::std::tr1::get<2>(GetParam());
+        alpha = ::std::tr1::get<3>(GetParam());
+        offx  = ::std::tr1::get<4>(GetParam());
+        incx  = ::std::tr1::get<5>(GetParam());
+        offy  = ::std::tr1::get<6>(GetParam());
+        offa  = ::std::tr1::get<7>(GetParam());
+        lda   = ::std::tr1::get<8>(GetParam());
+        numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        incy  = incx; //GTest allows only 10 arguments to be passed and
+                      //hence we define incy to be equivalent to incx.
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // lda must be derived after the N override above: widening it
+        // against the tuple's N would leave lda < N whenever BlasBase
+        // supplies a larger N.
+#ifndef HER2_PACKED
+        lda = ::std::max( lda, N );
+#else
+        lda = 0;
+#endif
+
+        printTestParams(order, uplo, N, 1, alpha, offx, incx, offy, incy, offa, lda);
+
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t N;
+    size_t lda;
+    int incx, incy;
+    size_t offx, offy, offa;
+    unsigned int seed;
+    cl_float2 alpha;
+    ::clMath::BlasBase *base;
+    // Not assigned anywhere in this fixture.
+    cl_ulong imageA, imageX, imageY;
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#ifndef RANDOM_HER2
+#define RANDOM_HER2
+
+/**
+ * Fill the inputs of a HER2/HPR2 test with bounded random values.
+ *
+ * @param order  storage order forwarded to setElement()/setElementPacked()
+ * @param uplo   forwarded to setElementPacked(); unused when lda != 0
+ * @param N      upper limit of both matrix loop indices
+ * @param alpha  in/out; replaced by random<T>(sqrt(UPPER_BOUND<T>()) / 4)
+ *               when the larger magnitude of its two components exceeds
+ *               that limit
+ * @param A      matrix storage, written through setElement() when
+ *               lda != 0 and through setElementPacked() when lda == 0
+ * @param lda    leading dimension; 0 selects the packed path
+ * @param X      vector storage; may be NULL, in which case it is skipped
+ * @param incx   increment of X; only its absolute value is used
+ * @param Y      vector storage; may be NULL, in which case it is skipped
+ * @param incy   increment of Y; only its absolute value is used
+ */
+template <typename T>
+static void
+randomHer2Matrices(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    T *alpha,
+    T *A,
+    size_t lda,
+    T *X,
+    int incx,
+    T *Y,
+    int incy
+    )
+{
+    size_t i, j;
+    size_t lengthX;
+    size_t lengthY;
+    cl_double bound, max;
+
+    // bound is calculated by solving the equation (2*alpha*x^2 + x - UPPER_BOUND) < 0
+    bound = UPPER_BOUND<T>();
+
+    // Compare magnitudes, not signed values: taking the signed maximum
+    // first would ignore a large negative component (e.g. -10 vs 1).
+    max = ::std::max( module( alpha->s[0] ), module( alpha->s[1] ) );
+
+    if(max > (sqrt(bound) / (4.0)))
+        *alpha = random<T>((sqrt(bound) / (4.0)));
+    // NOTE(review): a zero alpha makes max == 0 and the divisions below
+    // produce a non-finite bound -- confirm callers never pass alpha == 0.
+    max = ::std::max( module( alpha->s[0] ), module( alpha->s[1] ) );
+
+    bound = bound / ( 2 * max);
+    bound = sqrt( ((((1.0) / max) / (16.0)) / max) + bound) - ((1.0) / ((4.0) * max));
+
+    if( lda )
+    {
+        for (i = 0; i < N; i++) {
+            for (j = 0; j < N; j++) {
+                setElement<T>(order, clblasNoTrans, i, j, A, lda, random<T>(bound));
+            }
+        }
+    } else {
+        for (i = 0; i < N; i++) {
+            for (j = 0; j < N; j++) {
+                setElementPacked<T>(order, clblasNoTrans, uplo, i, j, A, N, random<T>(bound));
+            }
+        }
+    }
+
+    // N is unsigned: for N == 0 the expression (N - 1) wraps around and,
+    // with an increment of magnitude > 1, yields a huge length that
+    // overruns the vector.
+    lengthX = (N == 0) ? 0 : 1 + ((N - 1) * abs(incx));
+    if (X != NULL) {
+        for (i = 0; i < lengthX; i++) {
+            X[i] = random<T>(bound);
+        }
+    }
+    lengthY = (N == 0) ? 0 : 1 + (N - 1) * abs(incy);
+    if (Y != NULL) {
+        for (i = 0; i < lengthY; i++) {
+            Y[i] = random<T>(bound);
+        }
+    }
+}
+#endif //RANDOM_HER2
+
+#endif  // !DUPLICIT
+
+// DUPLICIT only marks a repeated non-packed inclusion of this header.
+// Drop it here so it cannot leak and suppress a later inclusion, e.g. the
+// HER2_PACKED pass or another header that also tests DUPLICIT.
+#undef DUPLICIT
diff --git a/src/tests/include/her2k.h b/src/tests/include/her2k.h
new file mode 100644
index 0000000..32e6172
--- /dev/null
+++ b/src/tests/include/her2k.h
@@ -0,0 +1,167 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef HER2K_H_
+#define HER2K_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <common.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/**
+ * Parameterized gtest fixture for the HER2K tests.
+ *
+ * SetUp() unpacks the test tuple, applies the overrides exposed by
+ * BlasBase and derives the row/column counts and leading dimensions of
+ * A, B and C; getParams() exports that state into a TestParams structure.
+ */
+class HER2K : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,        // order
+        clblasUplo,         // uplo
+        clblasTranspose,    // transA
+        int,                // N
+        int,                // K
+        ComplexLong,        // alpha
+        ComplexLong,        // beta
+        ExtraTestSizes,     // offa, offb, offc, lda, ldb, ldc.
+        int                 // numCommandQueues
+        > > {
+public:
+    /** Copy the fixture state into *params. */
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->seed = seed;
+        params->numCommandQueues = numCommandQueues;
+
+        params->N = N;
+        params->K = K;
+        params->alpha = paramAlpha;
+        params->beta  = paramBeta;
+
+        // The offset of A is written to both offA and offa.
+        params->offA = offa;
+        params->offa = offa;
+        params->offBX = offB;
+        params->offCY = offC;
+
+        params->lda = lda;
+        params->ldb = ldb;
+        params->ldc = ldc;
+
+        params->rowsA = rowsA;
+        params->columnsA = columnsA;
+        params->rowsB = rowsB;
+        params->columnsB = columnsB;
+        params->rowsC = rowsC;
+        params->columnsC = columnsC;
+    }
+
+protected:
+    /** Read the test tuple, derive the matrix shapes and log the setup. */
+    virtual void SetUp()
+    {
+        const ExtraTestSizes sizes = ::std::tr1::get<7>(GetParam());
+
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        transA = ::std::tr1::get<2>(GetParam());
+        N = ::std::tr1::get<3>(GetParam());
+        K = ::std::tr1::get<4>(GetParam());
+        paramAlpha = ::std::tr1::get<5>(GetParam());
+        paramBeta  = ::std::tr1::get<6>(GetParam());
+        paramBeta.imag = 0;     // Beta is a real number
+        numCommandQueues = ::std::tr1::get<8>(GetParam());
+
+        offa = sizes.offA;
+        offB = sizes.offBX;
+        offC = sizes.offCY;
+        lda = sizes.strideA.ld;
+        ldb = sizes.strideBX.ld;
+        ldc = sizes.strideCY.ld;
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        // BlasBase may replace the queue count and the problem sizes.
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+        if (base->useK()) {
+            K = base->K();
+        }
+
+        // A and B share one shape: N x K for clblasNoTrans, K x N otherwise.
+        const bool noTrans = (transA == clblasNoTrans);
+        rowsA = rowsB = noTrans ? N : K;
+        columnsA = columnsB = noTrans ? K : N;
+        rowsC = N;
+        columnsC = N;
+
+        // Each leading dimension is raised to the extent it strides over,
+        // and that extent is then replaced by the leading dimension.
+        if (order == clblasRowMajor) {
+            lda = ::std::max(lda, columnsA);
+            ldb = ::std::max(ldb, columnsB);
+            ldc = ::std::max(ldc, columnsC);
+            columnsA = lda;
+            columnsB = ldb;
+            columnsC = ldc;
+        }
+        else if (order == clblasColumnMajor) {
+            lda = ::std::max(lda, rowsA);
+            ldb = ::std::max(ldb, rowsB);
+            ldc = ::std::max(ldc, rowsC);
+            rowsA = lda;
+            rowsB = ldb;
+            rowsC = ldc;
+        }
+
+        printTestParams(order, uplo, transA, N, K, true, paramAlpha,
+                            offa, lda, offB, ldb, true, paramBeta, offC, ldc);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    clblasTranspose transA;
+    size_t N, K;
+    size_t offa, offC, offB;
+    size_t lda, ldc, ldb;
+    unsigned int seed;
+
+    ComplexLong paramAlpha, paramBeta;
+
+    size_t rowsA, columnsA;
+    size_t rowsC, columnsC;
+    size_t rowsB, columnsB;
+
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // HER2K_H_
diff --git a/src/tests/include/herk.h b/src/tests/include/herk.h
new file mode 100644
index 0000000..7a5d5b5
--- /dev/null
+++ b/src/tests/include/herk.h
@@ -0,0 +1,160 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef HERK_H_
+#define HERK_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <common.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/**
+ * Parameterized gtest fixture for the HERK tests.
+ *
+ * SetUp() unpacks the test tuple, applies the overrides exposed by
+ * BlasBase and derives the row/column counts and leading dimensions of
+ * A and C; getParams() exports that state into a TestParams structure.
+ */
+class HERK : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,        // order
+        clblasUplo,         // uplo
+        clblasTranspose,    // transA
+        int,                // N
+        int,                // K
+        ComplexLong,        // alpha
+        ComplexLong,        // beta
+        ExtraTestSizes,     // offa, offc, lda, ldc.
+        int                 // numCommandQueues
+        > > {
+public:
+    /** Copy the fixture state into *params. */
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->seed = seed;
+        params->numCommandQueues = numCommandQueues;
+
+        params->N = N;
+        params->K = K;
+        params->alpha = paramAlpha;
+        params->beta  = paramBeta;
+
+        params->offA = offA;
+        params->offCY = offC;
+        params->lda = lda;
+        params->ldc = ldc;
+
+        params->rowsA = rowsA;
+        params->columnsA = columnsA;
+        params->rowsC = rowsC;
+        params->columnsC = columnsC;
+    }
+
+protected:
+    /** Read the test tuple, derive the matrix shapes and log the setup. */
+    virtual void SetUp()
+    {
+        const ExtraTestSizes sizes = ::std::tr1::get<7>(GetParam());
+
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        transA = ::std::tr1::get<2>(GetParam());
+        N = ::std::tr1::get<3>(GetParam());
+        K = ::std::tr1::get<4>(GetParam());
+        paramAlpha = ::std::tr1::get<5>(GetParam());
+        paramBeta  = ::std::tr1::get<6>(GetParam());
+        numCommandQueues = ::std::tr1::get<8>(GetParam());
+
+        offA = sizes.offA;
+        offC = sizes.offCY;
+        lda = sizes.strideA.ld;
+        ldc = sizes.strideCY.ld;
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        // BlasBase may replace the queue count, the scalars and the sizes.
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        useAlpha = base->useAlpha();
+        if (useAlpha != 0) {
+            paramAlpha = base->alpha();
+        }
+        useBeta = base->useBeta();
+        if (useBeta != 0) {
+            paramBeta = base->beta();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+        if (base->useK()) {
+            K = base->K();
+        }
+
+        // A is N x K for clblasNoTrans and K x N otherwise; C is N x N.
+        const bool noTrans = (transA == clblasNoTrans);
+        rowsA = noTrans ? N : K;
+        columnsA = noTrans ? K : N;
+        rowsC = N;
+        columnsC = N;
+
+        // Each leading dimension is raised to the extent it strides over,
+        // and that extent is then replaced by the leading dimension.
+        if (order == clblasRowMajor) {
+            lda = ::std::max(lda, columnsA);
+            ldc = ::std::max(ldc, columnsC);
+            columnsA = lda;
+            columnsC = ldc;
+        }
+        else if (order == clblasColumnMajor) {
+            lda = ::std::max(lda, rowsA);
+            ldc = ::std::max(ldc, rowsC);
+            rowsA = lda;
+            rowsC = ldc;
+        }
+
+        printTestParams(order, uplo, transA, N, K, true, paramAlpha,
+                        offA, lda, true, paramBeta, offC, ldc);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    clblasTranspose transA;
+    size_t N, K;
+    size_t offA, offC;
+    size_t lda, ldc;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;
+    ComplexLong paramAlpha, paramBeta;
+
+    size_t rowsA, columnsA;
+    size_t rowsC, columnsC;
+
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // HERK_H_
diff --git a/src/tests/include/hpmv.h b/src/tests/include/hpmv.h
new file mode 100644
index 0000000..62111cf
--- /dev/null
+++ b/src/tests/include/hpmv.h
@@ -0,0 +1,27 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef HPMV_H_
+#define HPMV_H_
+
+#define HEMV_PACKED
+
+#include <hemv.h>
+
+#undef HEMV_PACKED
+
+#endif  // HPMV_H_
\ No newline at end of file
diff --git a/src/tests/include/hpr.h b/src/tests/include/hpr.h
new file mode 100644
index 0000000..9f002d6
--- /dev/null
+++ b/src/tests/include/hpr.h
@@ -0,0 +1,28 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+
+#ifndef HPR_H_
+
+#define HPR_H_
+#define HER_PACKED
+#include "her.h"
+
+#undef  HER_PACKED
+
+#endif
\ No newline at end of file
diff --git a/src/tests/include/hpr2.h b/src/tests/include/hpr2.h
new file mode 100644
index 0000000..6ae9c01
--- /dev/null
+++ b/src/tests/include/hpr2.h
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef HPR2_H_
+
+#define HPR2_H_
+#define HER2_PACKED
+#include "her2.h"
+
+#undef  HER2_PACKED
+
+#endif
\ No newline at end of file
diff --git a/src/tests/include/iamax.h b/src/tests/include/iamax.h
new file mode 100644
index 0000000..b81c50a
--- /dev/null
+++ b/src/tests/include/iamax.h
@@ -0,0 +1,78 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the iAMAX tests.
+ * Tuple layout: N, incx, offx, offa (kept here as offiAmax), numCommandQueues.
+ */
+class iAMAX : public TestWithParam<
+    ::std::tr1::tuple<
+    int,                // N
+    int,                // incx, should be greater than 0
+    int,                // offx
+    int,                // offa -- for offiAmax
+    int                 // numCommandQueues
+        > > {
+public:
+    // Copy the values captured by SetUp() into the caller-supplied record.
+    void getParams(TestParams *params)
+    {
+        params->N                = N;
+        params->incx             = incx;
+        params->offBX            = offx;
+        params->offa             = offiAmax;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Unpack the test tuple, then let the BlasBase singleton override the
+    // queue count and N when it reports that an override is in effect.
+    virtual void SetUp()
+    {
+        const ParamType &args = GetParam();
+
+        N                = ::std::tr1::get<0>(args);
+        incx             = ::std::tr1::get<1>(args);
+        offx             = ::std::tr1::get<2>(args);
+        offiAmax         = ::std::tr1::get<3>(args);
+        numCommandQueues = ::std::tr1::get<4>(args);
+
+        base = ::clMath::BlasBase::getInstance();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        printTestParams(N, offx, incx);
+        ::std::cerr << "offiAmax = " << offiAmax << ::std::endl;
+    }
+
+    size_t N;
+    int incx;
+    size_t offx, offiAmax;
+
+    ::clMath::BlasBase *base;
+    // Not assigned by any method of this fixture.
+    cl_ulong imageA, imageX;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+
diff --git a/src/tests/include/matrix.h b/src/tests/include/matrix.h
new file mode 100644
index 0000000..65757ad
--- /dev/null
+++ b/src/tests/include/matrix.h
@@ -0,0 +1,798 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef MATRIX_H_
+#define MATRIX_H_
+
+#include <clBLAS.h>
+#include <blas-math.h>
+#include <stdio.h>
+#include <iomanip>
+
+// Data Generation
+#include <testDG.h>
+
+/*
+ * Read element (row, column) of a matrix held in array A.
+ *
+ * lda > 0  : general storage with leading dimension lda; `order` together
+ *            with `trans` decides which of row/column is scaled by lda.
+ * lda == 0 : packed storage. Only the row-major layout is served (through
+ *            the RMLPacked macro); any other order yields FNAN<T>().
+ *            `trans` is not consulted on this path.
+ */
+template <typename T>
+static T
+getElement(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t row,
+    size_t column,
+    const T *A,
+    size_t lda)
+{
+    if (lda == 0) {
+        // Locals required by the testDG.h macro, which refers to them by name.
+        int vectorLength = 1;
+        const T* data = A;
+
+        if (order != clblasRowMajor) {
+            // The column-major packed lookup (CMLPacked) is not enabled here.
+            return FNAN<T>();
+        }
+        return *RMLPacked(row, column);
+    }
+
+    const bool plain = (trans == clblasNoTrans);
+
+    switch (order) {
+    case clblasRowMajor:
+        return plain ? A[lda * row + column] : A[lda * column + row];
+    case clblasColumnMajor:
+        return plain ? A[lda * column + row] : A[lda * row + column];
+    }
+
+    /* Only reached when `order` matches neither enumerator */
+    return FNAN<T>();
+}
+
+/*
+ * Store `value` at element (row, column) of a matrix kept in general
+ * storage with leading dimension lda. `order` and `trans` select which of
+ * row/column is multiplied by lda. Nothing is written when `order` matches
+ * neither enumerator.
+ */
+template <typename T>
+static void
+setElement(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t row,
+    size_t column,
+    T *A,
+    size_t lda,
+    T value)
+{
+    const bool plain = (trans == clblasNoTrans);
+
+    if (order == clblasRowMajor) {
+        if (plain) {
+            A[lda * row + column] = value;
+        }
+        else {
+            A[lda * column + row] = value;
+        }
+    }
+    else if (order == clblasColumnMajor) {
+        if (plain) {
+            A[lda * column + row] = value;
+        }
+        else {
+            A[lda * row + column] = value;
+        }
+    }
+}
+
+/*
+ * Store `value` at (row, column) of a packed triangular matrix.
+ *
+ * A transposed request flips the triangle that is addressed. Coordinates
+ * that fall outside the addressed triangle are ignored without writing.
+ * `rows` is not referenced directly in this body; presumably the packed
+ * index macros from testDG.h consume it -- TODO confirm.
+ */
+template <typename T>
+static void
+setElementPacked(
+    clblasOrder order,
+    clblasTranspose trans,
+    clblasUplo uplo,
+    size_t row,
+    size_t column,
+    T *A,
+    size_t rows,
+    T value)
+{
+    // Locals required by the testDG.h macros, which refer to them by name.
+    int vectorLength = 1;
+    const T* data = A;
+
+    // Effective triangle after accounting for transposition.
+    clblasUplo fUplo = uplo;
+    if (trans != clblasNoTrans) {
+        fUplo = (uplo == clblasUpper) ? clblasLower : clblasUpper;
+    }
+    const bool lower = (fUplo == clblasLower);
+
+    // Never touch an element outside the stored triangle.
+    if (lower ? (column > row) : (column < row)) {
+        return;
+    }
+
+    if (order == clblasRowMajor) {
+        if (lower) {
+            *RMLPacked(row, column) = value;
+        }
+        else {
+            *RMUPacked(row, column) = value;
+        }
+    }
+    else if (order == clblasColumnMajor) {
+        if (lower) {
+            *CMLPacked(row, column) = value;
+        }
+        else {
+            *CMUPacked(row, column) = value;
+        }
+    }
+}
+
+/*
+ * Read (row, column) from a packed triangular matrix.
+ *
+ * A transposed request flips the triangle that is addressed. FNAN<T>() is
+ * returned for coordinates outside the addressed triangle and for an
+ * `order` value that matches neither enumerator.
+ * `rows` is not referenced directly in this body; presumably the packed
+ * index macros from testDG.h consume it -- TODO confirm.
+ */
+template <typename T>
+static T
+getElementPacked(
+    clblasOrder order,
+    clblasTranspose trans,
+    clblasUplo uplo,
+    size_t row,
+    size_t column,
+    T *A,
+    size_t rows)
+{
+    // Locals required by the testDG.h macros, which refer to them by name.
+    int vectorLength = 1;
+    const T* data = A;
+
+    // Effective triangle after accounting for transposition.
+    clblasUplo fUplo = uplo;
+    if (trans != clblasNoTrans) {
+        fUplo = (uplo == clblasUpper) ? clblasLower : clblasUpper;
+    }
+    const bool lower = (fUplo == clblasLower);
+
+    // Never read an element outside the stored triangle.
+    if (lower ? (column > row) : (column < row)) {
+        return FNAN<T>();
+    }
+
+    switch (order) {
+        case clblasRowMajor:
+            if (lower) {
+                return *RMLPacked(row, column);
+            }
+            return *RMUPacked(row, column);
+        case clblasColumnMajor:
+            if (lower) {
+                return *CMLPacked(row, column);
+            }
+            return *CMUPacked(row, column);
+        default:
+            return FNAN<T>();
+    }
+}
+
+
+// Write one value to std::cout followed by a tab.
+template <typename T>
+static void
+printElement(T a)
+{
+    std::cout << a;
+    std::cout << "\t";
+}
+
+// Write both components (s[0], s[1]) of a FloatComplex to std::cout as
+// "(s0, s1)" followed by a tab.
+template<>
+__template_static void
+printElement<FloatComplex>(FloatComplex a)
+{
+    std::cout << "(" << a.s[0];
+    std::cout << ", " << a.s[1];
+    std::cout << ")\t";
+}
+
+// Write both components (s[0], s[1]) of a DoubleComplex to std::cout as
+// "(s0, s1)" followed by a tab.
+template<>
+__template_static void
+printElement<DoubleComplex>(DoubleComplex a)
+{
+    std::cout << "(" << a.s[0];
+    std::cout << ", " << a.s[1];
+    std::cout << ")\t";
+}
+
+/*
+ * Print an nrRows x nrCols window of matrix A to std::cout, one matrix row
+ * per output line, followed by two blank lines. The window starts at
+ * (startRow, startCol); elements are fetched untransposed via getElement().
+ */
+template <typename T>
+static void
+printMatrixBlock(
+    clblasOrder order,
+    size_t startRow,
+    size_t startCol,
+    size_t nrRows,
+    size_t nrCols,
+    size_t lda,
+    T *A)
+{
+    // FIXME : packed matrices are not handled specially here
+    for (size_t r = 0; r < nrRows; r++) {
+        for (size_t c = 0; c < nrCols; c++) {
+            T cell = getElement(order, clblasNoTrans, startRow + r,
+                                startCol + c, A, lda);
+            printElement<T>(cell);
+        }
+        std::cout << std::endl;
+    }
+    std::cout << std::endl << std::endl;
+}
+
+/*
+ * Copy the rowsA x columnsA matrix A into B using the opposite storage
+ * order: a column-major A becomes a row-major B and vice versa. Both
+ * matrices are treated as tightly stored, i.e. the leading dimension is
+ * the row count (column-major) or the column count (row-major).
+ */
+template <typename T>
+static void
+reorderMatrix(
+    clblasOrder order,
+    size_t rowsA,
+    size_t columnsA,
+    const T *A,
+    T *B)
+{
+    // Defaults are kept when `order` matches neither enumerator.
+    clblasOrder orderB = clblasRowMajor;
+    size_t lda = 0;
+    size_t ldb = 0;
+
+    if (order == clblasColumnMajor) {
+        orderB = clblasRowMajor;
+        lda = rowsA;
+        ldb = columnsA;
+    }
+    else if (order == clblasRowMajor) {
+        orderB = clblasColumnMajor;
+        lda = columnsA;
+        ldb = rowsA;
+    }
+
+    for (size_t r = 0; r < rowsA; r++) {
+        for (size_t c = 0; c < columnsA; c++) {
+            const T cell = getElement<T>(order, clblasNoTrans, r, c, A, lda);
+            setElement<T>(orderB, clblasNoTrans, r, c, B, ldb, cell);
+        }
+    }
+}
+
+/*
+ * Element-wise comparison of two M x N matrices with ASSERT_NEAR.
+ *
+ * lda > 0  : general storage; every element is compared and absDelta, when
+ *            given, supplies the tolerance for element (m, n) at index
+ *            m * N + n. Without absDelta the tolerance is 0.
+ * lda == 0 : packed storage; only elements with n <= m are compared and the
+ *            tolerance is always 0 (absDelta is not used on this path).
+ *
+ * The coordinates of a mismatching element are printed to stdout before
+ * the assertion fires.
+ *
+ * NOTE(review): on the packed path elements are fetched with getElement()
+ * and lda == 0, which returns FNAN for any order other than row-major --
+ * confirm callers only use row-major data here.
+ */
+template <typename T>
+static void
+compareMatrices(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    const T *A,
+    const T *B,
+    size_t lda,
+    const cl_double *absDelta = NULL)
+{
+    if (lda > 0) {
+        // General case
+        for (size_t r = 0; r < M; r++) {
+            for (size_t c = 0; c < N; c++) {
+                const T lhs = getElement<T>(order, clblasNoTrans, r, c, A, lda);
+                const T rhs = getElement<T>(order, clblasNoTrans, r, c, B, lda);
+                const cl_double tol = (absDelta != NULL) ? absDelta[r * N + c] : 0.0;
+
+                if (module(lhs - rhs) > tol) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(lhs, rhs, tol);
+            }
+        }
+        return;
+    }
+
+    // Packed case: tolerance is fixed at zero.
+    const cl_double tol = 0.0;
+
+    if (order == clblasColumnMajor) {
+        for (size_t c = 0; c < N; c++) {
+            for (size_t r = c; r < M; r++) {
+                const T lhs = getElement<T>(order, clblasNoTrans, r, c, A, lda);
+                const T rhs = getElement<T>(order, clblasNoTrans, r, c, B, lda);
+
+                if (module(lhs - rhs) > tol) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(lhs, rhs, tol);
+            }
+        }
+    }
+    else {
+        for (size_t r = 0; r < M; r++) {
+            for (size_t c = 0; c <= r; c++) {
+                const T lhs = getElement<T>(order, clblasNoTrans, r, c, A, lda);
+                const T rhs = getElement<T>(order, clblasNoTrans, r, c, B, lda);
+
+                if (module(lhs - rhs) > tol) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(lhs, rhs, tol);
+            }
+        }
+    }
+}
+
+/*
+ * FloatComplex specialization: real and imaginary parts are compared
+ * separately with ASSERT_NEAR, real part first.
+ *
+ * lda > 0  : general storage; absDelta, when given, supplies the tolerance
+ *            for element (m, n) at index m * N + n, otherwise 0.
+ * lda == 0 : packed storage; only elements with n <= m are compared and the
+ *            tolerance is always 0.
+ *
+ * The coordinates of a mismatching element are printed to stdout before
+ * the assertions run.
+ */
+template<>
+__template_static void
+compareMatrices<FloatComplex>(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    const FloatComplex *A,
+    const FloatComplex *B,
+    size_t lda,
+    const cl_double *absDelta)
+{
+    FloatComplex lhs, rhs;
+
+    if (lda > 0) {
+        // General case
+        for (size_t r = 0; r < M; r++) {
+            for (size_t c = 0; c < N; c++) {
+                lhs = getElement<FloatComplex>(order, clblasNoTrans, r, c, A, lda);
+                rhs = getElement<FloatComplex>(order, clblasNoTrans, r, c, B, lda);
+                const cl_double tol = (absDelta != NULL) ? absDelta[r * N + c] : 0.0;
+
+                if ((module(CREAL(lhs) - CREAL(rhs)) > tol) || (module(CIMAG(lhs) - CIMAG(rhs)) > tol)) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(CREAL(lhs), CREAL(rhs), tol);
+                ASSERT_NEAR(CIMAG(lhs), CIMAG(rhs), tol);
+            }
+        }
+        return;
+    }
+
+    // Packed case: tolerance is fixed at zero.
+    const cl_double tol = 0.0;
+
+    if (order == clblasColumnMajor) {
+        for (size_t c = 0; c < N; c++) {
+            for (size_t r = c; r < M; r++) {
+                lhs = getElement<FloatComplex>(order, clblasNoTrans, r, c, A, lda);
+                rhs = getElement<FloatComplex>(order, clblasNoTrans, r, c, B, lda);
+
+                if ((module(CREAL(lhs) - CREAL(rhs)) > tol) || (module(CIMAG(lhs) - CIMAG(rhs)) > tol)) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(CREAL(lhs), CREAL(rhs), tol);
+                ASSERT_NEAR(CIMAG(lhs), CIMAG(rhs), tol);
+            }
+        }
+    }
+    else {
+        for (size_t r = 0; r < M; r++) {
+            for (size_t c = 0; c <= r; c++) {
+                lhs = getElement<FloatComplex>(order, clblasNoTrans, r, c, A, lda);
+                rhs = getElement<FloatComplex>(order, clblasNoTrans, r, c, B, lda);
+
+                if ((module(CREAL(lhs) - CREAL(rhs)) > tol) || (module(CIMAG(lhs) - CIMAG(rhs)) > tol)) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(CREAL(lhs), CREAL(rhs), tol);
+                ASSERT_NEAR(CIMAG(lhs), CIMAG(rhs), tol);
+            }
+        }
+    }
+}
+
+/*
+ * DoubleComplex specialization: real and imaginary parts are compared
+ * separately with ASSERT_NEAR, real part first.
+ *
+ * lda > 0  : general storage; absDelta, when given, supplies the tolerance
+ *            for element (m, n) at index m * N + n, otherwise 0.
+ * lda == 0 : packed storage; only elements with n <= m are compared and the
+ *            tolerance is always 0.
+ *
+ * The coordinates of a mismatching element are printed to stdout before
+ * the assertions run.
+ */
+template<>
+__template_static void
+compareMatrices<DoubleComplex>(
+    clblasOrder order,
+    size_t M,
+    size_t N,
+    const DoubleComplex *A,
+    const DoubleComplex *B,
+    size_t lda,
+    const cl_double *absDelta)
+{
+    DoubleComplex lhs, rhs;
+
+    if (lda > 0) {
+        // General case
+        for (size_t r = 0; r < M; r++) {
+            for (size_t c = 0; c < N; c++) {
+                lhs = getElement<DoubleComplex>(order, clblasNoTrans, r, c, A, lda);
+                rhs = getElement<DoubleComplex>(order, clblasNoTrans, r, c, B, lda);
+                const cl_double tol = (absDelta != NULL) ? absDelta[r * N + c] : 0.0;
+
+                if ((module(CREAL(lhs) - CREAL(rhs)) > tol) || (module(CIMAG(lhs) - CIMAG(rhs)) > tol)) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(CREAL(lhs), CREAL(rhs), tol);
+                ASSERT_NEAR(CIMAG(lhs), CIMAG(rhs), tol);
+            }
+        }
+        return;
+    }
+
+    // Packed case: tolerance is fixed at zero.
+    const cl_double tol = 0.0;
+
+    if (order == clblasColumnMajor) {
+        for (size_t c = 0; c < N; c++) {
+            for (size_t r = c; r < M; r++) {
+                lhs = getElement<DoubleComplex>(order, clblasNoTrans, r, c, A, lda);
+                rhs = getElement<DoubleComplex>(order, clblasNoTrans, r, c, B, lda);
+
+                if ((module(CREAL(lhs) - CREAL(rhs)) > tol) || (module(CIMAG(lhs) - CIMAG(rhs)) > tol)) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(CREAL(lhs), CREAL(rhs), tol);
+                ASSERT_NEAR(CIMAG(lhs), CIMAG(rhs), tol);
+            }
+        }
+    }
+    else {
+        for (size_t r = 0; r < M; r++) {
+            for (size_t c = 0; c <= r; c++) {
+                lhs = getElement<DoubleComplex>(order, clblasNoTrans, r, c, A, lda);
+                rhs = getElement<DoubleComplex>(order, clblasNoTrans, r, c, B, lda);
+
+                if ((module(CREAL(lhs) - CREAL(rhs)) > tol) || (module(CIMAG(lhs) - CIMAG(rhs)) > tol)) {
+                    printf("m : %d\t n: %d\n", (int)r, (int)c);
+                }
+                ASSERT_NEAR(CREAL(lhs), CREAL(rhs), tol);
+                ASSERT_NEAR(CIMAG(lhs), CIMAG(rhs), tol);
+            }
+        }
+    }
+}
+
+// Overwrite the first `len` elements of buf with FNAN<T>().
+template <typename T>
+static void
+setNans(
+    size_t len,
+    T *buf)
+{
+    T *const stop = buf + len;
+
+    for (T *cur = buf; cur != stop; ++cur) {
+        *cur = FNAN<T>();
+    }
+}
+
+/*
+ * Overwrite the unused triangle of the N x N matrix A with FNAN<T>().
+ * For clblasUpper the strictly-lower elements (column < row) are replaced;
+ * for clblasLower the strictly-upper elements (column > row) are replaced.
+ * The diagonal is never touched; any other uplo value leaves A unchanged.
+ */
+template <typename T>
+static void
+setTriangleNans(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    T *A,
+    size_t lda)
+{
+    if (uplo == clblasUpper) {
+        for (size_t r = 0; r < N; r++) {
+            for (size_t c = 0; c < r; c++) {
+                setElement<T>(order, clblasNoTrans, r, c, A, lda, FNAN<T>());
+            }
+        }
+    }
+    else if (uplo == clblasLower) {
+        for (size_t r = 0; r < N; r++) {
+            for (size_t c = r + 1; c < N; c++) {
+                setElement<T>(order, clblasNoTrans, r, c, A, lda, FNAN<T>());
+            }
+        }
+    }
+}
+
+/*
+ * Fill every slot of buffer B (memLen elements) that is NOT occupied by the
+ * strided vector with FNAN<T>(). The vector has N elements, starts at index
+ * `offset` and advances by `dx` slots per element, so its elements live at
+ * offset, offset + dx, ..., offset + dx * (N - 1).
+ *
+ * Edge cases:
+ *   N == 0  -- no slot is occupied, so the first memLen elements are all
+ *              filled (previously N - 1 wrapped around and the middle loop
+ *              could run far past the buffer).
+ *   dx == 0 -- all elements share the slot at `offset`; that slot is kept
+ *              and the modulo by zero is avoided.
+ */
+template <typename T>
+static void
+setVectorNans(
+    size_t offset,
+    size_t dx,
+    T *B,
+    size_t N,
+    size_t memLen)
+{
+    size_t i;
+
+    if (N == 0) {
+        for (i = 0; i < memLen; i++) {
+            B[i] = FNAN<T>();
+        }
+        return;
+    }
+
+    // Index of the last slot occupied by the vector.
+    const size_t last = offset + dx * (N - 1);
+
+    // Leading padding in front of the vector.
+    for (i = 0; i < offset; i++) {
+        B[i] = FNAN<T>();
+    }
+    // Gaps between consecutive vector elements.
+    for (i = offset; i <= last; i++) {
+        if (dx != 0 && ((i - offset) % dx) != 0) {
+            B[i] = FNAN<T>();
+        }
+    }
+    // Trailing padding behind the vector.
+    for (; i < memLen; i++) {
+        B[i] = FNAN<T>();
+    }
+}
+
+/*
+ * Compare two strided vectors embedded in buffers of memLen elements.
+ *
+ * The vector has N elements, starts at index `offset` and uses stride `dy`.
+ * Vector values are compared through compareMatrices(); every slot that is
+ * not part of the vector (leading padding, gaps between elements, trailing
+ * padding) must be bit-identical in both buffers, which is checked with
+ * memcmp.
+ *
+ * Fixes over the previous version:
+ *   - every gap between consecutive elements is checked; before, the gap
+ *     bounds were never advanced, so only the first gap was examined
+ *     N - 1 times;
+ *   - dy == 0 no longer produces a wrapped-around memcmp length;
+ *   - N == 0 no longer wraps N - 1 in the gap loop or the tail offset;
+ *   - the tail is skipped when it would start at or beyond memLen.
+ */
+template <typename T>
+static void
+compareVectors(
+    size_t offset,
+    size_t N,
+    size_t dy,
+    size_t memLen,
+    T *blasC,
+    T *clblasC)
+{
+    const size_t elemSize = sizeof(blasC[0]);
+    size_t tailBegin;
+
+    // check the beginning containing NANs
+    ASSERT_FALSE(memcmp(blasC, clblasC, offset * elemSize));
+
+    // check vector values
+    compareMatrices<T>(clblasRowMajor, N, 1, blasC + offset,
+                       clblasC + offset, dy);
+
+    // check NANs between vector values: gap i lies strictly between
+    // element i and element i + 1 and is dy - 1 slots long
+    if (dy > 1 && N > 1) {
+        const size_t gapLen = dy - 1;
+        for (size_t i = 0; i + 1 < N; i++) {
+            const size_t gapStart = offset + i * dy + 1;
+            ASSERT_FALSE(memcmp(blasC + gapStart, clblasC + gapStart,
+                                gapLen * elemSize));
+        }
+    }
+
+    // check tail containing NANs: it begins right behind the last element
+    tailBegin = offset;
+    if (N > 0) {
+        tailBegin += (N - 1) * dy + 1;
+    }
+    if (tailBegin < memLen) {
+        ASSERT_FALSE(memcmp(blasC + tailBegin, clblasC + tailBegin,
+                            (memLen - tailBegin) * elemSize));
+    }
+}
+
+
+
+/*
+ * Read element (row, column) of a banded matrix; works only for NxN
+ * matrices. The line selected by lda is the matrix row (row-major) or the
+ * matrix column (column-major); K enters the in-line index for the
+ * lower/row-major and upper/column-major combinations.
+ * Returns FNAN<T>() when `order` matches neither enumerator.
+ */
+template <typename T>
+static T
+getElementBanded(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t row,
+    size_t column,
+    size_t K,
+    const T *A,
+    size_t lda)
+{
+    const bool lower = (uplo == clblasLower);
+
+    if (order == clblasRowMajor) {
+        const T *line = A + lda * row;
+        if (lower) {
+            return line[ K - (row-column) ];
+        }
+        return line[ column-row ];
+    }
+    if (order == clblasColumnMajor) {
+        const T *line = A + lda * column;
+        if (lower) {
+            return line[ row-column ];
+        }
+        return line[ K - (column-row) ];
+    }
+
+    /* Only reached when `order` matches neither enumerator */
+    return FNAN<T>();
+}
+
+/*
+ * Store `value` at element (row, column) of a banded matrix. The line
+ * selected by lda is the matrix row (row-major) or the matrix column
+ * (column-major); K enters the in-line index for the lower/row-major and
+ * upper/column-major combinations. Nothing is written when `order` matches
+ * neither enumerator.
+ */
+template <typename T>
+static void
+setElementBanded(
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t row,
+    size_t column,
+    size_t K,
+    T *A,
+    size_t lda,
+    T value)
+{
+    const bool lower = (uplo == clblasLower);
+
+    if (order == clblasRowMajor) {
+        T *line = A + lda * row;
+        if (lower) {
+            line[ K - (row-column) ] = value;
+        }
+        else {
+            line[ column-row ] = value;
+        }
+    }
+    else if (order == clblasColumnMajor) {
+        T *line = A + lda * column;
+        if (lower) {
+            line[ row-column ] = value;
+        }
+        else {
+            line[ K - (column-row) ] = value;
+        }
+    }
+}
+
+
+// Conjugation helper used when row-major data is handled as column-major.
+// Generic version (float and double): nothing to do.
+template <typename T>
+static void
+doConjugate(
+    T *A,
+    size_t M,
+    size_t N,
+    size_t lda)
+{
+    // Reference every parameter so that no unused-parameter warning is raised.
+    (void)A;
+    (void)M;
+    (void)N;
+    (void)lda;
+}
+
+
+/*
+ * Negate the imaginary part of every element of the M x N matrix A, which
+ * is addressed as row-major with leading dimension lda. Nothing happens
+ * when lda is 0.
+ */
+template<>
+__template_static void
+doConjugate<FloatComplex>(
+    FloatComplex *A,
+    size_t M,
+    size_t N,
+    size_t lda)
+{
+    if (lda == 0) {
+        return;
+    }
+
+    for (size_t r = 0; r < M; r++) {
+        for (size_t c = 0; c < N; c++) {
+            FloatComplex z = getElement<FloatComplex>(clblasRowMajor, clblasNoTrans, r, c, A, lda);
+            CIMAG(z) *= (-1);
+            setElement<FloatComplex>(clblasRowMajor, clblasNoTrans, r, c, A, lda, z);
+        }
+    }
+}
+
+/*
+ * Negate the imaginary part of every element of the M x N matrix A, which
+ * is addressed as row-major with leading dimension lda. Nothing happens
+ * when lda is 0.
+ */
+template<>
+__template_static void
+doConjugate<DoubleComplex>(
+    DoubleComplex *A,
+    size_t M,
+    size_t N,
+    size_t lda)
+{
+    if (lda == 0) {
+        return;
+    }
+
+    for (size_t r = 0; r < M; r++) {
+        for (size_t c = 0; c < N; c++) {
+            DoubleComplex z = getElement<DoubleComplex>(clblasRowMajor, clblasNoTrans, r, c, A, lda);
+            CIMAG(z) *= (-1);
+            setElement<DoubleComplex>(clblasRowMajor, clblasNoTrans, r, c, A, lda, z);
+        }
+    }
+}
+
+
+// Assert that the two pointed-to scalars differ by at most absDelta
+// (default 0.0).
+template <typename T>
+static void compareValues(
+    const T *A, const T *B, const cl_double absDelta=0.0 )
+{
+    const T lhs = *A;
+    const T rhs = *B;
+
+    ASSERT_NEAR(lhs, rhs, absDelta);
+}
+
+// FloatComplex specialization: the real parts are asserted first, then the
+// imaginary parts, each within absDelta.
+ template<>
+__template_static void
+compareValues<FloatComplex> (
+    const FloatComplex *A, const FloatComplex *B, const cl_double absDelta )
+{
+    FloatComplex lhs = *A;
+    FloatComplex rhs = *B;
+
+    ASSERT_NEAR(CREAL(lhs), CREAL(rhs), absDelta);
+    ASSERT_NEAR(CIMAG(lhs), CIMAG(rhs), absDelta);
+}
+
+// DoubleComplex specialization: the real parts are asserted first, then the
+// imaginary parts, each within absDelta.
+ template<>
+__template_static void
+compareValues<DoubleComplex> (
+    const DoubleComplex *A, const DoubleComplex *B, const cl_double absDelta )
+{
+    DoubleComplex lhs = *A;
+    DoubleComplex rhs = *B;
+
+    ASSERT_NEAR(CREAL(lhs), CREAL(rhs), absDelta);
+    ASSERT_NEAR(CIMAG(lhs), CIMAG(rhs), absDelta);
+}
+#endif  // MATRIX_H_
diff --git a/src/tests/include/nrm2.h b/src/tests/include/nrm2.h
new file mode 100644
index 0000000..8637067
--- /dev/null
+++ b/src/tests/include/nrm2.h
@@ -0,0 +1,81 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the NRM2 tests.
+ * Tuple layout: N, incx, offx, offa (kept here as offNRM2), numCommandQueues.
+ */
+class NRM2 : public TestWithParam<
+    ::std::tr1::tuple<
+    int,                // N
+    int,                // incx
+    int,                // offx
+    int,                // offa -- for offNRM2
+    int                 // numCommandQueues
+        > > {
+public:
+    // Copy the values captured by SetUp() into the caller-supplied record.
+    void getParams(TestParams *params)
+    {
+        params->N                = N;
+        params->incx             = incx;
+        params->offBX            = offx;
+        params->offa             = offNRM2;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Unpack the test tuple, then let the BlasBase singleton override the
+    // queue count and N when it reports that an override is in effect.
+    virtual void SetUp()
+    {
+        const ParamType &args = GetParam();
+
+        N                = ::std::tr1::get<0>(args);
+        incx             = ::std::tr1::get<1>(args);
+        offx             = ::std::tr1::get<2>(args);
+        offNRM2          = ::std::tr1::get<3>(args);
+        numCommandQueues = ::std::tr1::get<4>(args);
+
+        base = ::clMath::BlasBase::getInstance();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        printTestParams(N, offx, incx);
+        ::std::cerr << "offNRM2 = " << offNRM2 << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N;
+    int incx;
+    size_t offx, offNRM2;
+
+    ::clMath::BlasBase *base;
+    // Not assigned by any method of this fixture.
+    cl_ulong imageA, imageX;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+
diff --git a/src/tests/include/rot.h b/src/tests/include/rot.h
new file mode 100644
index 0000000..5444509
--- /dev/null
+++ b/src/tests/include/rot.h
@@ -0,0 +1,87 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef ROT_H_
+#define ROT_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the ROT tests.
+ * Tuple layout: N, offx, incx, offy, incy, C, S, numCommandQueues.
+ * offx/offy are kept in offa/offb and C/S in alpha/beta.
+ */
+class ROT : public TestWithParam<
+    ::std::tr1::tuple<
+    int,                // N
+    int,                // offx
+    int,                // incx
+    int,                // offy
+    int,                // incy
+    ComplexLong,        // C
+    ComplexLong,        // S
+    int                 // numCommandQueues
+        > >
+{
+public:
+    // Copy the values captured by SetUp() into the caller-supplied record.
+    void getParams(TestParams *params)
+    {
+        params->N                = N;
+        params->offa             = offa;    // offx
+        params->offb             = offb;    // offy
+        params->incx             = incx;
+        params->incy             = incy;
+        params->alpha            = alpha;   // C
+        params->beta             = beta;    // S
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Unpack the test tuple, then let the BlasBase singleton override the
+    // queue count when it reports that an override is in effect.
+    virtual void SetUp()
+    {
+        const ParamType &args = GetParam();
+
+        N                = ::std::tr1::get<0>(args);
+        offa             = ::std::tr1::get<1>(args);
+        incx             = ::std::tr1::get<2>(args);
+        offb             = ::std::tr1::get<3>(args);
+        incy             = ::std::tr1::get<4>(args);
+        alpha            = ::std::tr1::get<5>(args);
+        beta             = ::std::tr1::get<6>(args);
+        numCommandQueues = ::std::tr1::get<7>(args);
+
+        base = ::clMath::BlasBase::getInstance();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        printTestParams(N, offa, incx, offb, incy, alpha, beta );
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N, offa, offb;
+    int incx, incy;
+    ComplexLong alpha;
+    ComplexLong beta;
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+#endif
diff --git a/src/tests/include/rotg.h b/src/tests/include/rotg.h
new file mode 100644
index 0000000..874f757
--- /dev/null
+++ b/src/tests/include/rotg.h
@@ -0,0 +1,76 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef ROTG_H_
+#define ROTG_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the ROTG tests.
+ * Tuple layout: offsa, offsb, offc, offs, numCommandQueues.
+ */
+class ROTG : public TestWithParam<
+    ::std::tr1::tuple<
+    int,                // offsa
+    int,                // offsb
+    int,                // offc
+    int,                // offs
+    int                 // numCommandQueues
+        > >
+{
+public:
+    // Copy the values captured by SetUp() into the caller-supplied record:
+    // offC -> offa, offS -> offb, offSA -> offBX, offSB -> offCY.
+    void getParams(TestParams *params)
+    {
+        params->offa             = offC;
+        params->offb             = offS;
+        params->offBX            = offSA;
+        params->offCY            = offSB;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Unpack the test tuple, then let the BlasBase singleton override the
+    // queue count when it reports that an override is in effect.
+    virtual void SetUp()
+    {
+        const ParamType &args = GetParam();
+
+        offSA            = ::std::tr1::get<0>(args);
+        offSB            = ::std::tr1::get<1>(args);
+        offC             = ::std::tr1::get<2>(args);
+        offS             = ::std::tr1::get<3>(args);
+        numCommandQueues = ::std::tr1::get<4>(args);
+
+        base = ::clMath::BlasBase::getInstance();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        printTestParams(offSA, offSB, offC, offS);
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t offSA, offSB, offC, offS;
+
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+#endif
diff --git a/src/tests/include/rotm.h b/src/tests/include/rotm.h
new file mode 100644
index 0000000..9600b5a
--- /dev/null
+++ b/src/tests/include/rotm.h
@@ -0,0 +1,86 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef ROTM_H_
+#define ROTM_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the ROTM tests.
+ * Tuple layout: N, offx, incx, offy, incy, offParam, SFLAG param,
+ * numCommandQueues. offx/offy/offParam are kept in offa/offb/offc and the
+ * SFLAG param in alpha.
+ */
+class ROTM : public TestWithParam<
+    ::std::tr1::tuple<
+    int,                // N
+    int,                // offx
+    int,                // incx
+    int,                // offy
+    int,                // incy
+    int,                // offParam
+    ComplexLong,        // SFLAG Param
+    int                 // numCommandQueues
+        > >
+{
+public:
+    // Copy the values captured by SetUp() into the caller-supplied record.
+    void getParams(TestParams *params)
+    {
+        params->N                = N;
+        params->offa             = offa;    // corresponds to offx
+        params->offb             = offb;    // corresponds to offy
+        params->offc             = offc;    // corresponds to offParam
+        params->incx             = incx;
+        params->incy             = incy;
+        params->alpha            = alpha;   // corresponds to the SFLAG param
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    // Unpack the test tuple, then let the BlasBase singleton override the
+    // queue count when it reports that an override is in effect.
+    virtual void SetUp()
+    {
+        const ParamType &args = GetParam();
+
+        N                = ::std::tr1::get<0>(args);
+        offa             = ::std::tr1::get<1>(args);
+        incx             = ::std::tr1::get<2>(args);
+        offb             = ::std::tr1::get<3>(args);
+        incy             = ::std::tr1::get<4>(args);
+        offc             = ::std::tr1::get<5>(args);
+        alpha            = ::std::tr1::get<6>(args);
+        numCommandQueues = ::std::tr1::get<7>(args);
+
+        base = ::clMath::BlasBase::getInstance();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        printTestParams(N, offa, incx, offb, incy, offc, alpha);
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N, offa, offb, offc;
+    int incx, incy;
+    ComplexLong alpha;
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+#endif
diff --git a/src/tests/include/rotmg.h b/src/tests/include/rotmg.h
new file mode 100644
index 0000000..c535719
--- /dev/null
+++ b/src/tests/include/rotmg.h
@@ -0,0 +1,108 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef ROTMG_H_
+#define ROTMG_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+class ROTMG : public TestWithParam<   // value-parameterized fixture for the ROTMG tests
+    ::std::tr1::tuple<
+	int,				// element 0: SetUp() stores it in offBX (was labelled offD1)
+    int,                // element 1: SetUp() stores it in offCY (was labelled offD2)
+    int,                // element 2: SetUp() stores it in offa  (was labelled offBX)
+    int,                // element 3: SetUp() stores it in offb  (was labelled offCY)
+    int,                // element 4: SetUp() stores it in offc  (offParam)
+    ComplexLong,        // SFLAG Param
+    int                 // numCommandQueues
+        > >
+{
+public:
+    void getParams(TestParams *params)   // copies the fixture values into *params
+    {
+        params->offBX= offBX;   // corresponds to offx
+		params->offCY = offCY;  // corresponds to offy
+		params->offa = offa;    // corresponds to offD1
+        params->offb = offb;    // corresponds to offD2
+        params->offc = offc;    // corresponds to offParam
+        params->alpha = alpha;  // corresponds to sflagparam
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()   // unpacks the test tuple, then applies the BlasBase queue-count override
+    {
+        offBX = ::std::tr1::get<0>(GetParam());
+        offCY = ::std::tr1::get<1>(GetParam());
+		offa = ::std::tr1::get<2>(GetParam());
+		offb = ::std::tr1::get<3>(GetParam());
+        offc = ::std::tr1::get<4>(GetParam());
+        alpha = ::std::tr1::get<5>(GetParam());
+        numCommandQueues = ::std::tr1::get<6>(GetParam());
+        // NOTE(review): elements 0-3 are read as offx, offy, offD1, offD2, but the tuple was annotated offD1, offD2, offBX, offCY - confirm the intended order.
+        base = ::clMath::BlasBase::getInstance();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues)
+        {
+            numCommandQueues = base->numCommandQueues();   // BlasBase value replaces the tuple's
+        }
+
+		printTestParams(offBX, offCY, offa, offb, offc, alpha);
+		::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    int offa, offb, offc, offBX, offCY;   // offsets: D1, D2, param, x, y (per the comments in getParams)
+    ComplexLong alpha;                    // carries the SFLAG value
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+template <typename T>
+static void
+randomRotmg(
+    T *D1,
+    T *D2,
+    T *X,
+    T *Y,
+    T *PARAM
+    )
+{
+    // rotmg multiplies an element up to three times, so inputs are capped at half the cube root of UPPER_BOUND<T>().
+    cl_double limit = pow(UPPER_BOUND<T>(), (1.0/3)) / 2.0;
+    T *const scalars[] = { D1, D2, X, Y };
+    // Fill the four scalars in D1, D2, X, Y order so random<T>() is called in the same sequence as before.
+    for (int k = 0; k < 4; ++k) {
+        *scalars[k] = random<T>(limit);
+    }
+
+    // PARAM[0] holds the flag and is expected to be set by the caller; only PARAM[1..4] are randomized here.
+    for (T *slot = PARAM + 1; slot != PARAM + 5; ++slot) {
+        *slot = random<T>(limit);
+    }
+}
+
+#endif
diff --git a/src/tests/include/sbmv.h b/src/tests/include/sbmv.h
new file mode 100644
index 0000000..0428910
--- /dev/null
+++ b/src/tests/include/sbmv.h
@@ -0,0 +1,177 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SBMV_H_
+#define SBMV_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <blas-random.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+class SBMV : public TestWithParam<   // value-parameterized fixture for the SBMV tests
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        int,                // N
+        int,                // K
+        ExtraTestSizes,     // offsets, lda and increments (unpacked in SetUp)
+        ComplexLong,		// Alpha
+		ComplexLong, 		// Beta
+        int                 // numCommandQueues
+        > > {
+public:
+    void getParams(TestParams *params)   // zeroes *params, then copies the fixture values into it
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->uplo = uplo;
+        params->seed = seed;
+        params->N = N;
+        params->K = KLU;
+        params->lda = lda;
+        params->incx = incx;
+        params->incy = incy;
+        params->offA = offA;
+        params->offa = offA;    // the same matrix offset is written to both offA and offa
+        params->offBX = offx;
+        params->offCY = offy;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()   // unpacks the test tuple, applies the BlasBase queue override, normalizes K and lda
+    {
+        ExtraTestSizes extra;
+
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        N = ::std::tr1::get<2>(GetParam());
+        KLU = ::std::tr1::get<3>(GetParam());
+        extra = ::std::tr1::get<4>(GetParam());
+        offA = extra.offA;
+        offx = extra.offBX;
+        offy = extra.offCY;
+        lda = extra.strideA.ld;
+        incx = extra.strideBX.inc;
+        incy = extra.strideCY.inc;
+        paramAlpha = ::std::tr1::get<5>(GetParam());
+		paramBeta  = ::std::tr1::get<6>(GetParam());
+        numCommandQueues = ::std::tr1::get<7>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        // Reduce K modulo N; N == 0 is guarded because a modulo by zero is undefined behaviour.
+        KLU = (N != 0) ? (KLU % N) : 0;
+        // Raise lda to at least K + 1.
+        lda = ::std::max(lda, (KLU+1));
+
+        printTestParams(order, uplo, N, KLU, paramAlpha, offA,
+                            lda, offx, incx, paramBeta, offy, incy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t N, KLU;             // KLU holds K after the reduction done in SetUp()
+    size_t lda;
+    int incx, incy;
+    size_t offA, offx, offy;
+    unsigned int seed;
+
+    ComplexLong paramAlpha, paramBeta;
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+/*template <typename T>
+static void
+randomGbmvMatrices(
+    clblasOrder order,
+    clblasTranspose trans,
+    size_t M,
+    size_t N,
+    T *alpha,
+    T *beta,
+    T *A,
+    size_t lda,
+    T *X,
+	int incx,
+	T *Y,
+	int incy
+    )
+{
+    size_t i;
+	size_t lenX, lenY, lenA;
+    cl_double bound, maxAB, maxMN;
+
+	// bound is calculated by solving the equation (alpha*x^2 + x - UPPER_BOUND) < 0
+	bound = UPPER_BOUND<T>();
+	if(module(maxVal(*alpha)) > (sqrt(bound) / (2.0)))
+		*alpha = random<T>((sqrt(bound) / (2.0)));
+
+    if(module(maxVal(*beta)) > (sqrt(bound) / (2.0)))
+		*beta = random<T>((sqrt(bound) / (2.0)));
+
+	maxAB = module( ::std::max(maxVal(*alpha), maxVal(*beta)) );   // Take max of alpha & beta
+	maxMN = ::std::max( M, N );
+	bound = sqrt( bound / (maxAB*maxMN) );           // (maxAB * N * bound^2 + maxAB * bound - UPPER_BOUND) < 0
+
+    lenA = ((order == clblasRowMajor)? M: N) * lda;
+    for (i = 0; i < lenA; i++) {
+        A[i] = random<T>(bound);
+    }
+
+	if( trans == clblasNoTrans )
+	{
+    	lenX = 1 + ((N - 1) * abs(incx));
+    	lenY = 1 + ((M - 1) * abs(incy));
+    }
+    else {
+        lenX = 1 + ((M - 1) * abs(incx));
+    	lenY = 1 + ((N - 1) * abs(incy));
+    }
+    if (X != NULL) {
+        for (i = 0; i < lenX; i++) {
+			X[i] = random<T>(bound);
+        }
+    }
+    if (Y != NULL) {
+        for (i = 0; i < lenY; i++) {
+			Y[i] = random<T>(bound);
+        }
+    }
+}*/
+
+#endif  // SBMV_H_
diff --git a/src/tests/include/scal.h b/src/tests/include/scal.h
new file mode 100644
index 0000000..922407c
--- /dev/null
+++ b/src/tests/include/scal.h
@@ -0,0 +1,82 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SCAL_H_
+#define SCAL_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <common.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+class SCAL : public TestWithParam<   // value-parameterized fixture for the SCAL tests
+    ::std::tr1::tuple<
+        int,                // N
+        ComplexLong,		// alpha
+        int,                // offx
+        int,                // incx
+        int                 // numCommandQueues
+        > > {
+public:
+    // Hand the values unpacked by SetUp() over to the caller's TestParams.
+    void getParams(TestParams *params)
+    {
+        TestParams &out = *params;
+        out.N = N;
+        out.alpha = paramAlpha;
+        out.offBX = offx;
+        out.incx = incx;
+        out.numCommandQueues = numCommandQueues;
+    }
+protected:
+    virtual void SetUp()
+    {
+        using ::std::tr1::get;
+        N = get<0>(GetParam());
+        paramAlpha = get<1>(GetParam());
+        offx = get<2>(GetParam());
+        incx = get<3>(GetParam());
+        numCommandQueues = get<4>(GetParam());
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();   // BlasBase value replaces the tuple's
+        }
+
+        printTestParams(N, paramAlpha, offx, incx);
+        ::std::cerr << "seed = " << seed << ::std::endl
+                    << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N;
+    unsigned int seed;
+    size_t offx;
+    int incx;
+    bool useAlpha;              // declared but not referenced inside this class
+    ComplexLong paramAlpha;
+    ::clMath::BlasBase *base;
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // SCAL_H_
diff --git a/src/tests/include/spmv.h b/src/tests/include/spmv.h
new file mode 100644
index 0000000..dce8293
--- /dev/null
+++ b/src/tests/include/spmv.h
@@ -0,0 +1,212 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef SPMV_H_
+#define SPMV_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+#include <ExtraTestSizes.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+class SPMV : public TestWithParam<   // value-parameterized fixture for the SPMV tests
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo, 		// uplo
+        int,                // N
+        ComplexLong,		// Alpha
+		ComplexLong, 		// Beta
+		size_t,				// offA
+		size_t,				// offx
+		size_t, 			// offy
+		ExtraTestSizes,     // only strideBX.inc and strideCY.inc are read from it in SetUp()
+        int                 // numCommandQueues
+        > > {
+public:
+    void getParams(TestParams *params)   // copies the fixture values into *params
+    {
+        params->order = order;
+        params->uplo = uplo;
+        params->seed = seed;
+        params->N = N;
+        params->lda = lda;
+        params->incx = incx;
+        params->incy = incy;
+        params->offA = offA;
+        params->offBX = offx;
+        params->offCY = offy;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()   // unpacks the test tuple, then lets BlasBase override individual values
+    {
+        ExtraTestSizes extra;
+
+        order 	   = ::std::tr1::get<0>(GetParam());
+        uplo 	   = ::std::tr1::get<1>(GetParam());
+        N 		   = ::std::tr1::get<2>(GetParam());
+        paramAlpha = ::std::tr1::get<3>(GetParam());
+		paramBeta  = ::std::tr1::get<4>(GetParam());
+		offA	   = ::std::tr1::get<5>(GetParam());
+		offx	   = ::std::tr1::get<6>(GetParam());
+		offy	   = ::std::tr1::get<7>(GetParam());
+		extra 	   = ::std::tr1::get<8>(GetParam());
+        lda 	   = 0;   // fixed to 0 here; presumably unused for packed storage - TODO confirm
+        incx 	   = extra.strideBX.inc;
+        incy       = extra.strideCY.inc;
+
+		numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+		useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        // Each override below replaces the tuple value only when BlasBase reports that it should be used.
+        useAlpha = base->useAlpha();
+        if (useAlpha != 0) {
+            paramAlpha = base->alpha();
+        }
+        useBeta = base->useBeta();
+        if (useBeta != 0) {
+            paramBeta = base->beta();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+        if (base->useIncX()) {
+            incx = base->incX();
+        }
+        if (base->useIncY()) {
+            incy = base->incY();
+        }
+
+        printTestParams(order, uplo, N, paramAlpha, offA,
+                        0, offx, incx, paramBeta, offy, incy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t N;
+    size_t lda;                // always 0 after SetUp()
+    int incx, incy;
+    size_t offA, offx, offy;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;    // results of base->useAlpha() / base->useBeta()
+    ComplexLong paramAlpha, paramBeta;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX, imageY;   // not assigned or read inside this class
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+template <typename T>
+static void
+randomSpmvMatrices(   // fills alpha/beta (unless supplied), the packed matrix A and the vectors X and Y with bounded random values
+    clblasOrder order,
+    clblasUplo uplo,
+    size_t N,
+    bool useAlpha,      // true: keep the caller's *alpha; false: draw a random one
+    T *alpha,
+    T *A,
+    T *X,               // may be NULL, in which case it is not filled
+    int incx,
+	bool useBeta,       // true: keep the caller's *beta; false: draw a random one
+	T *beta,
+    T *Y,               // may be NULL, in which case it is not filled
+    int incy
+    )
+{
+    size_t i, j;
+    size_t lengthX;
+    size_t lengthY;
+    cl_double bound;
+	cl_double fAlpha, fBeta;
+    // Draw alpha and beta when the caller did not supply them; a zero multiplier is replaced by 1.0.
+    if (!useAlpha) {
+        *alpha = random<T>(100);
+        if (module(*alpha) == 0.0) {
+            *alpha = 1.0;
+        }
+    }
+
+	if (!useBeta) {
+        *beta = random<T>(100);
+        if (module(*beta) == 0.0) {
+            *beta = 1.0;
+        }
+    }
+
+    bound = UPPER_BOUND<T>();
+    // Multipliers whose module exceeds the bound are redrawn; zero is again replaced by 1.0.
+    if(module(*alpha) > bound)
+        *alpha = random<T>((sqrt(bound) / ((2.0) * N)));
+	if (module(*alpha) == 0.0) {
+            *alpha = 1.0;
+    }
+
+	if(module(*beta) > bound)
+        *beta = random<T>((sqrt(bound)));
+	if (module(*beta) == 0.0) {
+            *beta = 1.0;
+    }
+
+	fAlpha = module(*alpha);
+	fBeta  = module(*beta);
+
+    bound = bound / (fAlpha * N);
+    // New bound is the positive root x of fAlpha*N*x^2 + fBeta*x - UPPER_BOUND = 0.
+    bound = sqrt( ((((((fBeta * fBeta)) / fAlpha) / (4.0)) / fAlpha) / (N * N)) + bound) - ((fBeta) / ((2.0) * (fAlpha) * N));
+
+    // Every (i, j) pair is passed to setElementPacked, which writes into the packed array A.
+    for (i = 0; i < N; i++) {
+        for (j = 0; j < N; j++) {
+            setElementPacked<T>(order, clblasNoTrans, uplo, i, j, A, N, random<T>(bound));
+        }
+    }
+
+    // Vector lengths: for N == 0 nothing is filled, since (N - 1) would wrap around in size_t.
+    lengthX = (N == 0) ? 0 : 1 + ((N - 1) * abs(incx));
+    if (X != NULL) {
+        for (i = 0; i < lengthX; i++) {
+            X[i] = random<T>(bound);
+        }
+    }
+    lengthY = (N == 0) ? 0 : 1 + (N - 1) * abs(incy);
+    if (Y != NULL) {
+        for (i = 0; i < lengthY; i++) {
+            Y[i] = random<T>(bound);
+        }
+    }
+}
+
+#endif  // SPMV_H_
diff --git a/src/tests/include/spr.h b/src/tests/include/spr.h
new file mode 100644
index 0000000..f699aac
--- /dev/null
+++ b/src/tests/include/spr.h
@@ -0,0 +1,23 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef SPR_H_
+#define SPR_H_
+#define SYR_PACKED
+#include "syr.h"
+#undef  SYR_PACKED
+#endif
+
diff --git a/src/tests/include/spr2.h b/src/tests/include/spr2.h
new file mode 100644
index 0000000..e0ca492
--- /dev/null
+++ b/src/tests/include/spr2.h
@@ -0,0 +1,26 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SPR2_H_
+
+#define SPR2_H_
+#define SYR2_PACKED
+#include "syr2.h"
+
+#undef  SYR2_PACKED
+
+#endif
\ No newline at end of file
diff --git a/src/tests/include/swap.h b/src/tests/include/swap.h
new file mode 100644
index 0000000..6afda58
--- /dev/null
+++ b/src/tests/include/swap.h
@@ -0,0 +1,89 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef SWAP__H_
+#define SWAP__H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+// Name SWAP creates problem in gTest
+class SWAPXY : public TestWithParam<   // value-parameterized fixture for the SWAP tests
+
+    ::std::tr1::tuple<
+    int,                // N
+    int,                // offBX
+    int,                // incx, should not be  0
+	int,				//offCY
+	int,				//incy, should not be 0
+    int                 // numCommandQueues
+        > > {
+public:
+    // Hand the values unpacked by SetUp() over to the caller's TestParams.
+    void getParams(TestParams *params)
+    {
+        TestParams &out = *params;
+        out.N = N;
+        out.offBX = offBX;
+        out.incx = incx;
+        out.offCY = offCY;
+        out.incy = incy;
+        out.numCommandQueues = numCommandQueues;
+    }
+protected:
+    virtual void SetUp()
+    {
+        using ::std::tr1::get;
+        N = get<0>(GetParam());
+        offBX = get<1>(GetParam());
+        incx = get<2>(GetParam());
+        offCY = get<3>(GetParam());
+        incy = get<4>(GetParam());
+        numCommandQueues = get<5>(GetParam());
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        // The queue count above and N below are replaced only when BlasBase says to use its value.
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        printTestParams(N, offBX, incx, offCY, incy);
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    size_t N;
+    size_t offBX;
+    int incx;
+    size_t offCY;
+    int incy;
+    unsigned int seed;          // read from BlasBase in SetUp(); not exported by getParams()
+
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif
diff --git a/src/tests/include/symm.h b/src/tests/include/symm.h
new file mode 100644
index 0000000..29214b3
--- /dev/null
+++ b/src/tests/include/symm.h
@@ -0,0 +1,143 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SYMM_H_
+#define SYMM_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+class SYMM : public TestWithParam<   // value-parameterized fixture for the SYMM tests
+    ::std::tr1::tuple<
+        clblasOrder,		 // order
+        clblasSide,		 // side
+        clblasUplo,		// uplo
+        int,                // M
+        int,            	 // N
+		cl_float2,				//alpha
+		cl_float2,				//beta
+		ExtraTestSizes,     // bundles offsets and leading dimensions, to get more than ten parameters in gtest.
+        int                 // numCommandQueues
+        > > {
+public:
+    void getParams(TestParams *params)   // copies the fixture values into *params
+    {
+        params->order = order;
+        params->seed = seed;
+		params->side = side;
+		params->uplo = uplo;
+        params->M = M;
+        params->N = N;
+        params->lda = lda;
+        params->ldb = ldb;
+        params->ldc = ldc;
+		params->offa = offa;
+		params->offb = offb;
+		params->offc = offc;
+		params->alpha.re = (long)CREAL(alpha);   // the (long) casts drop any fractional part of alpha/beta
+        params->alpha.imag = (long)CIMAG(alpha);
+        params->beta.re = (long)CREAL(beta);
+        params->beta.imag = (long)CIMAG(beta);
+
+
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()   // unpacks the test tuple, applies BlasBase overrides, then raises lda/ldb/ldc to a minimum
+    {
+		ExtraTestSizes extra;
+		order = ::std::tr1::get<0>(GetParam());
+        side = ::std::tr1::get<1>(GetParam());
+        uplo = ::std::tr1::get<2>(GetParam());
+        M = ::std::tr1::get<3>(GetParam());
+        N = ::std::tr1::get<4>(GetParam());
+		alpha = ::std::tr1::get<5>(GetParam());
+        beta  = ::std::tr1::get<6>(GetParam());
+		extra = ::std::tr1::get<7>(GetParam());
+
+		offa = extra.offA;
+        offb = extra.offBX;
+        offc = extra.offCY;
+		lda = extra.strideA.ld;
+		ldb = extra.strideBX.ld;
+		ldc = extra.strideCY.ld;
+
+        numCommandQueues = ::std::tr1::get<8>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        // M and N are overridden before the leading dimensions are clamped against them below.
+        if (base->useM()) {
+            M = base->M();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+        // lda is raised to at least M when side is clblasLeft, otherwise to at least N.
+		if( side == clblasLeft )
+		{
+			lda = ::std::max(lda, M);
+		}
+		else
+		{
+			lda = ::std::max(lda, N);
+		}
+
+        // ldb and ldc are raised to at least N for row-major order and at least M for column-major order.
+		switch (order) {
+        case clblasRowMajor:
+            ldb = ::std::max(ldb, N);
+            ldc = ::std::max(ldc, N);
+            break;
+        case clblasColumnMajor:
+            ldb = ::std::max(ldb, M);
+            ldc = ::std::max(ldc, M);
+            break;
+        }
+        // NOTE(review): the literal 1 arguments presumably flag alpha/beta as "in use" - confirm against printTestParams.
+		printTestParams(order, side, uplo, M, N, 1, alpha, 1, beta, lda, ldb, ldc, offa, offb, offc);
+
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+	clblasSide side;
+	clblasUplo uplo;
+    size_t M, N;
+    size_t lda, ldb, ldc;
+    size_t offa, offb, offc;
+    unsigned int seed;
+    cl_float2 alpha, beta;
+    ::clMath::BlasBase *base;
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // SYMM_H_
diff --git a/src/tests/include/symv.h b/src/tests/include/symv.h
new file mode 100644
index 0000000..f8f76a6
--- /dev/null
+++ b/src/tests/include/symv.h
@@ -0,0 +1,184 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SYMV_H_
+#define SYMV_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <common.h>
+#include <math.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+class SYMV : public TestWithParam<   // value-parameterized fixture for the SYMV tests
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        int,                // N
+        ExtraTestSizes,     // offA, lda and the x/y increments are read from it in SetUp()
+        int                 // numCommandQueues
+        > > {
+public:
+    void getParams(TestParams *params)   // zeroes *params, then copies the fixture values into it
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->uplo = uplo;
+        params->seed = seed;
+        params->N = N;
+        params->lda = lda;
+        params->ldb = ldb;
+        params->ldc = ldc;
+        params->rowsA = rowsA;
+        params->rowsB = rowsB;
+        params->rowsC = rowsC;
+        params->columnsA = columnsA;
+        params->columnsB = columnsB;
+        params->columnsC = columnsC;
+        params->incx = incx;
+        params->incy = incy;
+        params->offA = offsetA;
+        params->offBX = offsetx;
+        params->offCY = offsety;
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()   // unpacks the tuple, applies BlasBase overrides, then derives ld*/offset*/rows*/columns*
+    {
+        ExtraTestSizes extra;
+
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        N = ::std::tr1::get<2>(GetParam());
+        extra = ::std::tr1::get<3>(GetParam());
+        offsetA = extra.offA;
+        lda = extra.strideA.ld;
+        incx = extra.strideBX.inc;
+        incy = extra.strideCY.inc;
+        numCommandQueues = ::std::tr1::get<4>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+        paramAlpha.re = paramAlpha.imag = paramBeta.re = paramBeta.imag = 0;   // defined values even without an override; getParams() always copies them
+        useAlpha = base->useAlpha();
+        if (useAlpha != 0) {
+            paramAlpha = base->alpha();
+        }
+        useBeta = base->useBeta();
+        if (useBeta != 0) {
+            paramBeta = base->beta();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+        if (base->useIncX()) {
+            incx = base->incX();
+        }
+        if (base->useIncY()) {
+            incy = base->incY();
+        }
+        // lda is raised to at least N after any N override.
+        lda = ::std::max(lda, N);
+
+        if (incx == 1 || incx == -1) {
+            /* X is row vector for row major matrix B
+             * or column vector for column major matrix B */
+            ldb = lda;
+            offsetx = (N / 2) * ldb;
+        }
+        else {
+            /* X is column vector for row major matrix B
+             * or row vector for column major matrix B */
+            ldb = ::std::max(N, (size_t)module(incx));
+            offsetx = N / 2;
+            incx = incx > 0 ? (int)ldb : (int)(0-ldb);   // stride becomes +/- ldb, keeping the original sign
+        }
+
+        if (incy == 1 || incy == -1) {
+            /* Y is row vector in row major matrix C
+             * or column vector in column major matrix C */
+            ldc = lda;
+            offsety = (N / 2) * ldc;
+        }
+        else {
+            /* Y is column vector in matrix C
+             * or row vector in column major matrix C */
+            ldc = ::std::max(N, (size_t)module(incy));
+            offsety = N / 2;
+            incy = incy > 0 ? (int)ldc : (int)(0-ldc);   // stride becomes +/- ldc, keeping the original sign
+        }
+        // The leading dimension gives the column count in row-major order and the row count in column-major order.
+        switch (order) {
+        case clblasRowMajor:
+            columnsA = lda;
+            columnsB = ldb;
+            columnsC = ldc;
+            rowsA = N;
+            rowsB = N;
+            rowsC = N;
+            break;
+        case clblasColumnMajor:
+            rowsA = lda;
+            rowsB = ldb;
+            rowsC = ldc;
+            columnsA = N;
+            columnsB = N;
+            columnsC = N;
+            break;
+        }
+
+        printTestParams(order, uplo, N, useAlpha, base->alpha(), offsetA, lda,
+                        incx, useBeta, base->beta(), incy);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t N;
+    size_t lda, ldb, ldc;
+    size_t offsetA, offsetx, offsety;
+    int incx, incy;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;    // results of base->useAlpha() / base->useBeta()
+    ComplexLong paramAlpha, paramBeta;
+
+    size_t rowsA, columnsA, rowsB, columnsB, rowsC, columnsC;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX;   // not assigned or read inside this class
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // SYMV_H_
diff --git a/src/tests/include/syr.h b/src/tests/include/syr.h
new file mode 100644
index 0000000..056dcba
--- /dev/null
+++ b/src/tests/include/syr.h
@@ -0,0 +1,129 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#if !defined(SYR_PACKED)
+    #ifndef SYR_H
+        #define SYR_H
+    #else
+        #define DUPLICIT
+    #endif
+#endif
+
+#ifndef DUPLICIT
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized gtest fixture shared by the SYR tests and, when
+ * SYR_PACKED is defined by the including file, by the packed SPR tests.
+ * SetUp() unpacks the test tuple into the members below and getParams()
+ * copies them into a TestParams structure.
+ */
+#ifndef SYR_PACKED
+class SYR : public TestWithParam<
+#else
+class SPR : public TestWithParam<
+#endif
+    ::std::tr1::tuple<
+        clblasOrder,    // order
+        clblasUplo,     // uplo
+        int,            // N
+        double,         // alpha
+        int,            // offx
+        int,            // incx, should be greater than 0
+        int,            // offa
+        int,            // lda, 0 - undefined
+        int             // numCommandQueues
+        > > {
+public:
+    /*
+     * Copy the fixture state into *params.
+     * Only the fields assigned below are written; the structure is not
+     * cleared first, so every other field keeps the caller's value.
+     */
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo = uplo;
+        params->seed = seed;
+        params->N = N;
+        // alpha is truncated to long here, so a fractional alpha from the
+        // test tuple is not the value that is passed on. Only the real part
+        // is written.
+        params->alpha.re = (long)alpha;
+        params->offBX = offx;
+        params->incx = incx;
+        params->offa = offa;
+        params->lda = lda;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /*
+     * Unpack the test tuple, apply the overrides exposed by BlasBase
+     * (number of command queues, N) and print the resulting configuration.
+     * alpha always comes from the test tuple; no override is applied to it.
+     */
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        uplo  = ::std::tr1::get<1>(GetParam());
+        N     = ::std::tr1::get<2>(GetParam());
+        alpha = ::std::tr1::get<3>(GetParam());
+        offx  = ::std::tr1::get<4>(GetParam());
+        incx  = ::std::tr1::get<5>(GetParam());
+        offa  = ::std::tr1::get<6>(GetParam());
+        lda   = ::std::tr1::get<7>(GetParam());
+
+        numCommandQueues = ::std::tr1::get<8>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // The leading dimension must be derived from the final N. Clamping
+        // it before the override above could leave lda smaller than N.
+        #ifndef SYR_PACKED
+        lda = ::std::max( lda, N );
+        #else
+        // The packed variant always reports lda as 0.
+        lda = 0;
+        #endif
+
+        printTestParams(order, uplo, N, alpha, offx, incx, offa, lda);
+
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t N;
+    size_t lda;
+    int incx;
+    size_t offx, offa;
+
+    unsigned int seed;
+
+    double alpha;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // SYR_H_
diff --git a/src/tests/include/syr2.h b/src/tests/include/syr2.h
new file mode 100644
index 0000000..7045613
--- /dev/null
+++ b/src/tests/include/syr2.h
@@ -0,0 +1,136 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#if !defined(SYR2_PACKED)
+    #ifndef SYR2_H
+        #define SYR2_H
+    #else
+        #define DUPLICIT
+    #endif
+#endif
+
+#ifndef DUPLICIT
+
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized gtest fixture shared by the SYR2 tests and, when
+ * SYR2_PACKED is defined by the including file, by the packed SPR2 tests.
+ * SetUp() unpacks the test tuple into the members below and getParams()
+ * copies them into a TestParams structure.
+ */
+#ifndef SYR2_PACKED
+class SYR2 : public TestWithParam<
+#else
+class SPR2 : public TestWithParam<
+#endif
+
+    ::std::tr1::tuple<
+        clblasOrder,    // order
+        clblasUplo,     // uplo
+        int,            // N
+        double,         // alpha
+        int,            // offx
+        int,            // incx, should be greater than 0
+        int,            // offy
+                        // incy is not part of the tuple: the tuple is limited
+                        // to 10 elements, so incy is taken to be equal to incx
+        int,            // offa
+        int,            // lda, 0 - undefined
+        int             // numCommandQueues
+        > > {
+public:
+    /*
+     * Copy the fixture state into *params.
+     * Only the fields assigned below are written; the structure is not
+     * cleared first, so every other field keeps the caller's value.
+     */
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo  = uplo;
+        params->seed  = seed;
+        params->N     = N;
+        // alpha is truncated to long here, so a fractional alpha from the
+        // test tuple is not the value that is passed on. Only the real part
+        // is written.
+        params->alpha.re = (long)alpha;
+        params->offBX = offx;
+        params->incx  = incx;
+        params->offCY = offy;
+        params->incy  = incy;
+        params->offa  = offa;
+        params->lda   = lda;
+
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /*
+     * Unpack the test tuple, apply the overrides exposed by BlasBase
+     * (number of command queues, N) and print the resulting configuration.
+     */
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        uplo  = ::std::tr1::get<1>(GetParam());
+        N     = ::std::tr1::get<2>(GetParam());
+        alpha = ::std::tr1::get<3>(GetParam());
+        offx  = ::std::tr1::get<4>(GetParam());
+        incx  = ::std::tr1::get<5>(GetParam());
+        offy  = ::std::tr1::get<6>(GetParam());
+        offa  = ::std::tr1::get<7>(GetParam());
+        lda   = ::std::tr1::get<8>(GetParam());
+
+        numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        // The tuple has no room for a separate incy, so it mirrors incx.
+        incy  = incx;
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // The leading dimension must be derived from the final N. Clamping
+        // it before the override above could leave lda smaller than N.
+        #ifndef SYR2_PACKED
+        lda = ::std::max( lda, N );
+        #else
+        // The packed variant always reports lda as 0.
+        lda = 0;
+        #endif
+
+        printTestParams(order, uplo, N, alpha, offx, incx, offy, incy, offa, lda);
+
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    size_t N;
+    size_t lda;
+    int incx, incy;
+    size_t offx, offy, offa;
+
+    unsigned int seed;
+
+    double alpha;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageX, imageY;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // SYR2_H_
diff --git a/src/tests/include/syr2k.h b/src/tests/include/syr2k.h
new file mode 100644
index 0000000..ff09c2f
--- /dev/null
+++ b/src/tests/include/syr2k.h
@@ -0,0 +1,173 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SYR2K_H_
+#define SYR2K_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <common.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized gtest fixture for the SYR2K tests. SetUp() turns the
+ * test tuple into matrix shapes and leading dimensions; getParams() hands
+ * the result over as a TestParams structure.
+ */
+class SYR2K : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        clblasTranspose, // transA
+        int,             // N
+        int,             // K
+        ExtraTestSizes,  // offsets and leading dimensions of A, B and C
+        int              // numCommandQueues
+        > > {
+public:
+    // Clear *params and fill it with the configuration computed by SetUp().
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        // Run-wide settings.
+        params->seed = seed;
+        params->numCommandQueues = numCommandQueues;
+
+        // Operation flags and problem size.
+        params->order = order;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->N = N;
+        params->K = K;
+
+        // Scalars.
+        params->alpha = paramAlpha;
+        params->beta = paramBeta;
+
+        // Matrix A.
+        params->offA = offA;
+        params->lda = lda;
+        params->rowsA = rowsA;
+        params->columnsA = columnsA;
+
+        // Matrix B.
+        params->offBX = offB;
+        params->ldb = ldb;
+        params->rowsB = rowsB;
+        params->columnsB = columnsB;
+
+        // Matrix C.
+        params->offCY = offC;
+        params->ldc = ldc;
+        params->rowsC = rowsC;
+        params->columnsC = columnsC;
+    }
+
+protected:
+    /*
+     * Unpack the test tuple, apply the overrides exposed by BlasBase
+     * (command queues, alpha, beta, N, K), derive the matrix shapes and
+     * raise each leading dimension to at least the extent it has to cover.
+     */
+    virtual void SetUp()
+    {
+        const ExtraTestSizes extra = ::std::tr1::get<5>(GetParam());
+
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        transA = ::std::tr1::get<2>(GetParam());
+        N = ::std::tr1::get<3>(GetParam());
+        K = ::std::tr1::get<4>(GetParam());
+        numCommandQueues = ::std::tr1::get<6>(GetParam());
+
+        offA = extra.offA;
+        lda = extra.strideA.ld;
+        offB = extra.offBX;
+        ldb = extra.strideBX.ld;
+        offC = extra.offCY;
+        ldc = extra.strideCY.ld;
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        useAlpha = base->useAlpha();
+        if (useAlpha) {
+            paramAlpha = base->alpha();
+        }
+        useBeta = base->useBeta();
+        if (useBeta) {
+            paramBeta = base->beta();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+        if (base->useK()) {
+            K = base->K();
+        }
+
+        // A and B share one shape: N x K when not transposed, K x N otherwise.
+        const bool notTransposed = (transA == clblasNoTrans);
+        rowsA = rowsB = notTransposed ? N : K;
+        columnsA = columnsB = notTransposed ? K : N;
+        rowsC = columnsC = N;
+
+        // The leading dimension spans a row in row-major order and a column
+        // in column-major order. Any other order value changes nothing.
+        if (order == clblasRowMajor || order == clblasColumnMajor) {
+            const bool rowMajor = (order == clblasRowMajor);
+            size_t &spanA = rowMajor ? columnsA : rowsA;
+            size_t &spanB = rowMajor ? columnsB : rowsB;
+            size_t &spanC = rowMajor ? columnsC : rowsC;
+
+            if (lda < spanA) {
+                lda = spanA;
+            }
+            spanA = lda;
+
+            if (ldb < spanB) {
+                ldb = spanB;
+            }
+            spanB = ldb;
+
+            if (ldc < spanC) {
+                ldc = spanC;
+            }
+            spanC = ldc;
+        }
+
+        printTestParams(order, uplo, transA, N, K, useAlpha, base->alpha(),
+                        offA, lda, offB, ldb, useBeta, base->beta(), offC, ldc);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    clblasTranspose transA;
+    size_t N, K;
+    size_t offA, offB, offC;
+    size_t lda, ldb, ldc;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;
+    ComplexLong paramAlpha, paramBeta;
+
+    size_t rowsA, columnsA;
+    size_t rowsB, columnsB;
+    size_t rowsC, columnsC;
+
+    ::clMath::BlasBase *base;
+    cl_ulong imageA, imageB;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // SYR2K_H_
diff --git a/src/tests/include/syrk.h b/src/tests/include/syrk.h
new file mode 100644
index 0000000..b855723
--- /dev/null
+++ b/src/tests/include/syrk.h
@@ -0,0 +1,155 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef SYRK_H_
+#define SYRK_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <common.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized gtest fixture for the SYRK tests. SetUp() turns the
+ * test tuple into matrix shapes and leading dimensions; getParams() hands
+ * the result over as a TestParams structure.
+ */
+class SYRK : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        clblasTranspose, // transA
+        int,             // N
+        int,             // K
+        ExtraTestSizes,  // offsets and leading dimensions of A and C
+        int              // numCommandQueues
+        > > {
+public:
+    // Clear *params and fill it with the configuration computed by SetUp().
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->seed = seed;
+        params->N = N;
+        params->K = K;
+        params->offA = offA;
+        params->offCY = offC;
+        params->lda = lda;
+        params->ldc = ldc;
+        params->rowsA = rowsA;
+        params->columnsA = columnsA;
+        params->rowsC = rowsC;
+        params->columnsC = columnsC;
+        // Export the scalar overrides captured by SetUp(). They are copied
+        // only when the override is active, because paramAlpha/paramBeta are
+        // not assigned otherwise; the fields then stay zero from the memset.
+        if (useAlpha) {
+            params->alpha = paramAlpha;
+        }
+        if (useBeta) {
+            params->beta = paramBeta;
+        }
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /*
+     * Unpack the test tuple, apply the overrides exposed by BlasBase
+     * (command queues, alpha, beta, N, K), derive the matrix shapes and
+     * raise each leading dimension to at least the extent it has to cover.
+     */
+    virtual void SetUp()
+    {
+        ExtraTestSizes extra;
+
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        transA = ::std::tr1::get<2>(GetParam());
+        N = ::std::tr1::get<3>(GetParam());
+        K = ::std::tr1::get<4>(GetParam());
+        extra = ::std::tr1::get<5>(GetParam());
+        offA = extra.offA;
+        offC = extra.offCY;
+        lda = extra.strideA.ld;
+        ldc = extra.strideCY.ld;
+        numCommandQueues = ::std::tr1::get<6>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        useAlpha = base->useAlpha();
+        if (useAlpha != 0) {
+            paramAlpha = base->alpha();
+        }
+        useBeta = base->useBeta();
+        if (useBeta != 0) {
+            paramBeta = base->beta();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+        if (base->useK()) {
+            K = base->K();
+        }
+
+        // A is N x K when not transposed and K x N otherwise; C is N x N.
+        if (transA == clblasNoTrans) {
+            rowsA = N;
+            columnsA = K;
+        }
+        else {
+            rowsA = K;
+            columnsA = N;
+        }
+        rowsC = N;
+        columnsC = N;
+
+        // The leading dimension spans a row in row-major order and a column
+        // in column-major order; the matching extent is widened to it.
+        switch (order) {
+        case clblasRowMajor:
+            lda = ::std::max(lda, columnsA);
+            columnsA = lda;
+            ldc = ::std::max(ldc, columnsC);
+            columnsC = ldc;
+            break;
+        case clblasColumnMajor:
+            lda = ::std::max(lda, rowsA);
+            rowsA = lda;
+            ldc = ::std::max(ldc, rowsC);
+            rowsC = ldc;
+            break;
+        }
+
+        printTestParams(order, uplo, transA, N, K, useAlpha, base->alpha(),
+                        offA, lda, useBeta, base->beta(), offC, ldc);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    clblasTranspose transA;
+    size_t N, K;
+    size_t offA, offC;
+    size_t lda, ldc;
+    unsigned int seed;
+
+    bool useAlpha, useBeta;
+    ComplexLong paramAlpha, paramBeta;
+
+    size_t rowsA, columnsA;
+    size_t rowsC, columnsC;
+
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // SYRK_H_
diff --git a/src/tests/include/tbmv.h b/src/tests/include/tbmv.h
new file mode 100644
index 0000000..19063d6
--- /dev/null
+++ b/src/tests/include/tbmv.h
@@ -0,0 +1,145 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TBMV_H_
+#define TBMV_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <blas-random.h>
+#include <blas-math.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized gtest fixture for the TBMV tests. SetUp() unpacks the
+ * test tuple into the members below and getParams() copies them into a
+ * TestParams structure.
+ */
+class TBMV : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        clblasTranspose, // transA
+        clblasDiag,      // diag
+        int,             // N
+        int,             // KL or KU
+        ExtraTestSizes,  // offsets of A and x, lda and incx
+        int              // numCommandQueues
+        > > {
+public:
+    // Clear *params and fill it with the configuration computed by SetUp().
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->diag = diag;
+        params->seed = seed;
+        params->N = N;
+        params->K = KLU;
+        params->lda = lda;
+        params->incx = incx;
+        // The offset of A is stored in both offA and offa.
+        params->offA = offA;
+        params->offa = offA;
+        params->offBX = offx;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /*
+     * Unpack the test tuple, apply the command queue override exposed by
+     * BlasBase, normalise the band width and print the configuration.
+     */
+    virtual void SetUp()
+    {
+        ExtraTestSizes extra;
+
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        transA = ::std::tr1::get<2>(GetParam());
+        diag = ::std::tr1::get<3>(GetParam());
+        N = ::std::tr1::get<4>(GetParam());
+        KLU = ::std::tr1::get<5>(GetParam());
+        extra = ::std::tr1::get<6>(GetParam());
+        offA = extra.offA;
+        offx = extra.offBX;
+        lda = extra.strideA.ld;
+        incx = extra.strideBX.inc;
+        numCommandQueues = ::std::tr1::get<7>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // Fold the band width into [0, N). An empty problem gets a band
+        // width of 0 instead of performing a modulo by zero.
+        KLU = (N != 0) ? (KLU % N) : 0;
+        // lda must be at least KLU + 1.
+        lda = ::std::max(lda, (KLU+1));
+
+        printTestParams(order, uplo, transA, diag, N, KLU, offA,
+                            lda, offx, incx, 0, 1);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasTranspose transA;
+    clblasUplo uplo;
+    clblasDiag diag;
+    size_t N, KLU;
+    size_t lda;
+    int incx;
+    size_t offA, offx;
+    unsigned int seed;
+
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+/*
+ * Fill the storage of matrix A and, when given, vector X with random values
+ * for the TBMV tests.
+ *
+ * N    - problem size; nothing is written when it is 0
+ * A    - buffer receiving N * lda random elements
+ * lda  - leading dimension of A
+ * X    - buffer receiving 1 + (N - 1) * |incx| random elements; may be NULL,
+ *        in which case only A is filled
+ * incx - vector increment; only its absolute value is used
+ */
+template <typename T>
+static void
+randomTbmvMatrices(
+    size_t N,
+    T *A,
+    size_t lda,
+    T *X,
+    int incx
+    )
+{
+    // An empty problem has no elements to generate. Without this guard the
+    // unsigned (N - 1) below would wrap and the X loop could run far past
+    // the end of the buffer.
+    if (N == 0) {
+        return;
+    }
+
+    // Choose bound so that N * bound^2 equals UPPER_BOUND<T>().
+    const cl_double bound = sqrt(UPPER_BOUND<T>() / N);
+
+    const size_t lenA = N * lda;
+    for (size_t i = 0; i < lenA; i++) {
+        A[i] = random<T>(bound);
+    }
+
+    if (X != NULL) {
+        const size_t lenX = 1 + ((N - 1) * abs(incx));
+        for (size_t i = 0; i < lenX; i++) {
+            X[i] = random<T>(bound);
+        }
+    }
+}
+
+#endif  // TBMV_H_
diff --git a/src/tests/include/tbsv.h b/src/tests/include/tbsv.h
new file mode 100644
index 0000000..890e1c2
--- /dev/null
+++ b/src/tests/include/tbsv.h
@@ -0,0 +1,224 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TBSV_H_
+#define TBSV_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <blas-random.h>
+#include <blas-math.h>
+#include <tbmv.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized gtest fixture for the TBSV tests. SetUp() unpacks the
+ * test tuple into the members below and getParams() copies them into a
+ * TestParams structure.
+ */
+class TBSV : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        clblasTranspose, // transA
+        clblasDiag,      // diag
+        int,             // N
+        int,             // KL or KU
+        ExtraTestSizes,  // offsets of A and x, lda and incx
+        int              // numCommandQueues
+        > > {
+public:
+    // Clear *params and fill it with the configuration computed by SetUp().
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->diag = diag;
+        params->seed = seed;
+        params->N = N;
+        params->K = KLU;
+        params->lda = lda;
+        params->incx = incx;
+        // The offset of A is stored in both offA and offa.
+        params->offA = offA;
+        params->offa = offA;
+        params->offBX = offx;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    /*
+     * Unpack the test tuple, apply the command queue override exposed by
+     * BlasBase, normalise the band width and print the configuration.
+     */
+    virtual void SetUp()
+    {
+        ExtraTestSizes extra;
+
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        transA = ::std::tr1::get<2>(GetParam());
+        diag = ::std::tr1::get<3>(GetParam());
+        N = ::std::tr1::get<4>(GetParam());
+        KLU = ::std::tr1::get<5>(GetParam());
+        extra = ::std::tr1::get<6>(GetParam());
+        offA = extra.offA;
+        offx = extra.offBX;
+        lda = extra.strideA.ld;
+        incx = extra.strideBX.inc;
+        numCommandQueues = ::std::tr1::get<7>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        // Fold the band width into [0, N). An empty problem gets a band
+        // width of 0 instead of performing a modulo by zero.
+        KLU = (N != 0) ? (KLU % N) : 0;
+        // lda must be at least KLU + 1.
+        lda = ::std::max(lda, (KLU+1));
+
+        printTestParams(order, uplo, transA, diag, N, KLU, offA,
+                            lda, offx, incx, 0, 1);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasTranspose transA;
+    clblasUplo uplo;
+    clblasDiag diag;
+    size_t N, KLU;
+    size_t lda;
+    int incx;
+    size_t offA, offx;
+    unsigned int seed;
+
+    ::clMath::BlasBase *base;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+/*
+ * Fill banded triangular matrix A and vector X with random data for the
+ * TBSV tests.
+ *
+ * order, uplo - passed to setElementBanded()/getElementBanded() to address A;
+ *               uplo also selects which triangle receives off-diagonal values
+ * diag        - clblasUnit: every diagonal entry is set to FNAN<T>();
+ *               clblasNonUnit: every diagonal entry is random with a module
+ *               of at least 1
+ * N           - number of diagonal entries and of vector elements generated
+ * K           - band width; entries with |i - j| > K are left untouched
+ * A, lda      - banded storage and its leading dimension, passed through to
+ *               the element accessors
+ * X, incx     - vector storage; element i is written at index i * |incx|
+ *
+ * NOTE(review): randomTrsv<T>(limit) is presumably bounded in module by its
+ * argument, which would keep the rows diagonally dominant - confirm against
+ * its definition.
+ */
+template <typename T>
+static void
+randomTbsvMatrices(
+    clblasOrder order,
+	clblasUplo uplo,
+    clblasDiag diag,
+    size_t N,
+    size_t K,
+    T *A,
+    size_t lda,
+    T *X,
+    int incx)
+{
+    size_t i, j;
+    T min, max, x, y;
+    cl_double modMin, modMax, sum, maxDiag;
+
+    min = ZERO<T>();
+    max = ZERO<T>();
+    incx = abs(incx);
+    maxDiag = 1.0;
+
+    cl_double bound;
+    bound = (UPPER_BOUND<T>()/(N));
+
+    switch (diag) {
+    case clblasUnit:
+        for (i = 0; i < N; i++) {
+            // must not be accessed
+            setElementBanded<T>(order, uplo, i, i, K, A, lda, FNAN<T>());
+        }
+        break;
+    case clblasNonUnit:
+        /* Do not allow zeros on A's main diagonal and get a big number which is at least greater than N/4 */
+        // N/4 is an integer division; the threshold is then raised to at least 1.
+        maxDiag = ((N/4) > bound) ? (bound/4) : (N/4);
+        maxDiag = (1 > (maxDiag)) ? 1 : maxDiag;
+        // Redraw until the first diagonal value reaches the threshold.
+        do {
+            max = randomTrsv<T>(bound);
+        } while ((module(max) < (maxDiag)));
+        modMax = module(max);
+        min = max / 100;
+        modMin = module(min);
+        setElementBanded<T>(order, uplo, 0, 0, K, A, lda, max);
+        //printf("Diagonals %d ", max);
+        // Remaining diagonal entries are drawn between modMin and modMax;
+        // a draw with a module below 1 is replaced by max.
+        for (i = 1; i < N; i++) {
+            x = randomTrsv<T>(modMin, modMax);
+            if (module(x) < 1) {
+                x = max;
+            }
+            //printf("%d ", x);
+            /*if(module(x) < 1)
+            {
+                printf("WARNING: Diagonal less than one\n");
+            }*/
+            setElementBanded<T>(order, uplo, i, i, K, A, lda, x);
+        }
+       // printf("\n");
+        break;
+    }
+
+    /* Generate a_{ij} for all j <> i. */
+    for (i = 0; i < N; i++) {
+
+        // sum is the module of this row's diagonal entry (1 for a unit diagonal).
+        if (diag == clblasUnit) {
+            sum = module(ONE<T>());
+        }
+        else {
+            T temp;
+            temp = getElementBanded<T>(order, uplo, i, i, K, A, lda);
+            sum = module(temp);
+        }
+
+        for (j = 0; j < N; j++) {
+            if ((j == i) || (module((int)(i-j)) > ((int)K)) )    // Diagonal and out-of-band elements
+            {
+                continue;
+            }
+
+            // Only the triangle selected by uplo is written.
+            if (((uplo == clblasUpper) && (j > i)) ||
+                ((uplo == clblasLower) && (j < i)))
+            {
+                x = randomTrsv<T>(sum/(K + 1)); //Only K + 1 accumulation not N.
+                setElementBanded<T>(order, uplo, i, j, K, A, lda, x);
+            }
+        }
+    }
+
+    /* Generate matrix X. */
+    // This initial value is overwritten in the first loop iteration below.
+    sum = TRSM_LIMIT_B<T>();
+    for (i = 0; i < N; i++) {
+        if(diag == clblasNonUnit)
+        {
+            sum = module(getElementBanded<T>(order, uplo, i, i, K, A, lda));
+        }
+        else
+        {
+            sum = module(ONE<T>());
+        }
+        y = randomTrsv<T>(sum/(K+1));
+        // X is addressed as a single column-major column of 1 + (N-1)*|incx| rows.
+        setElement<T>(clblasColumnMajor, clblasNoTrans, (i * abs(incx)), 0, X, (1 + (N-1)*abs(incx)), y);
+        // min tracks the generated value with the smallest module; it is not
+        // read again inside this function.
+        if (i == 0) {
+            min = y;
+        }
+        else if (module(y) < module(min)) {
+            min = y;
+        }
+    }
+}
+
+#endif  // TBSV_H_
diff --git a/src/tests/include/test-limits.h b/src/tests/include/test-limits.h
new file mode 100644
index 0000000..e9fa9aa
--- /dev/null
+++ b/src/tests/include/test-limits.h
@@ -0,0 +1,63 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TEST_LIMITS_H_
+#define TEST_LIMITS_H_
+
+// Numeric limits used by the tests, expressed as powers of two.
+// This header includes nothing itself, so pow() must be declared by the
+// including file before any of these macros is expanded.
+// NOTE(review): 2^23 and 2^52 presumably relate to the mantissa widths of
+// float and double - confirm.
+#define FLOAT_UPPER_BOUND   pow(2.0, 23)
+#define DOUBLE_UPPER_BOUND  pow(2.0, 52)
+
+// TRSM-specific limits, returned by TRSM_LIMIT_A<T>() and TRSM_LIMIT_B<T>().
+#define TRSM_FLOAT_LIMIT_A  pow(2.0, 7)
+#define TRSM_DOUBLE_LIMIT_A pow(2.0, 5)
+#define TRSM_FLOAT_LIMIT_B  pow(2.0, 16)
+#define TRSM_DOUBLE_LIMIT_B pow(2.0, 47)
+
+// Type-dependent constants
+//
+// UPPER_BOUND<T>() returns FLOAT_UPPER_BOUND for cl_float and FloatComplex,
+// and DOUBLE_UPPER_BOUND for cl_double and DoubleComplex. Only these four
+// specializations are defined here; the primary template is declared only.
+// __template_static is not defined in this header and must be provided by
+// the including file.
+template <class T>
+static cl_double UPPER_BOUND();
+template<>
+__template_static cl_double UPPER_BOUND<cl_float>() { return FLOAT_UPPER_BOUND; }
+template<>
+__template_static cl_double UPPER_BOUND<cl_double>() { return DOUBLE_UPPER_BOUND;}
+template<>
+__template_static cl_double UPPER_BOUND<FloatComplex>() { return FLOAT_UPPER_BOUND; }
+template<>
+__template_static cl_double UPPER_BOUND<DoubleComplex>()  { return DOUBLE_UPPER_BOUND; }
+
+// TRSM_LIMIT_A<T>() returns TRSM_FLOAT_LIMIT_A for cl_float and FloatComplex,
+// and TRSM_DOUBLE_LIMIT_A for cl_double and DoubleComplex. Only these four
+// specializations are defined here; the primary template is declared only.
+template <class T>
+static cl_double TRSM_LIMIT_A();
+template<>
+__template_static cl_double TRSM_LIMIT_A<cl_float>() { return TRSM_FLOAT_LIMIT_A; }
+template<>
+__template_static cl_double TRSM_LIMIT_A<cl_double>() { return TRSM_DOUBLE_LIMIT_A; }
+template<>
+__template_static cl_double TRSM_LIMIT_A<FloatComplex>() { return TRSM_FLOAT_LIMIT_A; }
+template<>
+__template_static cl_double TRSM_LIMIT_A<DoubleComplex>() { return TRSM_DOUBLE_LIMIT_A; }
+
+// TRSM_LIMIT_B<T>() returns TRSM_FLOAT_LIMIT_B for cl_float and FloatComplex,
+// and TRSM_DOUBLE_LIMIT_B for cl_double and DoubleComplex. Only these four
+// specializations are defined here; the primary template is declared only.
+template <class T>
+static cl_double TRSM_LIMIT_B();
+template<>
+__template_static cl_double TRSM_LIMIT_B<cl_float>() { return TRSM_FLOAT_LIMIT_B; }
+template<>
+__template_static cl_double TRSM_LIMIT_B<cl_double>() { return TRSM_DOUBLE_LIMIT_B; }
+template<>
+__template_static cl_double TRSM_LIMIT_B<FloatComplex>() { return TRSM_FLOAT_LIMIT_B; }
+template<>
+__template_static cl_double TRSM_LIMIT_B<DoubleComplex>() { return TRSM_DOUBLE_LIMIT_B; }
+
+#endif /* TEST_LIMITS_H_ */
diff --git a/src/tests/include/testDG.h b/src/tests/include/testDG.h
new file mode 100644
index 0000000..03a3f37
--- /dev/null
+++ b/src/tests/include/testDG.h
@@ -0,0 +1,56 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#ifndef _TESTDG_H_
+#define _TESTDG_H_
+
+// Coming from testDG.hpp
+
+// Selector for an operation on the triangles of a matrix.
+// NOTE(review): the names read as "lower to upper", "upper to lower" and
+// "swap" -- confirm against the code in testDG.hpp that consumes them.
+enum TRIANGLE_OPERATIONS {
+    LTOU = 0,
+    UTOL = 1,
+    SWAP = 2
+};
+
+
+// Matrix creation flags. Every enumerator occupies its own bit, so several
+// of them can be combined in one integer with bitwise OR.
+enum RealMatrixCreationFlags {
+    //NO_FLAGS          = 0,
+    ROW_MAJOR_ORDER     = 1 << 0,   //   1
+    PACKED_MATRIX       = 1 << 1,   //   2
+    SYMMETRIC_MATRIX    = 1 << 2,   //   4
+    UPPER_HALF_ONLY     = 1 << 3,   //   8
+    LOWER_HALF_ONLY     = 1 << 4,   //  16
+    NO_ALIGNMENT        = 1 << 5,   //  32
+    UNIT_DIAGONAL       = 1 << 6,   //  64
+    RANDOM_INIT         = 1 << 7,   // 128
+    ZERO_DIAGONAL       = 1 << 8    // 256
+};
+
+// Shorthands for setDiagonalUnityOrNonUnity(); the first argument selects
+// the kind of diagonal. All other arguments (data, rows, cols, lda,
+// vectorLength, creationFlags, bound) are taken from the scope in which the
+// macro is expanded.
+#define setDiagonalUnity()  setDiagonalUnityOrNonUnity(1, data, rows, cols, lda, vectorLength, creationFlags, bound) // Unity diagonal
+#define setDiagonalRandom() setDiagonalUnityOrNonUnity(2, data, rows, cols, lda, vectorLength, creationFlags, bound) // Random values
+#define setDiagonalZero()   setDiagonalUnityOrNonUnity(0, data, rows, cols, lda, vectorLength, creationFlags, bound) // Zero diagonal
+
+// Address of element (i,j) inside a packed triangular matrix. T, data, rows
+// and vectorLength are taken from the scope in which the macro is expanded.
+//
+// The row-major forms are the primary ones. The column-major forms are the
+// same formulas with i and j exchanged, which also exchanges the triangle:
+// CMUPacked(i,j) addresses the same slot as RMLPacked(j,i), and
+// CMLPacked(i,j) the same slot as RMUPacked(j,i).
+//
+// Both arguments are evaluated more than once and are fully parenthesized,
+// so expressions such as RMLPacked(r + 1, c - 1) index the intended element.
+#define RMLPacked(i,j) ((T*)data + (((i)*((i)+1))/2 + (j)) * vectorLength)
+#define RMUPacked(i,j) ((T*)data + (((i)*((2*rows) + 1 - (i)))/2 + ((j) - (i))) * vectorLength)
+
+#define CMUPacked(i,j) ((T*)data + (((j)*((j)+1))/2 + (i)) * vectorLength)
+#define CMLPacked(i,j) ((T*)data + (((j)*((2*rows) + 1 - (j)))/2 + ((i) - (j))) * vectorLength)
+
+
+#endif
diff --git a/src/tests/include/timer.h b/src/tests/include/timer.h
new file mode 100644
index 0000000..29353ff
--- /dev/null
+++ b/src/tests/include/timer.h
@@ -0,0 +1,58 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TIMER_H_
+#define TIMER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * nano_time_t is the unsigned integer type used for all time values of this
+ * interface: unsigned long long under MSVC, unsigned long elsewhere.
+ * NANOTIME_MAX is the all-ones value of that type minus one.
+ */
+#if defined(_MSC_VER)
+
+typedef unsigned long long nano_time_t;
+#define NANOTIME_MAX (~0ULL - 1)
+
+#else
+
+typedef unsigned long nano_time_t;
+#define NANOTIME_MAX (~0UL - 1)
+
+#endif
+
+/* NANOTIME_MAX + 1, i.e. the all-ones value, is set aside as error marker */
+#define NANOTIME_ERR (NANOTIME_MAX + 1)
+
+/*
+ * NOTE(review): the implementations are not part of this header. Judging by
+ * the names, the conv2* functions convert a nano_time_t value to the named
+ * unit, getCurrentTime() returns a time stamp and sleepTime() suspends the
+ * caller for t -- confirm units and error behaviour in the implementation.
+ */
+nano_time_t
+conv2millisec(nano_time_t t);
+
+nano_time_t
+conv2microsec(nano_time_t t);
+
+nano_time_t
+conv2nanosec(nano_time_t t);
+
+nano_time_t
+getCurrentTime(void);
+
+void
+sleepTime(nano_time_t t);
+
+#ifdef __cplusplus
+}   /* extern "C" { */
+#endif
+
+#endif  /* TIMER_H_ */
diff --git a/src/tests/include/tpmv.h b/src/tests/include/tpmv.h
new file mode 100644
index 0000000..140543b
--- /dev/null
+++ b/src/tests/include/tpmv.h
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+/*
+ * The TPMV fixture is produced from trmv.h: with TRMV_PACKED defined that
+ * header declares its fixture class under the name TPMV and sets lda to 0.
+ * The macro is removed again right after the inclusion.
+ */
+#ifndef TPMV_H_
+#define TPMV_H_
+#define TRMV_PACKED
+#include "trmv.h"
+#undef  TRMV_PACKED
+#endif
+
diff --git a/src/tests/include/tpsv.h b/src/tests/include/tpsv.h
new file mode 100644
index 0000000..6392ac3
--- /dev/null
+++ b/src/tests/include/tpsv.h
@@ -0,0 +1,25 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+/*
+ * The TPSV fixture is produced from trsv.h: with TRSV_PACKED_ defined that
+ * header declares its fixture class under the name TPSV and sets lda to 0.
+ * The macro is removed again right after the inclusion.
+ */
+#ifndef TPSV_H_
+#define TPSV_H_
+#define TRSV_PACKED_
+#include "trsv.h"
+#undef  TRSV_PACKED_
+#endif
+
diff --git a/src/tests/include/trmm.h b/src/tests/include/trmm.h
new file mode 100644
index 0000000..b2e5482
--- /dev/null
+++ b/src/tests/include/trmm.h
@@ -0,0 +1,160 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TRMM_H_
+#define TRMM_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the TRMM tests.
+ *
+ * SetUp() unpacks the parameter tuple, applies the overrides offered by the
+ * BlasBase singleton (number of command queues, alpha, M, N), derives the
+ * matrix dimensions and leading dimensions and prints the resulting
+ * configuration. getParams() exports that state as a TestParams structure.
+ */
+class TRMM : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,         // order
+        clblasSide,          // side
+        clblasUplo,          // uplo
+        clblasTranspose,     // transA
+        clblasDiag,          // diag
+        int,                    // M
+        int,                    // N
+        ExtraTestSizes,
+        int                     // numCommandQueues
+        > > {
+public:
+    // Fills *params from the fixture state. The structure is zeroed first,
+    // so every field that is not assigned below reads as 0.
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->side = side;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->diag = diag;
+        params->seed = seed;
+        params->M = M;
+        params->N = N;
+        params->offA = offA;
+        params->offBX = offB;
+        params->lda = lda;
+        params->ldb = ldb;
+        params->rowsA = rowsA;
+        params->columnsA = columnsA;
+        params->rowsB = rowsB;
+        params->columnsB = columnsB;
+        // Forward alpha like the TRSM fixture does. paramAlpha is assigned
+        // in SetUp() only when base->useAlpha() is set, so copy it only in
+        // that case and keep the zeroed value otherwise.
+        if (useAlpha) {
+            params->alpha = paramAlpha;
+        }
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()
+    {
+        ExtraTestSizes extra;
+
+        order = ::std::tr1::get<0>(GetParam());
+        side = ::std::tr1::get<1>(GetParam());
+        uplo = ::std::tr1::get<2>(GetParam());
+        transA = ::std::tr1::get<3>(GetParam());
+        diag = ::std::tr1::get<4>(GetParam());
+        M = ::std::tr1::get<5>(GetParam());
+        N = ::std::tr1::get<6>(GetParam());
+        extra = ::std::tr1::get<7>(GetParam());
+        offA = extra.offA;
+        offB = extra.offBX;
+        lda = extra.strideA.ld;
+        ldb = extra.strideBX.ld;
+        numCommandQueues = ::std::tr1::get<8>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        // values forced through BlasBase replace the tuple values
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        useAlpha = base->useAlpha();
+        if (useAlpha != 0) {
+            paramAlpha = base->alpha();
+        }
+        if (base->useM()) {
+            M = base->M();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // A is square: M x M when applied from the left, N x N from the right
+        switch (side) {
+        case clblasLeft:
+            rowsA = M;
+            columnsA = M;
+            break;
+        case clblasRight:
+            rowsA = N;
+            columnsA = N;
+            break;
+        }
+        rowsB = M;
+        columnsB = N;
+
+        // Raise each leading dimension to at least the extent it strides
+        // over, then store it back as that extent of the matrix.
+        switch (order) {
+        case clblasRowMajor:
+            lda = ::std::max(lda, columnsA);
+            columnsA = lda;
+            ldb = ::std::max(ldb, columnsB);
+            columnsB = ldb;
+            break;
+        case clblasColumnMajor:
+            lda = ::std::max(lda, rowsA);
+            rowsA = lda;
+            ldb = ::std::max(ldb, rowsB);
+            rowsB = ldb;
+            break;
+        }
+
+        printTestParams(order, side, uplo, transA, diag, M, N, useAlpha,
+                        base->alpha(), offA, lda, offB, ldb);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasSide side;
+    clblasUplo uplo;
+    clblasTranspose transA;
+    clblasDiag diag;
+    size_t M, N;
+    size_t offA, offB;
+    size_t lda, ldb;
+    unsigned int seed;
+
+    // paramAlpha is meaningful only while useAlpha is true
+    bool useAlpha;
+    ComplexLong paramAlpha;
+
+    size_t rowsA, columnsA;
+    size_t rowsB, columnsB;
+
+    ::clMath::BlasBase *base;
+    // not assigned anywhere inside this class
+    cl_ulong imageA, imageB;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // TRMM_H_
diff --git a/src/tests/include/trmv.h b/src/tests/include/trmv.h
new file mode 100644
index 0000000..204bbce
--- /dev/null
+++ b/src/tests/include/trmv.h
@@ -0,0 +1,124 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * This header is expanded in two modes:
+ *   - plain inclusion declares the fixture class TRMV and is protected
+ *     against repeated inclusion by TRMV_H;
+ *   - inclusion with TRMV_PACKED defined (see tpmv.h) declares the same
+ *     fixture under the name TPMV.
+ * DUPLICIT marks a repeated plain inclusion whose body must be skipped. It
+ * is removed again at the end of the file so that it cannot suppress a later
+ * packed expansion or the body of another header using the same marker.
+ */
+#if !defined(TRMV_PACKED)
+    #ifndef TRMV_H
+        #define TRMV_H
+    #else
+        #define DUPLICIT
+    #endif
+#endif
+
+#ifndef DUPLICIT
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the TRMV (or, in packed mode, TPMV) tests.
+ * SetUp() unpacks the parameter tuple, applies the BlasBase overrides and
+ * prints the configuration; getParams() exports it as TestParams.
+ */
+#ifndef TRMV_PACKED
+class TRMV : public TestWithParam<
+#else
+class TPMV : public TestWithParam<
+#endif
+
+    ::std::tr1::tuple<
+    clblasOrder,     // order
+    clblasUplo,      // uplo
+    clblasTranspose, // transA
+    clblasDiag,      // diag
+    int,                // N
+    int,                // lda, 0 - undefined
+    int,                // incx, should be greater than 0
+    int,                // offa
+    int,                // offx
+    int                 // numCommandQueues
+        > > {
+public:
+    // Copies the fixture state into *params. Fields that are not listed
+    // here are left untouched.
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->diag = diag;
+        params->seed = seed;
+        params->N = N;
+        params->lda = lda;
+        params->incx = incx;
+        params->offa = offa;
+        params->offBX = offx;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        transA = ::std::tr1::get<2>(GetParam());
+        diag = ::std::tr1::get<3>(GetParam());
+        N = ::std::tr1::get<4>(GetParam());
+        lda = ::std::tr1::get<5>(GetParam());
+        incx = ::std::tr1::get<6>(GetParam());
+        offa = ::std::tr1::get<7>(GetParam());
+        offx = ::std::tr1::get<8>(GetParam());
+        numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        // values forced through BlasBase replace the tuple values
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // Clamp lda only after N has received its final value; doing it
+        // earlier could leave lda below an overridden N.
+        #ifndef TRMV_PACKED
+        lda = ::std::max( lda, N );
+        #else
+        lda = 0;
+        #endif
+
+        printTestParams(order, uplo, transA, diag, N, lda, incx, offa, offx);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    clblasTranspose transA;
+    clblasDiag diag;
+    size_t N;
+    size_t lda;
+    int incx;
+    size_t offx, offa;
+    unsigned int seed;
+
+    ::clMath::BlasBase *base;
+    // not assigned anywhere inside this class
+    cl_ulong imageA, imageX;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // DUPLICIT
+
+// the marker is only valid for the expansion that set it
+#undef DUPLICIT
diff --git a/src/tests/include/trsm.h b/src/tests/include/trsm.h
new file mode 100644
index 0000000..e0e90ea
--- /dev/null
+++ b/src/tests/include/trsm.h
@@ -0,0 +1,163 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#ifndef TRSM_H_
+#define TRSM_H_
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+
+using namespace clMath;
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the TRSM tests.
+ *
+ * SetUp() unpacks the parameter tuple, applies the overrides offered by the
+ * BlasBase singleton (number of command queues, alpha, M, N), derives the
+ * matrix dimensions and leading dimensions and prints the resulting
+ * configuration. getParams() exports that state as a TestParams structure.
+ */
+class TRSM : public TestWithParam<
+    ::std::tr1::tuple<
+        clblasOrder,         // order
+        clblasSide,          // side
+        clblasUplo,          // uplo
+        clblasTranspose,     // transA
+        clblasDiag,          // diag
+        int,                    // M
+        int,                    // N
+        ExtraTestSizes,
+        int                     // numCommandQueues
+        > > {
+public:
+    // Fills *params from the fixture state. The structure is zeroed first,
+    // so every field that is not assigned below reads as 0.
+    void getParams(TestParams *params)
+    {
+        memset(params, 0, sizeof(TestParams));
+
+        params->order = order;
+        params->side = side;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->diag = diag;
+        params->seed = seed;
+        params->M = M;
+        params->N = N;
+        params->offA = offA;
+        params->offBX = offB;
+        params->lda = lda;
+        params->ldb = ldb;
+        params->rowsA = rowsA;
+        params->columnsA = columnsA;
+        params->rowsB = rowsB;
+        params->columnsB = columnsB;
+        // paramAlpha is assigned in SetUp() only when base->useAlpha() is
+        // set; copying it otherwise would read an indeterminate value, so
+        // the zeroed field is kept in that case.
+        if (useAlpha) {
+            params->alpha = paramAlpha;
+        }
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()
+    {
+        ExtraTestSizes extra;
+
+        order = ::std::tr1::get<0>(GetParam());
+        side = ::std::tr1::get<1>(GetParam());
+        uplo = ::std::tr1::get<2>(GetParam());
+        transA = ::std::tr1::get<3>(GetParam());
+        diag = ::std::tr1::get<4>(GetParam());
+        M = ::std::tr1::get<5>(GetParam());
+        N = ::std::tr1::get<6>(GetParam());
+        extra = ::std::tr1::get<7>(GetParam());
+        offA = extra.offA;
+        offB = extra.offBX;
+        lda = extra.strideA.ld;
+        ldb = extra.strideBX.ld;
+        numCommandQueues = ::std::tr1::get<8>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        // values forced through BlasBase replace the tuple values
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        useAlpha = base->useAlpha();
+        if (useAlpha != 0) {
+            paramAlpha = base->alpha();
+        }
+        if (base->useM()) {
+            M = base->M();
+        }
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // A is square: M x M when applied from the left, N x N from the right
+        switch (side) {
+        case clblasLeft:
+            rowsA = M;
+            columnsA = M;
+            break;
+        case clblasRight:
+            rowsA = N;
+            columnsA = N;
+            break;
+        }
+        rowsB = M;
+        columnsB = N;
+
+        // Raise each leading dimension to at least the extent it strides
+        // over, then store it back as that extent of the matrix.
+        switch (order) {
+        case clblasRowMajor:
+            lda = ::std::max(lda, columnsA);
+            columnsA = lda;
+            ldb = ::std::max(ldb, columnsB);
+            columnsB = ldb;
+            break;
+        case clblasColumnMajor:
+            lda = ::std::max(lda, rowsA);
+            rowsA = lda;
+            ldb = ::std::max(ldb, rowsB);
+            rowsB = ldb;
+            break;
+        }
+
+        printTestParams(order, side, uplo, transA, diag, M, N, useAlpha,
+                        base->alpha(), offA, lda, offB, ldb);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasSide side;
+    clblasUplo uplo;
+    clblasTranspose transA;
+    clblasDiag diag;
+    size_t M, N;
+    size_t offA, offB;
+    size_t lda, ldb;
+    unsigned int seed;
+
+    // paramAlpha is meaningful only while useAlpha is true
+    bool useAlpha;
+    ComplexLong paramAlpha;
+
+    size_t rowsA, columnsA;
+    size_t rowsB, columnsB;
+
+    ::clMath::BlasBase *base;
+    // not assigned anywhere inside this class
+    cl_ulong imageA;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+#endif  // TRSM_H_
diff --git a/src/tests/include/trsv.h b/src/tests/include/trsv.h
new file mode 100644
index 0000000..6410d11
--- /dev/null
+++ b/src/tests/include/trsv.h
@@ -0,0 +1,123 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+/*
+ * This header is expanded in two modes:
+ *   - plain inclusion declares the fixture class TRSV and is protected
+ *     against repeated inclusion by TRSV_H_;
+ *   - inclusion with TRSV_PACKED_ defined (see tpsv.h) declares the same
+ *     fixture under the name TPSV.
+ * DUPLICIT marks a repeated plain inclusion whose body must be skipped. It
+ * is removed again at the end of the file so that it cannot suppress a later
+ * packed expansion or the body of another header using the same marker.
+ */
+#if !defined(TRSV_PACKED_)
+    #ifndef TRSV_H_
+        #define TRSV_H_
+    #else
+        #define DUPLICIT
+    #endif
+#endif
+
+#ifndef DUPLICIT
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <BlasBase.h>
+#include <blas-math.h>
+
+using ::testing::TestWithParam;
+
+/*
+ * Value-parameterized fixture for the TRSV (or, in packed mode, TPSV) tests.
+ * SetUp() unpacks the parameter tuple, applies the BlasBase overrides and
+ * prints the configuration; getParams() exports it as TestParams.
+ */
+#ifndef TRSV_PACKED_
+class TRSV : public TestWithParam<
+#else
+class TPSV : public TestWithParam<
+#endif
+    ::std::tr1::tuple<
+        clblasOrder,     // order
+        clblasUplo,      // uplo
+        clblasTranspose, // transA
+        clblasDiag,      // diag
+        int,                // N
+        int,                // lda, 0 - undefined
+        int,                // incx, should be greater than 0
+        int,                // offa
+        int,                // offx
+        int                 // numCommandQueues
+        > > {
+public:
+    // Copies the fixture state into *params. Fields that are not listed
+    // here are left untouched.
+    void getParams(TestParams *params)
+    {
+        params->order = order;
+        params->uplo = uplo;
+        params->transA = transA;
+        params->diag = diag;
+        params->seed = seed;
+        params->N = N;
+        params->lda = lda;
+        params->incx = incx;
+        params->offa = offa;
+        params->offBX = offx;
+        params->numCommandQueues = numCommandQueues;
+    }
+
+protected:
+    virtual void SetUp()
+    {
+        order = ::std::tr1::get<0>(GetParam());
+        uplo = ::std::tr1::get<1>(GetParam());
+        transA = ::std::tr1::get<2>(GetParam());
+        diag = ::std::tr1::get<3>(GetParam());
+        N = ::std::tr1::get<4>(GetParam());
+        lda = ::std::tr1::get<5>(GetParam());
+        incx = ::std::tr1::get<6>(GetParam());
+        offa = ::std::tr1::get<7>(GetParam());
+        offx = ::std::tr1::get<8>(GetParam());
+        numCommandQueues = ::std::tr1::get<9>(GetParam());
+
+        base = ::clMath::BlasBase::getInstance();
+        seed = base->seed();
+
+        // values forced through BlasBase replace the tuple values
+        useNumCommandQueues = base->useNumCommandQueues();
+        if (useNumCommandQueues) {
+            numCommandQueues = base->numCommandQueues();
+        }
+
+        if (base->useN()) {
+            N = base->N();
+        }
+
+        // Clamp lda only after N has received its final value; doing it
+        // earlier could leave lda below an overridden N.
+        #ifndef TRSV_PACKED_
+        lda = ::std::max( lda, N );
+        #else
+        lda = 0;
+        #endif
+
+        printTestParams(order, uplo, transA, diag, N, lda, incx, offa, offx);
+        ::std::cerr << "seed = " << seed << ::std::endl;
+        ::std::cerr << "queues = " << numCommandQueues << ::std::endl;
+    }
+
+    clblasOrder order;
+    clblasUplo uplo;
+    clblasTranspose transA;
+    clblasDiag diag;
+    size_t N;
+    size_t lda;
+    int incx;
+    size_t offx, offa;
+    unsigned int seed;
+
+    ::clMath::BlasBase *base;
+    // not assigned anywhere inside this class
+    cl_ulong imageA, imageX;
+
+    bool useNumCommandQueues;
+    cl_uint numCommandQueues;
+};
+
+#endif  // DUPLICIT
+
+// the marker is only valid for the expansion that set it
+#undef DUPLICIT
diff --git a/src/tests/performance/BlasBase-perf.cpp b/src/tests/performance/BlasBase-perf.cpp
new file mode 100644
index 0000000..96b4cc6
--- /dev/null
+++ b/src/tests/performance/BlasBase-perf.cpp
@@ -0,0 +1,118 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <iostream>
+#include <math.h>
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <BlasBase.h>
+
+namespace clMath {
+
+/*
+ * Returns the largest 2D image extent supported by every device of the
+ * context, i.e. the minimum of the per-device limits.
+ *
+ * context      context whose devices are inspected
+ * widthHeight  non-zero selects CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+ *              zero selects CL_DEVICE_IMAGE2D_MAX_WIDTH
+ *
+ * Returns 0 when any OpenCL query fails or the context reports no devices.
+ * The device list is sized from the context itself, so contexts with any
+ * number of devices are handled.
+ */
+static size_t
+imageMaxDimension(cl_context context, int widthHeight)
+{
+    const cl_device_info par = (widthHeight) ? CL_DEVICE_IMAGE2D_MAX_HEIGHT :
+                                               CL_DEVICE_IMAGE2D_MAX_WIDTH;
+    size_t listBytes = 0;
+    cl_int err;
+
+    // first query only asks how large the device list is
+    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &listBytes);
+    const size_t nrDevices = listBytes / sizeof(cl_device_id);
+    if ((err != CL_SUCCESS) || (nrDevices == 0)) {
+        return 0;
+    }
+
+    cl_device_id *devices = new cl_device_id[nrDevices];
+    size_t rc = (size_t)-1;
+
+    err = clGetContextInfo(context, CL_CONTEXT_DEVICES,
+        nrDevices * sizeof(cl_device_id), devices, NULL);
+    // the loop stops at the first failing device query
+    for (size_t i = 0; (i < nrDevices) && (err == CL_SUCCESS); i++) {
+        size_t s;
+
+        err = clGetDeviceInfo(devices[i], par, sizeof(s), &s, NULL);
+        if (err == CL_SUCCESS) {
+            rc = std::min(rc, s);
+        }
+    }
+    delete[] devices;
+
+    return (err == CL_SUCCESS) ? rc : 0;
+}
+
+// Width variant of imageMaxDimension(): passes 0, which selects
+// CL_DEVICE_IMAGE2D_MAX_WIDTH.
+static size_t
+imageMaxWidth(cl_context context)
+{
+    return imageMaxDimension(context, 0);
+}
+
+// Height variant of imageMaxDimension(): passes 1, which selects
+// CL_DEVICE_IMAGE2D_MAX_HEIGHT.
+static size_t
+imageMaxHeight(cl_context context)
+{
+    return imageMaxDimension(context, 1);
+}
+
+/*
+ * Scratch image setup for the performance tests. The whole implementation
+ * is commented out; the function unconditionally returns
+ * clblasNotImplemented. The disabled code is the only user of
+ * imageMaxWidth() and imageMaxHeight() in this file.
+ */
+clblasStatus
+BlasBase::addScratchImages(void)
+{
+    //cl_ulong memSize, allocSize;
+    //size_t width, height;
+    //clblasStatus status;
+    //float scale;
+
+    ///*
+    // * get maximum amount of memory each image can takes, not
+    // * forgetting that it can be up to three matrices residing
+    // * in memory objects
+    // */
+    //allocSize = maxMemAllocSize();
+    //memSize = availGlobalMemSize(0);
+    //if (allocSize > memSize / 5) {
+    //    allocSize = memSize / 5;
+    //    scale = 1.4f;
+    //}
+    //else {
+    //    scale = 1.5f;
+    //}
+
+    //height = static_cast<size_t>(sqrt(static_cast<double>(allocSize) / sizeof(cl_float)));
+    //width  = height / 4;
+    //height = static_cast<size_t>(height / scale);
+    //width  = static_cast<size_t>(width * scale);
+
+    //if (height > imageMaxHeight(context_)) {
+    //    height = imageMaxHeight(context_);
+    //}
+    //if (width > imageMaxWidth(context_)) {
+    //    width = imageMaxWidth(context_);
+    //}
+
+    //imageA_ = clblasAddScratchImage(context_, width, height, &status);
+    //if (imageA_) {
+    //    imageB_ = clblasAddScratchImage(context_, width, height, &status);
+    //}
+
+    //return status;
+	return clblasNotImplemented;
+
+}
+
+}   // namespace
diff --git a/src/tests/performance/PerformanceRecorder.cpp b/src/tests/performance/PerformanceRecorder.cpp
new file mode 100644
index 0000000..d3655c5
--- /dev/null
+++ b/src/tests/performance/PerformanceRecorder.cpp
@@ -0,0 +1,151 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Overall performance recorder implementation
+ */
+
+#include <string.h>
+#include "PerformanceRecorder.h"
+#include <iostream>
+
+using namespace clMath;
+
+// Allocates one record per BlasFunction value below BLAS_FUNCTION_END and
+// clears all of them to zero bytes.
+PerformanceRecorder::PerformanceRecorder()
+{
+    const unsigned int nrRecords = static_cast<unsigned int>(BLAS_FUNCTION_END);
+
+    records_ = new PerfRecord[nrRecords];
+    memset(records_, 0, nrRecords * sizeof(PerfRecord));
+}
+
+// Releases the record array.
+PerformanceRecorder::~PerformanceRecorder()
+{
+    delete[] records_;
+}
+
+/*
+ * Adds one run of the reference ("etalon") implementation to the record of
+ * fn. The same value, size / us / 1000, is accumulated into both the GFlops
+ * and the Gbps sum, and the run counter is incremented.
+ * NOTE(review): us == 0 yields an infinite rate -- confirm callers never
+ * pass a zero duration.
+ */
+void
+PerformanceRecorder::etalonRegPerf(
+    BlasFunction fn,
+    unsigned long us,
+    problem_size_t size)
+{
+    PerfRecord &rec = records_[static_cast<int>(fn)];
+    const gflops_t rate = (static_cast<gflops_t>(size) / us) / 1000;
+
+    rec.etalonGFlops += rate;
+    rec.etalonGbps += rate;
+    rec.etalonNrRuns++;
+}
+
+/*
+ * Adds one run of the clblas implementation to the record of fn and prints
+ * the rate of this run to stderr. The same value, size / us / 1000, is
+ * accumulated into both the GFlops and the Gbps sum. The printed label is
+ * GBPS for functions whose BLAS level is 2 or 1 and GFLOPS otherwise.
+ * NOTE(review): us == 0 yields an infinite rate -- confirm callers never
+ * pass a zero duration.
+ */
+void
+PerformanceRecorder::clblasRegPerf(
+    BlasFunction fn,
+    unsigned long us,
+    problem_size_t size)
+{
+    PerfRecord &rec = records_[static_cast<int>(fn)];
+    const gflops_t rate = (static_cast<gflops_t>(size) / us) / 1000;
+
+    rec.clblasGFlops += rate;
+    rec.clblasGbps += rate;
+
+    // display metrics in GBps if it is a BLAS-2/1 function
+    const bool showGbps = (functionBlasLevel(fn) == 2) ||
+                          (functionBlasLevel(fn) == 1);
+    const char *label = showGbps ? "clBlas GBPS : " : "clBlas GFLOPS : ";
+
+    std::cerr << label << rate << std::endl << std::endl << std::endl;
+
+    rec.clblasNrRuns++;
+}
+
+// Adds one time ratio sample to the record of fn.
+void
+PerformanceRecorder::regTimeRatio(BlasFunction fn, double ratio)
+{
+    PerfRecord &rec = records_[static_cast<int>(fn)];
+
+    rec.nrRatios++;
+    rec.timeRatio += ratio;
+}
+
+// Mean GFlops value over the registered reference runs of fn; without any
+// registered run the raw accumulator is returned.
+gflops_t
+PerformanceRecorder::etalonAvgPerf(BlasFunction fn)
+{
+    const PerfRecord &rec = records_[static_cast<int>(fn)];
+
+    if (rec.etalonNrRuns == 0) {
+        return rec.etalonGFlops;
+    }
+    return rec.etalonGFlops / rec.etalonNrRuns;
+}
+
+// Mean GFlops value over the registered clblas runs of fn; without any
+// registered run the raw accumulator is returned.
+gflops_t
+PerformanceRecorder::clblasAvgPerf(BlasFunction fn)
+{
+    const PerfRecord &rec = records_[static_cast<int>(fn)];
+
+    if (rec.clblasNrRuns == 0) {
+        return rec.clblasGFlops;
+    }
+    return rec.clblasGFlops / rec.clblasNrRuns;
+}
+
+// Mean Gbps value over the registered reference runs of fn; without any
+// registered run the raw accumulator is returned.
+gbps_t
+PerformanceRecorder::etalonAvgGbpsPerf(BlasFunction fn)
+{
+    const PerfRecord &rec = records_[static_cast<int>(fn)];
+
+    if (rec.etalonNrRuns == 0) {
+        return rec.etalonGbps;
+    }
+    return rec.etalonGbps / rec.etalonNrRuns;
+}
+
+// Mean Gbps value over the registered clblas runs of fn; without any
+// registered run the raw accumulator is returned.
+gbps_t
+PerformanceRecorder::clblasAvgGbpsPerf(BlasFunction fn)
+{
+    const PerfRecord &rec = records_[static_cast<int>(fn)];
+
+    if (rec.clblasNrRuns == 0) {
+        return rec.clblasGbps;
+    }
+    return rec.clblasGbps / rec.clblasNrRuns;
+}
+
+
+// Mean of the time ratios registered for fn; without any registered ratio
+// the raw accumulator is returned.
+double
+PerformanceRecorder::avgTimeRatio(BlasFunction fn)
+{
+    const PerfRecord &rec = records_[static_cast<int>(fn)];
+
+    if (rec.nrRatios == 0) {
+        return rec.timeRatio;
+    }
+    return rec.timeRatio / rec.nrRatios;
+}
diff --git a/src/tests/performance/PerformanceRecorder.h b/src/tests/performance/PerformanceRecorder.h
new file mode 100644
index 0000000..8c2ce1c
--- /dev/null
+++ b/src/tests/performance/PerformanceRecorder.h
@@ -0,0 +1,93 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Overall performance recorder definition
+ */
+
+#ifndef PERFORMANCERECORDER_H_
+#define PERFORMANCERECORDER_H_
+
+#include <clBLAS.h>
+#include <common.h>
+
+// NOTE(review): not referenced by the recorder declared in this header;
+// presumably a per-function limit used by the performance tests -- confirm.
+enum {
+    MAX_TIMES_PER_FUNCTION = 3
+};
+
+namespace clMath {
+
+// performance figures are kept as double precision values
+typedef double gflops_t;
+typedef double gbps_t;
+
+// unsigned 64-bit problem size; MSVC gets the built-in type instead of
+// uint64_t
+#if defined(_MSC_VER)
+typedef unsigned long long problem_size_t;
+#else
+typedef uint64_t problem_size_t;
+#endif
+
+/*
+ * Accumulates performance figures per BLAS function, separately for the
+ * reference ("etalon") implementation and for clblas, and reports their
+ * averages. Objects own a heap array of records and cannot be copied.
+ */
+class PerformanceRecorder {
+public:
+    PerformanceRecorder();
+    virtual ~PerformanceRecorder();
+
+    // register etalon function execution time (us, in microseconds) for a
+    // problem of the given size
+    void etalonRegPerf(BlasFunction fn, unsigned long us, problem_size_t size);
+
+    // register clblas function execution time (us, in microseconds) for a
+    // problem of the given size
+    void clblasRegPerf(BlasFunction fn, unsigned long us, problem_size_t size);
+
+    /*
+     * register the time ratio of the clblas function against the
+     * reference implementation
+     */
+    void regTimeRatio(BlasFunction fn, double ratio);
+
+    // get etalon function average performance in giga-flops and gbps
+    gflops_t etalonAvgPerf(BlasFunction fn);
+    gbps_t etalonAvgGbpsPerf(BlasFunction fn);
+
+    // get clblas function average performance in giga-flops and gbps
+    gflops_t clblasAvgPerf(BlasFunction fn);
+    gbps_t clblasAvgGbpsPerf(BlasFunction fn);
+
+    /*
+     * get average time ratio of a clblas function against
+     * the reference implementation
+     */
+    double avgTimeRatio(BlasFunction fn);
+
+private:
+    // records_ is released with delete[] in the destructor, so a copy would
+    // free the same array twice. Copying is disabled: both operations are
+    // declared here and intentionally never defined.
+    PerformanceRecorder(const PerformanceRecorder&);
+    PerformanceRecorder& operator=(const PerformanceRecorder&);
+
+    // running sums and sample counters of one BLAS function
+    struct PerfRecord {
+        gflops_t etalonGFlops;
+        gflops_t clblasGFlops;
+        gbps_t etalonGbps;
+        gbps_t clblasGbps;
+        unsigned int etalonNrRuns;
+        unsigned int clblasNrRuns;
+        double timeRatio;
+        unsigned int nrRatios;
+    };
+
+    // array indexed by the integer value of BlasFunction
+    PerfRecord *records_;
+};
+
+} // namespace clMath
+
+// recorder instance shared by the performance tests; defined in another
+// translation unit
+extern clMath::PerformanceRecorder *perfRecorder;
+
+#endif /* PERFORMANCERECORDER_H_ */
diff --git a/src/tests/performance/PerformanceTest.cpp b/src/tests/performance/PerformanceTest.cpp
new file mode 100644
index 0000000..3726ae3
--- /dev/null
+++ b/src/tests/performance/PerformanceTest.cpp
@@ -0,0 +1,133 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Basic performance test case class implementation
+ */
+
+#include <clBLAS.h>
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include <common.h>
+#include "PerformanceTest.h"
+
+#include "timer.h"
+
+using namespace std;
+using namespace clMath;
+
+enum {
+    NUMBER_TEST_RUNS = 5 // 1000
+};
+
+int PerformanceTest::run(int opFactor)
+{
+    int i;
+    nano_time_t t1, t2;
+    nano_time_t time = NANOTIME_MAX;
+    // a non-zero result from prepare() aborts the run with -1
+    if (prepare()) {
+        return -1;
+    }
+
+    /*
+     * Run the reference (etalon) and the tested routine several times each
+     * and keep the minimum time, to reduce the influence of delays
+     * introduced by other OS components or applications
+     */
+
+    t1 = NANOTIME_MAX;
+    for (i = 0; (i < NUMBER_TEST_RUNS) && (time != NANOTIME_ERR); i++) {
+        time = etalonPerfSingle();
+        if (time < t1) {
+            t1 = time;
+        }
+    }
+
+    t2 = NANOTIME_MAX;
+    for (i = 0; (i < NUMBER_TEST_RUNS) && (time != NANOTIME_ERR); i++) {
+        time = clblasPerfSingle();
+        if (time < t2) {
+            t2 = time;
+        }
+    }
+    // a failure in either loop stops the measurements with time == NANOTIME_ERR
+    if (time == NANOTIME_ERR) {
+        return -1;
+    }
+
+    t1 = conv2microsec(t1);
+    t2 = conv2microsec(t2);
+
+	#ifdef PERF_TEST_WITH_ACML
+           std::cerr << "Acml ";
+    #endif
+    // level 1 and 2 routines are reported in microseconds, others in milliseconds
+	if ( (functionBlasLevel(function_) == 2) || (functionBlasLevel(function_) == 1) ) {
+        cerr << "reference function has worked in " << t1 <<
+                " microseconds, clBlas function has worked in " << t2 <<
+                " microseconds";
+    }
+    else {
+        cerr << "reference function has worked in " << t1 / 1000 <<
+                " milliseconds, clBlas function has worked in " << t2 / 1000 <<
+                " milliseconds";
+
+    }
+    if (t2 != 0) {
+        cerr << ", time ratio is " << (double)t1 / t2;
+    }
+    cerr << endl;
+
+    perfRecorder->etalonRegPerf(function_, static_cast<unsigned long>(t1),
+                                prob_size_ * opFactor);
+    perfRecorder->clblasRegPerf(function_, static_cast<unsigned long>(t2),
+                                prob_size_ * opFactor);
+    if (t2 != 0) {
+        perfRecorder->regTimeRatio(function_, (double)t1 / t2);
+    }
+
+    /*
+     * The check that the CLBLAS version is not slower than the reference
+     * one is compiled out below, so a successful run always returns 0
+     */
+#if 0
+    return !(t2 <= t1);
+#else
+    return 0;
+#endif
+}
+
+int PerformanceTest::prepare(void)
+{
+    // base-class stub: always reports failure (-1)
+    return -1;
+}
+
+nano_time_t PerformanceTest::etalonPerfSingle(void)
+{
+    // base-class stub: no reference run is done, always returns NANOTIME_ERR
+    return NANOTIME_ERR;
+}
+
+nano_time_t PerformanceTest::clblasPerfSingle(void)
+{
+    // base-class stub: no clblas run is done, always returns NANOTIME_ERR
+    return NANOTIME_ERR;
+}
+
diff --git a/src/tests/performance/PerformanceTest.h b/src/tests/performance/PerformanceTest.h
new file mode 100644
index 0000000..df93b9d
--- /dev/null
+++ b/src/tests/performance/PerformanceTest.h
@@ -0,0 +1,59 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Basic performance test case class declaration
+ */
+
+#ifndef PERFORMANCE_TEST_H_
+#define PERFORMANCE_TEST_H_
+
+#include <common.h>
+#include "timer.h"
+#include "PerformanceRecorder.h"
+
+enum {
+    MAX_ZMATRIX_SIZE = 3072
+};
+
+namespace clMath {
+
+class PerformanceTest {
+public:
+    PerformanceTest(BlasFunction function, problem_size_t prob_size) :
+            function_(function), prob_size_(prob_size) { };
+    virtual ~PerformanceTest() { }
+
+    /*
+     * Returns -1 on a runtime error and 0 otherwise (the "slower than the
+     * reference" result of 1 is currently compiled out in PerformanceTest.cpp)
+     *
+     * @opFactor: scaling factor showing number of operations per each element
+     */
+    int run(int opFactor);
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+private:
+    BlasFunction function_;
+    problem_size_t prob_size_;
+};
+
+} // namespace clMath
+
+#endif /* PERFORMANCE_TEST_H_ */
diff --git a/src/tests/performance/TrxmPerformanceTest.cpp b/src/tests/performance/TrxmPerformanceTest.cpp
new file mode 100644
index 0000000..6ee272d
--- /dev/null
+++ b/src/tests/performance/TrxmPerformanceTest.cpp
@@ -0,0 +1,362 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Performance test case class implementation for
+ * TRMM and TRSM routines
+ */
+
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+namespace clMath {
+
+template <typename ElemType> class TrxmPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~TrxmPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+    // Creates a test case for fn/params, runs it and asserts on the result
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        TrxmPerformanceTest<ElemType> *perfCase;
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+        // S/D routines use factor 1, all other (presumably complex) ones 4
+        if (fn == FN_STRMM || fn == FN_DTRMM ||
+            fn == FN_STRSM || fn == FN_DTRSM) {
+
+            opFactor = 1;
+        }
+        else {
+            opFactor = 4;
+        }
+
+        if ((fn == FN_DTRMM || fn == FN_ZTRMM ||
+             fn == FN_DTRSM || fn == FN_ZTRSM) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+        // the constructor allocates the host matrices before the resource check
+        perfCase = new TrxmPerformanceTest<ElemType>(fn, params);
+        if (!perfCase->areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient "
+                         "resources" << std::endl;
+        }
+        else {
+            ret = perfCase->run(opFactor);
+        }
+
+        delete perfCase;
+        // ret is still 0 when the case was skipped, so both checks pass then
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                                     "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    TrxmPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType *A_;
+    ElemType *B_;
+    ElemType *backB_;
+    cl_mem mobjA_;
+    cl_mem mobjB_;
+    ::clMath::BlasBase *base_;
+    bool isTrsm_;
+
+    static problem_size_t problemSize(TestParams *params);
+};
+
+} // namespace
+
+template <typename ElemType>
+TrxmPerformanceTest<ElemType>::TrxmPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, problemSize(params)),
+                params_(*params), mobjA_(NULL), mobjB_(NULL)
+{
+    A_ = new ElemType[params_.rowsA * params_.columnsA];
+    B_ = new ElemType[params_.rowsB * params_.columnsB];
+    backB_ = new ElemType[params_.rowsB * params_.columnsB];
+    // NOTE(review): any id >= FN_STRSM counts as TRSM; relies on the enum order - confirm
+    base_ = ::clMath::BlasBase::getInstance();
+    isTrsm_ = (static_cast<int>(fn) >= FN_STRSM);
+}
+
+template <typename ElemType>
+TrxmPerformanceTest<ElemType>::~TrxmPerformanceTest()
+{
+    delete[] A_;
+    delete[] B_;
+    delete[] backB_;
+    // the buffers stay NULL when prepare() never ran or failed part-way
+    if (mobjB_ != NULL) { clReleaseMemObject(mobjB_); }
+    if (mobjA_ != NULL) { clReleaseMemObject(mobjA_); }
+}
+
+/*
+ * Check that the M x N matrix B and the square triangular matrix A fit the
+ * device memory limits (and the scratch image, when images are in use)
+ */
+template <typename ElemType> bool
+TrxmPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize, maxMatrSize;
+    bool ret = true;
+    size_t m = params->M, n = params->N;
+    size_t asize;
+    clblasSide side = params->side;
+    // asize is the order of the square matrix A: M for the left side, else N
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+    asize = (side == clblasLeft) ? m : n;
+
+    if (base->useImages()) {
+        size_t iw;
+
+        // 2 images take 1/5 of gmemSize each; the rest is split over 2 buffers
+        maxMatrSize = 3 * gmemSize / 10;
+        iw = base->scratchImageWidth() * sizeof(cl_float4) / sizeof(ElemType);
+
+        if (isTrsm_) {
+            size_t ih, nb;
+
+            // check that matrix A fits into the image when stored in 32x32 blocks
+            ih = base->scratchImageHeight();
+            nb = asize / 32 + (asize % 32 != 0);
+            ret = ((asize * asize + nb * 32 * 32) / 2 < iw * ih);
+        }
+        else {
+            ret = (std::max(n, asize) < iw);
+        }
+    }
+    else {
+        maxMatrSize = gmemSize / 2;
+    }
+    maxMatrSize = std::min(maxMatrSize, allocSize);
+    // both matrices must be strictly smaller than the per-matrix limit
+    if (ret) {
+        ret = ((m * n * sizeof(ElemType) < maxMatrSize) &&
+               (asize * asize * sizeof(ElemType) < maxMatrSize));
+    }
+
+    return ret;
+}
+
+// Fills A and B with random data, backs B up and creates the device buffers
+template <typename ElemType> int
+TrxmPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t sizeA = params_.rowsA * params_.columnsA * sizeof(ElemType);
+    const size_t sizeB = params_.rowsB * params_.columnsB * sizeof(ElemType);
+    bool useAlpha = base_->useAlpha();
+    if (useAlpha) {
+        alpha_ = convertMultiplier<ElemType>(base_->alpha());
+    }
+    if (isTrsm_) {
+        randomTrsmMatrices<ElemType>(params_.order, params_.side, params_.uplo,
+            params_.diag, params_.M, params_.N, useAlpha,
+            &alpha_, A_, params_.lda, B_, params_.ldb);
+    }
+    else {
+        randomTrmmMatrices<ElemType>(params_.order, params_.side, params_.uplo,
+            params_.diag, params_.M, params_.N, useAlpha,
+            &alpha_, A_, params_.lda, B_, params_.ldb);
+    }
+    // every timed run restores B from backB_, so the backup needs real data
+    memcpy(backB_, B_, sizeB);
+
+    mobjA_ = base_->createEnqueueBuffer(A_, sizeA,
+                                        params_.offA * sizeof(ElemType),
+                                        CL_MEM_READ_ONLY);
+    if (mobjA_) {
+        mobjB_ = base_->createEnqueueBuffer(backB_, sizeB,
+                                            params_.offBX * sizeof(ElemType),
+                                            CL_MEM_READ_WRITE);
+    }
+    return (mobjB_) ? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+TrxmPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+    size_t lda, ldb;
+    // times one reference run; the result stays 0 when built without ACML
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+    // restore B from its backup copy before the run
+    memcpy(B_, backB_, params_.rowsB * params_.columnsB *
+           sizeof(ElemType));
+    order = params_.order;
+    lda = params_.lda;
+    ldb = params_.ldb;
+
+#ifdef PERF_TEST_WITH_ACML
+    // a row-major problem is passed on as column-major, lda/ldb taken from M and N
+    if (order == clblasRowMajor) {
+        order = clblasColumnMajor;
+        if (params_.side == clblasLeft) {
+            lda = params_.M;
+        }
+        else {
+            lda = params_.N;
+        }
+        ldb = params_.M;
+    }
+
+    time = getCurrentTime();
+    if (isTrsm_) {
+        clMath::blas::trsm(order, params_.side, params_.uplo,
+                        params_.transA, params_.diag,
+                        params_.M, params_.N,
+                        alpha_, A_, lda, B_, ldb);
+    }
+    else {
+        clMath::blas::trmm(order, params_.side, params_.uplo,
+                        params_.transA, params_.diag,
+                        params_.M, params_.N,
+                        alpha_, A_, lda, B_, ldb);
+    }
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+/*
+ * Time a single run of the clblas TRMM/TRSM routine on the first command
+ * queue; returns the elapsed time or NANOTIME_ERR on any OpenCL failure
+ */
+template <typename ElemType> nano_time_t
+TrxmPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    // re-upload the backup copy of B before every run
+    status = clEnqueueWriteBuffer(queue, mobjB_, CL_TRUE, 0,
+                                  params_.rowsB * params_.columnsB *
+                                  sizeof(ElemType), backB_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix B buffer object enqueuing error, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // the write event is not needed any more; release it so it is not leaked
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+    if (isTrsm_) {
+        status = (cl_int)clMath::clblas::trsm(params_.order, params_.side,
+            params_.uplo, params_.transA, params_.diag, params_.M, params_.N,
+            alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX,
+            params_.ldb, 1, &queue, 0, NULL, &event);
+    }
+    else {
+        status = (cl_int)clMath::clblas::trmm(params_.order, params_.side,
+            params_.uplo, params_.transA, params_.diag, params_.M, params_.N,
+            alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX,
+            params_.ldb, 1, &queue, 0, NULL, &event);
+    }
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS TRXM function failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+    return time;
+}
+
+// Problem size K * K * L, where K is the dimension selected by the side
+template <typename ElemType> problem_size_t
+TrxmPerformanceTest<ElemType>::problemSize(TestParams *params)
+{
+    const bool rightSide = (params->side == clblasRight);
+
+    // squared dimension: N for the right side, M otherwise
+    const problem_size_t squared =
+        static_cast<problem_size_t>(rightSide ? params->N : params->M);
+    const problem_size_t other =
+        static_cast<problem_size_t>(rightSide ? params->M : params->N);
+
+    return squared * squared * other;
+}
diff --git a/src/tests/performance/perf-asum.cpp b/src/tests/performance/perf-asum.cpp
new file mode 100644
index 0000000..225e163
--- /dev/null
+++ b/src/tests/performance/perf-asum.cpp
@@ -0,0 +1,300 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <asum.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class AsumPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~AsumPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+    // Creates a test case for fn/params, runs it and asserts on the result
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        AsumPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+        // the problem size is handed to run() unscaled
+        opFactor =1;
+
+        if (((fn == FN_DASUM) || (fn == FN_DZASUM)) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+        // a failed host allocation in the constructor is reported here as well
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    AsumPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType *blasX_;
+    cl_mem mobjX_;
+	cl_mem mobjAsum_;
+	cl_mem scratchBuff;
+    size_t  lengthX;
+    ::clMath::BlasBase *base_;
+};
+
+// Sizes the strided vector X and tries to allocate its host copy; the problem
+// size reported to the base class is the N * sizeof(ElemType) bytes of X
+template <typename ElemType>
+AsumPerformanceTest<ElemType>::AsumPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) :
+        PerformanceTest(fn,
+            (problem_size_t)((1 * params->N) * sizeof(ElemType))),
+        params_(*params), blasX_(NULL), mobjX_(NULL), mobjAsum_(NULL),
+        scratchBuff(NULL), lengthX(1 + (params->N - 1) * abs(params_.incx))
+{
+    try {
+        blasX_ = new ElemType[lengthX + params_.offBX];
+    }
+    catch (const bad_alloc &) {
+        // leave everything NULL; areResourcesSufficient() reports the failure
+        blasX_ = NULL;
+        mobjX_ = mobjAsum_ = scratchBuff = NULL;
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Frees the host vector and releases whichever OpenCL buffers were created.
+ * Buffers that were never created (e.g. when prepare() did not run) are
+ * still NULL and are skipped
+ */
+template <typename ElemType>
+AsumPerformanceTest<ElemType>::~AsumPerformanceTest()
+{
+    // device buffers, listed in the order they get released
+    cl_mem buffers[] = { mobjX_, mobjAsum_, scratchBuff };
+    const size_t count = sizeof(buffers) / sizeof(buffers[0]);
+
+    // delete[] on a NULL pointer is a no-op, so no guard is needed
+    delete[] blasX_;
+
+    for (size_t i = 0; i < count; i++) {
+        if (buffers[i] != NULL) {
+            clReleaseMemObject(buffers[i]);
+        }
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to run the test case,
+ * counting the vector X, the result buffer and the scratch buffer
+ */
+template <typename ElemType> bool
+AsumPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t sizeX, sizeAsum, sizeScratch;
+    // the host allocation failed in the constructor
+    if (blasX_ == NULL) {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+    sizeX = (lengthX + params->offBX) * sizeof(ElemType);
+    sizeAsum = (1 + params->offa) * sizeof(ElemType);
+    // prepare() also creates a scratch buffer of lengthX elements
+    sizeScratch = lengthX * sizeof(ElemType);
+
+    return (sizeX < allocSize) && (sizeAsum < allocSize) &&
+           (sizeScratch < allocSize) &&
+           ((sizeX + sizeAsum + sizeScratch) < gmemSize);
+}
+
+template <typename ElemType> int
+AsumPerformanceTest<ElemType>::prepare(void)
+{
+    // fill X with random data, then create the X, result and scratch buffers
+    randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (ElemType*)NULL, 0, true);
+
+	mobjX_ = base_->createEnqueueBuffer(blasX_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+	mobjAsum_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+	scratchBuff = base_->createEnqueueBuffer(NULL, ((lengthX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+    // fail unless all three buffers were created
+    return ((mobjX_ != NULL) &&  (mobjAsum_ != NULL)&& (scratchBuff != NULL) )? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+AsumPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    // without ACML there is no reference run and the reported time is 0
+#ifdef PERF_TEST_WITH_ACML
+
+	time = getCurrentTime();
+	clMath::blas::asum(params_.N, blasX_, params_.offBX, params_.incx );
+	time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+// Average time of 'iter' back-to-back clblas ASUM calls, or NANOTIME_ERR
+template <typename ElemType> nano_time_t
+AsumPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    DataType type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+										( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    clFinish( queue);
+    time = getCurrentTime();
+
+#define TIMING
+#ifdef TIMING
+    int iter = 100;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::asum( type, params_.N, mobjAsum_, params_.offa, mobjX_, params_.offBX, params_.incx,
+                             scratchBuff, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS ASUM function failed, status = " << status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        // completion is awaited on the queue, so drop the per-call event
+        if (event != NULL) { clReleaseEvent(event); event = NULL; }
+    } // iter loop
+    status = clFinish( queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+// asum performance tests: one parameterized case per element type
+TEST_P(ASUM, sasum)
+{
+    TestParams params;
+
+    getParams(&params);
+    AsumPerformanceTest<float>::runInstance(FN_SASUM, &params);
+}
+
+
+TEST_P(ASUM, dasum)
+{
+    TestParams params;
+
+    getParams(&params);
+    AsumPerformanceTest<double>::runInstance(FN_DASUM, &params);
+}
+
+TEST_P(ASUM, scasum)
+{
+    TestParams params;
+
+    getParams(&params);
+    AsumPerformanceTest<FloatComplex>::runInstance(FN_SCASUM, &params);
+}
+
+
+TEST_P(ASUM, dzasum)
+{
+    TestParams params;
+
+    getParams(&params);
+    AsumPerformanceTest<DoubleComplex>::runInstance(FN_DZASUM, &params);
+}
diff --git a/src/tests/performance/perf-axpy.cpp b/src/tests/performance/perf-axpy.cpp
new file mode 100644
index 0000000..6c86389
--- /dev/null
+++ b/src/tests/performance/perf-axpy.cpp
@@ -0,0 +1,344 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * AXPY performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <axpy.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class AxpyPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~AxpyPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+    // Creates a test case for fn/params, runs it and asserts on the result
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        AxpyPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+        // the problem size is handed to run() unscaled
+        opFactor =1;
+
+        if (((fn == FN_DAXPY) || (fn == FN_ZAXPY)) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+        // a failed host allocation in the constructor is reported here as well
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    AxpyPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType *X_;
+    ElemType *Y_;
+    ElemType *blasX_;
+    ElemType *blasY_;
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    size_t  lengthX;
+    size_t  lengthY;
+    ::clMath::BlasBase *base_;
+};
+
+// Allocates the host copies of X and Y; all stay NULL if any allocation fails
+template <typename ElemType>
+AxpyPerformanceTest<ElemType>::AxpyPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (3 * params->N)  * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL)
+{
+    X_ = blasX_ = Y_ = blasY_ = NULL;
+    lengthX = 1 + (params->N - 1) * abs(params_.incx);
+    lengthY = 1 + (params->N - 1) * abs(params_.incy);
+
+    try {
+        X_ = new ElemType[lengthX + params_.offBX];
+        blasX_ = new ElemType[lengthX + params_.offBX];
+        Y_ = new ElemType[lengthY + params_.offCY];
+        blasY_ = new ElemType[lengthY + params_.offCY];
+    }
+    catch (const bad_alloc &) {
+        // free whatever was allocated before the failing new[] (no leak)
+        delete[] X_;
+        delete[] blasX_;
+        delete[] Y_;
+        delete[] blasY_;
+        X_ = blasX_ = Y_ = blasY_ = NULL;
+    }
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Frees the host vectors and releases the OpenCL buffers that were created.
+ * Anything that was never allocated is still NULL and is skipped; for the
+ * host arrays this needs no guard, as delete[] on NULL is a no-op
+ */
+template <typename ElemType>
+AxpyPerformanceTest<ElemType>::~AxpyPerformanceTest()
+{
+    // vector X: host copies first, then the device buffer
+    delete[] X_;
+    delete[] blasX_;
+
+    // the device buffer exists only if it was created successfully
+    if (mobjX_ != NULL)
+    {
+        clReleaseMemObject(mobjX_);
+    }
+
+    // vector Y: same order
+    delete[] Y_;
+    delete[] blasY_;
+
+    // the device buffer exists only if it was created successfully
+    if (mobjY_ != NULL)
+    {
+        clReleaseMemObject(mobjY_);
+    }
+}
+
+/*
+ * Tell whether the test case can run: the host vectors must have been
+ * allocated and the combined size of X and Y must stay below both the
+ * maximum allocation size and the available global memory
+ */
+template <typename ElemType> bool
+AxpyPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // the constructor leaves the pointers NULL when a host allocation failed
+    const bool hostReady = (X_ != NULL) && (blasX_ != NULL) &&
+                           (Y_ != NULL) && (blasY_ != NULL);
+
+    if (!hostReady) {
+        return false;
+    }
+
+    // X and Y are checked together, offsets included
+    clMath::BlasBase *dev = clMath::BlasBase::getInstance();
+    const size_t globalBytes = (size_t)dev->availGlobalMemSize( 0 );
+    const size_t maxAllocBytes = (size_t)dev->maxMemAllocSize();
+    const size_t neededBytes =
+        (lengthX + params->offBX + lengthY + params->offCY) * sizeof(ElemType);
+
+    // strict comparisons: a request equal to a limit is rejected
+    return (neededBytes < maxAllocBytes) && (neededBytes < globalBytes);
+}
+
+template <typename ElemType> int
+AxpyPerformanceTest<ElemType>::prepare(void)
+{
+    // generate X and Y, copy them for the reference run, create the device buffers
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    randomVectors(params_.N, (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy);
+    memcpy(blasX_, X_, (lengthX + params_.offBX)* sizeof(ElemType));
+    memcpy(blasY_, Y_, (lengthY + params_.offCY)* sizeof(ElemType));
+	mobjX_ = base_->createEnqueueBuffer(X_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_ONLY);
+	mobjY_ = base_->createEnqueueBuffer(Y_, ((lengthY + params_.offCY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+    // fail unless both buffers were created
+    return ((mobjX_ != NULL) && (mobjY_ != NULL))? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+AxpyPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    // without ACML there is no reference run and the reported time is 0
+#ifdef PERF_TEST_WITH_ACML
+
+		time = getCurrentTime();
+		clMath::blas::axpy(params_.N, alpha_, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy);
+		time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+// Average time of 'iter' back-to-back clblas AXPY calls, or NANOTIME_ERR
+template <typename ElemType> nano_time_t
+AxpyPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  (lengthX + params_.offBX) * sizeof(ElemType), X_, 0, NULL, &event);
+    if (status == CL_SUCCESS) {
+        clReleaseEvent(event);      // only the last write is waited on below
+        status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  (lengthY + params_.offCY) * sizeof(ElemType), Y_, 0, NULL, &event);
+    }
+    if (status != CL_SUCCESS) {
+        cerr << "mobjX_ or mobjY_ buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);          // not needed any more, do not leak it
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    int iter = 50;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::axpy(params_.N, alpha_, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy,
+                            1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS AXPY function failed, status = " << status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        // completion is awaited on the queue, so drop the per-call event
+        if (event != NULL) { clReleaseEvent(event); event = NULL; }
+    } // iter loop
+    status = clFinish( queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// axpy performance tests: one parameterized case per element type
+TEST_P(AXPY, saxpy)
+{
+    TestParams params;
+
+    getParams(&params);
+    AxpyPerformanceTest<float>::runInstance(FN_SAXPY, &params);
+}
+
+
+TEST_P(AXPY, daxpy)
+{
+    TestParams params;
+
+    getParams(&params);
+    AxpyPerformanceTest<double>::runInstance(FN_DAXPY, &params);
+}
+
+TEST_P(AXPY, caxpy)
+{
+    TestParams params;
+
+    getParams(&params);
+    AxpyPerformanceTest<FloatComplex>::runInstance(FN_CAXPY, &params);
+}
+
+
+TEST_P(AXPY, zaxpy)
+{
+    TestParams params;
+
+    getParams(&params);
+    AxpyPerformanceTest<DoubleComplex>::runInstance(FN_ZAXPY, &params);
+}
diff --git a/src/tests/performance/perf-copy.cpp b/src/tests/performance/perf-copy.cpp
new file mode 100644
index 0000000..51eb8b1
--- /dev/null
+++ b/src/tests/performance/perf-copy.cpp
@@ -0,0 +1,322 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <copy.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report a performance-run result code through gtest: a negative code
+ * fails the ASSERT, any other non-zero code fails the EXPECT.
+ */
+#define CHECK_RESULT(rc)                                                    \
+do {                                                                        \
+    ASSERT_GE(rc, 0) << "Fatal error: can not allocate resources or "       \
+                        "perform an OpenCL request!" << endl;               \
+    EXPECT_EQ(0, rc) << "The OpenCL version is slower in the case" <<       \
+                        endl;                                               \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the COPY routine, parameterized by the
+ * vector element type.
+ */
+template <typename ElemType> class CopyPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~CopyPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for `fn` with `params`, run it and report the
+     * result through gtest assertions.
+     *
+     * The case is skipped (with a message on stderr) when a double
+     * precision function is requested on a device without double
+     * precision support, or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        CopyPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor = 1;
+        BlasBase *base = clMath::BlasBase::getInstance();
+
+        if (((fn == FN_DCOPY) || (fn == FN_ZCOPY)) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            // Message spelling aligned with the DOT/DOTC performance tests.
+            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    CopyPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType *backY_;       // host copy of Y used to restore the device buffer
+    ElemType *blasX_;       // host vector X
+    ElemType *blasY_;       // host vector Y
+    cl_mem mobjX_;          // device buffer for X
+    cl_mem mobjY_;          // device buffer for Y
+    size_t  lengthX;        // element span of X, excluding the offset
+    size_t  lengthY;        // element span of Y, excluding the offset
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Compute the vector spans and allocate the host buffers.
+ *
+ * On allocation failure every host pointer is left NULL, which
+ * areResourcesSufficient() detects so that the case gets skipped.
+ */
+template <typename ElemType>
+CopyPerformanceTest<ElemType>::CopyPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N)  * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL)
+{
+
+    blasX_ = NULL;
+    blasY_ = NULL;
+    backY_ = NULL;
+    lengthX = 1 + (params->N - 1) * abs(params_.incx);
+    lengthY = 1 + (params->N - 1) * abs(params_.incy);
+
+    try
+    {
+        backY_ = new ElemType[lengthY + params_.offCY];
+        blasX_ = new ElemType[lengthX + params_.offBX];
+        blasY_ = new ElemType[lengthY + params_.offCY];
+    }
+    catch(bad_alloc&) {
+        // A later allocation may have failed after earlier ones succeeded;
+        // free those before dropping the pointers so nothing leaks.
+        // Pointers that were never assigned are still NULL here.
+        delete[] blasY_;
+        delete[] blasX_;
+        delete[] backY_;
+        backY_ = blasX_ = blasY_ = NULL;     // areResourcesSufficient() will handle the rest and return
+        mobjX_= mobjY_ = NULL;
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host vectors and release the OpenCL buffers that were created.
+ */
+template <typename ElemType>
+CopyPerformanceTest<ElemType>::~CopyPerformanceTest()
+{
+    // delete[] accepts a null pointer, so the host arrays need no guard.
+    delete[] blasX_;
+    delete[] blasY_;
+    delete[] backY_;
+
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+}
+/*
+ * Return true when the host buffers were allocated, each vector is
+ * smaller than the maximum allocation size, and both together are
+ * smaller than the available global memory size.
+ */
+template <typename ElemType> bool
+CopyPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    if ((blasX_ == NULL) || (blasY_ == NULL) || (backY_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const size_t globalBytes = (size_t)blasBase->availGlobalMemSize( 0 );
+    const size_t maxAllocBytes = (size_t)blasBase->maxMemAllocSize();
+    const size_t bytesX = (lengthX + params->offBX) * sizeof(ElemType);
+    const size_t bytesY = (lengthY + params->offCY) * sizeof(ElemType);
+
+    if ((bytesX >= maxAllocBytes) || (bytesY >= maxAllocBytes)) {
+        return false;
+    }
+
+    return (bytesX + bytesY) < globalBytes;
+}
+
+/*
+ * Fill the host vectors, back up Y, and create the device buffers.
+ * Returns 0 when both buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+CopyPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t bytesX = (lengthX + params_.offBX) * sizeof(ElemType);
+    const size_t bytesY = (lengthY + params_.offCY) * sizeof(ElemType);
+
+    randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (blasY_ + params_.offCY), params_.incy);
+
+    // Keep the initial Y so clblasPerfSingle() can rewrite the device buffer.
+    memcpy(backY_, blasY_, bytesY);
+
+    mobjX_ = base_->createEnqueueBuffer(blasX_, bytesX, 0, CL_MEM_READ_WRITE);
+    mobjY_ = base_->createEnqueueBuffer(blasY_, bytesY, 0, CL_MEM_READ_WRITE);
+
+    if ((mobjX_ == NULL) || (mobjY_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time one reference (etalon) COPY call on the host vectors.
+ * Returns 0 when PERF_TEST_WITH_ACML is not defined.
+ */
+template <typename ElemType> nano_time_t
+CopyPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::copy(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy);
+    elapsed = getCurrentTime() - start;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS COPY call.
+ *
+ * The device Y buffer is first rewritten from the host backup. With
+ * TIMING defined (it is defined right below), the call is enqueued `iter`
+ * times and the average time per call is returned. Returns NANOTIME_ERR
+ * on any OpenCL/clBLAS failure.
+ */
+template <typename ElemType> nano_time_t
+CopyPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+                                        ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  (lengthY + params_.offCY) * sizeof(ElemType), backY_, 0, NULL, &event);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector Y buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+
+    // The write event is not needed past this point whatever the wait
+    // result; release it instead of dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+
+    if (status != CL_SUCCESS)
+    {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    int iter = 100;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+
+        status = (cl_int)clMath::clblas::copy(type, params_.N, mobjX_, params_.offBX, params_.incx,
+                             mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS COPY function failed, status = " <<
+                    status << endl;
+
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        // Completion is awaited with clFinish() below, so the per-call
+        // event is unused; release it so the iterations do not leak events.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// COPY performance case instantiated for float elements.
+TEST_P(COPY, scopy)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    CopyPerformanceTest<float>::runInstance(FN_SCOPY, &testArgs);
+}
+
+
+// COPY performance case instantiated for double elements.
+TEST_P(COPY, dcopy)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    CopyPerformanceTest<double>::runInstance(FN_DCOPY, &testArgs);
+}
+
+// COPY performance case instantiated for FloatComplex elements.
+TEST_P(COPY, ccopy)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    CopyPerformanceTest<FloatComplex>::runInstance(FN_CCOPY, &testArgs);
+}
+
+
+// COPY performance case instantiated for DoubleComplex elements.
+TEST_P(COPY, zcopy)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    CopyPerformanceTest<DoubleComplex>::runInstance(FN_ZCOPY, &testArgs);
+}
diff --git a/src/tests/performance/perf-dot.cpp b/src/tests/performance/perf-dot.cpp
new file mode 100644
index 0000000..03e95b6
--- /dev/null
+++ b/src/tests/performance/perf-dot.cpp
@@ -0,0 +1,316 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <dot.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report a performance-run result code through gtest: a negative code
+ * fails the ASSERT, any other non-zero code fails the EXPECT.
+ */
+#define CHECK_RESULT(rc)                                                    \
+do {                                                                        \
+    ASSERT_GE(rc, 0) << "Fatal error: can not allocate resources or "       \
+                        "perform an OpenCL request!" << endl;               \
+    EXPECT_EQ(0, rc) << "The OpenCL version is slower in the case" <<       \
+                        endl;                                               \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the DOT routine, parameterized by the
+ * vector element type.
+ */
+template <typename ElemType> class DotPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~DotPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for `fn` with `params`, run it and report the
+     * result through gtest assertions. The case is skipped with a message
+     * on stderr when double precision is required but unsupported by the
+     * device, or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        DotPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *blasBase = clMath::BlasBase::getInstance();
+        int opFactor = 1;
+
+        const bool wantsDouble = (fn == FN_DDOT) || (fn == FN_ZDOTU);
+        if (wantsDouble && !blasBase->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        int ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    DotPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType *blasX_;       // host vector X
+    ElemType *blasY_;       // host vector Y
+    cl_mem mobjX_;          // device buffer for X
+    cl_mem mobjY_;          // device buffer for Y
+    cl_mem mobjDP_;         // device buffer receiving the dot product
+    cl_mem scratchBuff;     // device scratch buffer passed to the DOT call
+    size_t  lengthX;        // element span of X, excluding the offset
+    size_t  lengthY;        // element span of Y, excluding the offset
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Compute the vector spans and allocate the host buffers.
+ *
+ * On allocation failure both host pointers are left NULL, which
+ * areResourcesSufficient() detects so that the case gets skipped.
+ */
+template <typename ElemType>
+DotPerformanceTest<ElemType>::DotPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N)  * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL),mobjDP_(NULL)
+{
+
+    blasX_ = NULL;
+    blasY_ = NULL;
+    mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL;
+    lengthX = 1 + (params->N - 1) * abs(params_.incx);
+    lengthY = 1 + (params->N - 1) * abs(params_.incy);
+
+    try
+    {
+        blasX_ = new ElemType[lengthX + params_.offBX];
+        blasY_ = new ElemType[lengthY + params_.offCY];
+    }
+    catch(bad_alloc&) {
+        // The second allocation may have failed after the first succeeded;
+        // free it before dropping the pointers so nothing leaks.
+        // A pointer that was never assigned is still NULL here.
+        delete[] blasY_;
+        delete[] blasX_;
+        blasX_ = blasY_ = NULL;     // areResourcesSufficient() will handle the rest and return
+        mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL;
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host vectors and release the OpenCL buffers that were created.
+ */
+template <typename ElemType>
+DotPerformanceTest<ElemType>::~DotPerformanceTest()
+{
+    // delete[] accepts a null pointer, so the host arrays need no guard.
+    delete[] blasX_;
+    delete[] blasY_;
+
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+    if (mobjDP_ != NULL) {
+        clReleaseMemObject(mobjDP_);
+    }
+    if (scratchBuff != NULL) {
+        clReleaseMemObject(scratchBuff);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Returns false when the host buffers were not allocated, when any
+ * device buffer would reach the maximum allocation size, or when the
+ * buffers together would reach the available global memory size.
+ * The scratch buffer created by prepare() (lengthY elements) is
+ * included in the check along with X, Y and the result buffer.
+ */
+template <typename ElemType> bool
+DotPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    bool ret;
+    size_t sizeX, sizeY, sizeDP, sizeScratch;
+
+    if((blasX_ == NULL) || (blasY_ == NULL) ) {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+    sizeX = (lengthX + params->offBX) * sizeof(ElemType);
+    sizeY = (lengthY + params->offCY) * sizeof(ElemType);
+    sizeDP = (1 + params->offa) * sizeof(ElemType);
+    sizeScratch = lengthY * sizeof(ElemType);
+
+    ret = ((sizeX < allocSize) && (sizeY < allocSize) &&
+           (sizeDP < allocSize) && (sizeScratch < allocSize));
+    ret = (ret && ((sizeX + sizeY + sizeDP + sizeScratch) < gmemSize));
+
+    return ret;
+}
+
+/*
+ * Fill the host vectors and create the four device buffers.
+ * Returns 0 when every buffer was created, -1 otherwise.
+ */
+template <typename ElemType> int
+DotPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t bytesX = (lengthX + params_.offBX) * sizeof(ElemType);
+    const size_t bytesY = (lengthY + params_.offCY) * sizeof(ElemType);
+    const size_t bytesDP = (1 + params_.offa) * sizeof(ElemType);
+    const size_t bytesScratch = (lengthY) * sizeof(ElemType);
+
+    randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (blasY_ + params_.offCY), params_.incy, true);
+
+    mobjX_ = base_->createEnqueueBuffer(blasX_, bytesX, 0, CL_MEM_READ_WRITE);
+    mobjY_ = base_->createEnqueueBuffer(blasY_, bytesY, 0, CL_MEM_READ_WRITE);
+    mobjDP_ = base_->createEnqueueBuffer(NULL, bytesDP, 0, CL_MEM_READ_WRITE);
+    scratchBuff = base_->createEnqueueBuffer(NULL, bytesScratch, 0, CL_MEM_READ_WRITE);
+
+    if ((mobjX_ == NULL) || (mobjY_ == NULL) || (mobjDP_ == NULL) || (scratchBuff == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time one reference (etalon) DOT call on the host vectors.
+ * Returns 0 when PERF_TEST_WITH_ACML is not defined.
+ */
+template <typename ElemType> nano_time_t
+DotPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::dot(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy);
+    elapsed = getCurrentTime() - start;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS DOT call.
+ *
+ * With TIMING defined (it is defined right below), the call is enqueued
+ * `iter` times and the average time per call is returned. Returns
+ * NANOTIME_ERR on any OpenCL/clBLAS failure.
+ */
+template <typename ElemType> nano_time_t
+DotPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+                                        ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    event = NULL;
+    clFinish( queue);
+    time = getCurrentTime();
+
+#define TIMING
+#ifdef TIMING
+    int iter = 100;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+
+        status = (cl_int)clMath::clblas::dot( type, params_.N, mobjDP_, params_.offa, mobjX_, params_.offBX, params_.incx,
+                             mobjY_, params_.offCY, params_.incy, scratchBuff, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS DOT function failed, status = " <<
+                    status << endl;
+
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        // Completion is awaited with clFinish() below, so the per-call
+        // event is unused; release it so the iterations do not leak events.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// DOT performance case instantiated for float elements.
+TEST_P(DOT, sdot)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    DotPerformanceTest<float>::runInstance(FN_SDOT, &testArgs);
+}
+
+
+// DOT performance case instantiated for double elements.
+TEST_P(DOT, ddot)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    DotPerformanceTest<double>::runInstance(FN_DDOT, &testArgs);
+}
+
+// DOT performance case instantiated for FloatComplex elements.
+TEST_P(DOT, cdotu)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    DotPerformanceTest<FloatComplex>::runInstance(FN_CDOTU, &testArgs);
+}
+
+
+// DOT performance case instantiated for DoubleComplex elements.
+TEST_P(DOT, zdotu)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    DotPerformanceTest<DoubleComplex>::runInstance(FN_ZDOTU, &testArgs);
+}
diff --git a/src/tests/performance/perf-dotc.cpp b/src/tests/performance/perf-dotc.cpp
new file mode 100644
index 0000000..3658534
--- /dev/null
+++ b/src/tests/performance/perf-dotc.cpp
@@ -0,0 +1,300 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <dotc.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report a performance-run result code through gtest: a negative code
+ * fails the ASSERT, any other non-zero code fails the EXPECT.
+ */
+#define CHECK_RESULT(rc)                                                    \
+do {                                                                        \
+    ASSERT_GE(rc, 0) << "Fatal error: can not allocate resources or "       \
+                        "perform an OpenCL request!" << endl;               \
+    EXPECT_EQ(0, rc) << "The OpenCL version is slower in the case" <<       \
+                        endl;                                               \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the DOTC routine, parameterized by the
+ * vector element type.
+ */
+template <typename ElemType> class DotcPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~DotcPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for `fn` with `params`, run it and report the
+     * result through gtest assertions. The case is skipped with a message
+     * on stderr when double precision is required but unsupported by the
+     * device, or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        DotcPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *blasBase = clMath::BlasBase::getInstance();
+        int opFactor = 1;
+
+        const bool wantsDouble = (fn == FN_ZDOTC);
+        if (wantsDouble && !blasBase->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        int ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    DotcPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType *blasX_;       // host vector X
+    ElemType *blasY_;       // host vector Y
+    cl_mem mobjX_;          // device buffer for X
+    cl_mem mobjY_;          // device buffer for Y
+    cl_mem mobjDP_;         // device buffer receiving the dot product
+    cl_mem scratchBuff;     // device scratch buffer passed to the clBLAS call
+    size_t  lengthX;        // element span of X, excluding the offset
+    size_t  lengthY;        // element span of Y, excluding the offset
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Compute the vector spans and allocate the host buffers.
+ *
+ * On allocation failure both host pointers are left NULL, which
+ * areResourcesSufficient() detects so that the case gets skipped.
+ */
+template <typename ElemType>
+DotcPerformanceTest<ElemType>::DotcPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N)  * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL),mobjDP_(NULL)
+{
+
+    blasX_ = NULL;
+    blasY_ = NULL;
+    mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL;
+    lengthX = 1 + (params->N - 1) * abs(params_.incx);
+    lengthY = 1 + (params->N - 1) * abs(params_.incy);
+
+    try
+    {
+        blasX_ = new ElemType[lengthX + params_.offBX];
+        blasY_ = new ElemType[lengthY + params_.offCY];
+    }
+    catch(bad_alloc&) {
+        // The second allocation may have failed after the first succeeded;
+        // free it before dropping the pointers so nothing leaks.
+        // A pointer that was never assigned is still NULL here.
+        delete[] blasY_;
+        delete[] blasX_;
+        blasX_ = blasY_ = NULL;     // areResourcesSufficient() will handle the rest and return
+        mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL;
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host vectors and release the OpenCL buffers that were created.
+ */
+template <typename ElemType>
+DotcPerformanceTest<ElemType>::~DotcPerformanceTest()
+{
+    // delete[] accepts a null pointer, so the host arrays need no guard.
+    delete[] blasX_;
+    delete[] blasY_;
+
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+    if (mobjDP_ != NULL) {
+        clReleaseMemObject(mobjDP_);
+    }
+    if (scratchBuff != NULL) {
+        clReleaseMemObject(scratchBuff);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Returns false when the host buffers were not allocated, when any
+ * device buffer would reach the maximum allocation size, or when the
+ * buffers together would reach the available global memory size.
+ * The scratch buffer created by prepare() (lengthY elements) is
+ * included in the check along with X, Y and the result buffer.
+ */
+template <typename ElemType> bool
+DotcPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    bool ret;
+    size_t sizeX, sizeY, sizeDP, sizeScratch;
+
+    if((blasX_ == NULL) || (blasY_ == NULL) ) {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+    sizeX = (lengthX + params->offBX) * sizeof(ElemType);
+    sizeY = (lengthY + params->offCY) * sizeof(ElemType);
+    sizeDP = (1 + params->offa) * sizeof(ElemType);
+    sizeScratch = lengthY * sizeof(ElemType);
+
+    ret = ((sizeX < allocSize) && (sizeY < allocSize) &&
+           (sizeDP < allocSize) && (sizeScratch < allocSize));
+    ret = (ret && ((sizeX + sizeY + sizeDP + sizeScratch) < gmemSize));
+
+    return ret;
+}
+
+/*
+ * Fill the host vectors and create the four device buffers.
+ * Returns 0 when every buffer was created, -1 otherwise.
+ */
+template <typename ElemType> int
+DotcPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t bytesX = (lengthX + params_.offBX) * sizeof(ElemType);
+    const size_t bytesY = (lengthY + params_.offCY) * sizeof(ElemType);
+    const size_t bytesDP = (1 + params_.offa) * sizeof(ElemType);
+    const size_t bytesScratch = (lengthY) * sizeof(ElemType);
+
+    randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (blasY_ + params_.offCY), params_.incy, true);
+
+    mobjX_ = base_->createEnqueueBuffer(blasX_, bytesX, 0, CL_MEM_READ_WRITE);
+    mobjY_ = base_->createEnqueueBuffer(blasY_, bytesY, 0, CL_MEM_READ_WRITE);
+    mobjDP_ = base_->createEnqueueBuffer(NULL, bytesDP, 0, CL_MEM_READ_WRITE);
+    scratchBuff = base_->createEnqueueBuffer(NULL, bytesScratch, 0, CL_MEM_READ_WRITE);
+
+    if ((mobjX_ == NULL) || (mobjY_ == NULL) || (mobjDP_ == NULL) || (scratchBuff == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time one reference (etalon) DOTC call on the host vectors.
+ * Returns 0 when PERF_TEST_WITH_ACML is not defined.
+ */
+template <typename ElemType> nano_time_t
+DotcPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::dotc(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy);
+    elapsed = getCurrentTime() - start;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS DOTC call.
+ *
+ * With TIMING defined (it is defined right below), the call is enqueued
+ * `iter` times and the average time per call is returned. Returns
+ * NANOTIME_ERR on any OpenCL/clBLAS failure.
+ */
+template <typename ElemType> nano_time_t
+DotcPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+                                        ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    event = NULL;
+    clFinish( queue);
+    time = getCurrentTime();
+
+#define TIMING
+#ifdef TIMING
+    int iter = 100;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+
+        // Measure dotc, the routine the reference path in
+        // etalonPerfSingle() runs, rather than dot.
+        // NOTE(review): assumes clBLAS-wrapper.h declares clblas::dotc
+        // with the same parameter list as clblas::dot - confirm.
+        status = (cl_int)clMath::clblas::dotc( type, params_.N, mobjDP_, params_.offa, mobjX_, params_.offBX, params_.incx,
+                             mobjY_, params_.offCY, params_.incy, scratchBuff, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS DOTC function failed, status = " <<
+                    status << endl;
+
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        // Completion is awaited with clFinish() below, so the per-call
+        // event is unused; release it so the iterations do not leak events.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// DOTC performance case instantiated for FloatComplex elements.
+TEST_P(DOTC, cdotc)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    DotcPerformanceTest<FloatComplex>::runInstance(FN_CDOTC, &testArgs);
+}
+
+
+// DOTC performance case instantiated for DoubleComplex elements.
+TEST_P(DOTC, zdotc)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    DotcPerformanceTest<DoubleComplex>::runInstance(FN_ZDOTC, &testArgs);
+}
+
diff --git a/src/tests/performance/perf-gbmv.cpp b/src/tests/performance/perf-gbmv.cpp
new file mode 100644
index 0000000..7066fd5
--- /dev/null
+++ b/src/tests/performance/perf-gbmv.cpp
@@ -0,0 +1,353 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Gbmv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <gbmv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Check the integer status 'ret' of a performance run:
+ *   - ret < 0  fails ASSERT_GE with the "can not allocate resources or
+ *     perform an OpenCL request" message;
+ *   - ret != 0 fails EXPECT_EQ with the "OpenCL version is slower" message.
+ * 'ret' is expanded twice, so pass a plain variable rather than an
+ * expression with side effects.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the GBMV routines.
+ *
+ * Owns host copies of the matrix A and of the vectors X and Y, a backup
+ * copy of Y (backY_) and the OpenCL buffer objects created from them in
+ * prepare(). The constructor is private: instances are created and driven
+ * only through runInstance().
+ */
+template <typename ElemType> class GbmvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~GbmvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for function 'fn' with 'params', run it and report
+     * the outcome through gtest assertions.
+     *
+     * FN_DGBMV and FN_ZGBMV are skipped (with a warning on stderr) when the
+     * device reports no double precision support. When the resource check
+     * fails the run is skipped as well and 'ret' stays 0, so both
+     * assertions below pass.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        GbmvPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        const bool needsDouble = (fn == FN_DGBMV || fn == FN_ZGBMV);
+        const int opFactor = 1;
+        int ret = 0;
+
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    GbmvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;             // private copy of the test parameters
+    ElemType alpha;                 // multiplier converted from params_.alpha
+    ElemType beta;                  // multiplier converted from params_.beta
+    ElemType *A_;                   // host matrix A
+    ElemType *X_;                   // host vector X
+    ElemType *Y_;                   // host vector Y (used by the reference run)
+    ElemType *backY_;               // backup of the initial Y contents
+    cl_mem mobjA_;                  // OpenCL buffer created from A_
+    cl_mem mobjX_;                  // OpenCL buffer created from X_
+    cl_mem mobjY_;                  // OpenCL buffer created from backY_
+    ::clMath::BlasBase *base_;      // BlasBase singleton
+};
+
+/*
+ * Construct a GBMV performance case.
+ *
+ * The problem size handed to PerformanceTest is a byte count derived from
+ * the band of A plus the X and Y accesses (see the inline notes below).
+ * Host buffers are sized from the leading dimension, the vector strides
+ * and the offsets. The strides may be negative, so their absolute value is
+ * used, exactly as prepare() does when it sizes the device buffers.
+ */
+template <typename ElemType>
+GbmvPerformanceTest<ElemType>::GbmvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,
+    (problem_size_t)( ( (((params->order == clblasColumnMajor)? params->N : params->M) * (params->KL + params->KU + 1)   // A-access
+                          - (params->KL*(params->KL+1) + params->KU*(params->KU+1)) )       // Subtract hole-part for A & X
+                        +((params->transA == clblasNoTrans)? ((params->KL + params->KU + 1) * params->M + 2*params->M)   // X & Y access
+                                                              : ((params->KL + params->KU + 1) * params->N + 2*params->N) ) // X & Y for Trans case
+                                                                                                              ) * sizeof(ElemType) ) ),
+                          params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    size_t lenA, lenX, lenY;
+
+    lenA = ((params_.order == clblasColumnMajor)? params_.N : params_.M) * (params_.lda) + params_.offA;
+    // abs(): a negative stride must not wrap the unsigned length computation.
+    lenX = (((params_.transA == clblasNoTrans)? params_.N : params_.M) - 1) * abs(params_.incx) + 1 + params_.offBX;
+    lenY = (((params_.transA == clblasNoTrans)? params_.M : params_.N) - 1) * abs(params_.incy) + 1 + params_.offCY;
+
+    A_ = new ElemType[ lenA ];
+    X_ = new ElemType[ lenX ];
+    Y_ = new ElemType[ lenY ];
+    backY_ = new ElemType[ lenY ];
+
+    alpha = convertMultiplier<ElemType>(params_.alpha);
+    beta  = convertMultiplier<ElemType>(params_.beta);
+
+    base_ = ::clMath::BlasBase::getInstance();
+
+    // mobjA_/mobjX_/mobjY_ are already NULL from the initializer list;
+    // prepare() creates the actual buffers.
+}
+
+/*
+ * Free the host buffers and release any OpenCL buffer objects that were
+ * created by prepare().
+ */
+template <typename ElemType>
+GbmvPerformanceTest<ElemType>::~GbmvPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so no guards are needed here.
+    delete[] A_;
+    delete[] X_;
+    delete[] backY_;
+    delete[] Y_;
+
+    // The mem objects stay NULL when prepare() was never reached.
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Returns true when the matrix A (in bytes) is smaller than the maximum
+ * single allocation size and A, X and Y together are smaller than the
+ * available global memory. Returns false as well when any host buffer is
+ * missing. Vector lengths use the absolute value of the strides so that
+ * negative increments do not wrap the unsigned arithmetic; this matches
+ * the sizes prepare() uses for the device buffers.
+ */
+template <typename ElemType> bool
+GbmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t m = params->M, n = params->N, lda = params->lda;
+    size_t lenA = (((params->order == clblasColumnMajor)? n : m) * lda  + params->offA)* sizeof(ElemType);
+    size_t lenX = ((((params->transA == clblasNoTrans)? n : m) - 1) * abs(params->incx) + 1 + params->offBX) * sizeof(ElemType);
+    size_t lenY = ((((params->transA == clblasNoTrans)? m : n) - 1) * abs(params->incy) + 1 + params->offCY) * sizeof(ElemType);
+
+    if ((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    return (lenA < allocSize) && ((lenA + lenX + lenY) < gmemSize);
+}
+
+/*
+ * Fill A, X and Y with random data, keep a backup of Y in backY_ and
+ * create the three OpenCL buffers from the host data.
+ *
+ * Returns 0 when all three buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+GbmvPerformanceTest<ElemType>::prepare(void)
+{
+    const bool noTrans = (params_.transA == clblasNoTrans);
+    size_t lenA, lenX, lenY;
+
+    // Element counts, including the leading offsets.
+    lenA = ((params_.order == clblasColumnMajor)? params_.N : params_.M) * params_.lda + params_.offA;
+    lenX = ((noTrans ? params_.N : params_.M) - 1)*abs(params_.incx) + 1 + params_.offBX;
+    lenY = ((noTrans ? params_.M : params_.N) - 1)*abs(params_.incy) + 1 + params_.offCY;
+
+    randomGbmvMatrices(params_.order, params_.transA, params_.M, params_.N, &alpha, &beta,
+                        (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx, (Y_+params_.offCY), params_.incy );
+
+    // backY_ keeps the initial Y so every timed run starts from the same data.
+    memcpy(backY_, Y_, lenY * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(backY_, lenY * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL) || (mobjY_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time one run of the reference (host) GBMV implementation.
+ *
+ * Y_ is first restored from backY_. A row-major problem is mapped onto the
+ * column-major reference call by swapping M/N and KL/KU and toggling the
+ * transposition flag. The reference call is compiled in only when
+ * PERF_TEST_WITH_ACML is defined; otherwise 0 is returned.
+ */
+template <typename ElemType> nano_time_t
+GbmvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder fOrder;
+    clblasTranspose fTrans;
+    size_t lda, lenY, lenA;
+    size_t fM = params_.M, fN = params_.N, fKL = params_.KL, fKU = params_.KU;
+
+    lenA = ((params_.order == clblasColumnMajor)? params_.N : params_.M) * params_.lda;
+    // abs(): the stride may be negative; same length as used by prepare().
+    lenY = (((params_.transA == clblasNoTrans)? params_.M : params_.N) - 1) * abs(params_.incy) + 1 + params_.offCY;
+
+    memcpy(Y_, backY_, lenY * sizeof(ElemType));
+    fOrder = params_.order;
+    fTrans = params_.transA;
+    lda = params_.lda;
+
+    if (fOrder != clblasColumnMajor)
+    {
+        fOrder = clblasColumnMajor;
+        fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+        fM = params_.N;
+        fN = params_.M;
+        fKL = params_.KU;
+        fKU = params_.KL;
+
+        // The matrix data starts at offA (that is where prepare() generated
+        // it and where the gbmv call below reads it), so the conjugation
+        // must start there too.
+        // NOTE(review): doConjugate appears to modify A_ in place; if this
+        // method runs more than once per instance the conjugation would be
+        // applied repeatedly -- confirm against PerformanceTest::run().
+        if (params_.transA == clblasConjTrans)
+            doConjugate( (A_+params_.offA), 1, lenA, lda );
+    }
+
+#ifdef PERF_TEST_WITH_ACML
+
+    time = getCurrentTime();
+    clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A_, params_.offA, lda,
+                            X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Time the clBLAS GBMV implementation.
+ *
+ * The device Y buffer is first restored from backY_ with a blocking write.
+ * The gbmv call is then enqueued 'iter' times, the queue is drained with
+ * clFinish() and the elapsed wall-clock time divided by 'iter' is
+ * returned. NANOTIME_ERR is returned when any OpenCL or clBLAS call fails.
+ *
+ * Every cl_event handed back by OpenCL/clBLAS is released; the events of
+ * the timed loop are released after the time stamp is taken so that the
+ * cleanup does not count towards the measurement.
+ */
+template <typename ElemType> nano_time_t
+GbmvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    const int iter = 20;
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_event events[iter] = { NULL };
+    cl_int status, finishStatus;
+    size_t lenY;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    // abs(): the stride may be negative; same length as used by prepare().
+    lenY = (((params_.transA == clblasNoTrans)? params_.M : params_.N) - 1) * abs(params_.incy) + 1 + params_.offCY;
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  lenY * sizeof(ElemType), backY_, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "Vector Y buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);      // the write event is not needed any more
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    for (int i = 0; i < iter; i++)
+    {
+        status = clMath::clblas::gbmv(params_.order, params_.transA, params_.M, params_.N, params_.KL, params_.KU,
+                                        alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
+                                        beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &events[i]);
+        if (status != CL_SUCCESS) {
+            break;
+        }
+    }
+    finishStatus = clFinish( queue );
+    time = getCurrentTime() - time;
+
+    // Entries never filled in are still NULL from the array initializer.
+    for (int i = 0; i < iter; i++) {
+        if (events[i] != NULL) {
+            clReleaseEvent(events[i]);
+        }
+    }
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS GBMV function failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+    if (finishStatus != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << finishStatus << endl;
+        return NANOTIME_ERR;
+    }
+
+    time /= iter;
+
+    return time;
+}
+
+} // namespace clMath
+
+// sgbmv performance test case (float elements)
+TEST_P(GBMV, sgbmv)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GbmvPerformanceTest<float>::runInstance(FN_SGBMV, &testParams);
+}
+
+// dgbmv performance test case (double elements)
+TEST_P(GBMV, dgbmv)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GbmvPerformanceTest<double>::runInstance(FN_DGBMV, &testParams);
+}
+
+// cgbmv performance test case (FloatComplex elements)
+TEST_P(GBMV, cgbmv)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GbmvPerformanceTest<FloatComplex>::runInstance(FN_CGBMV, &testParams);
+}
+
+// zgbmv performance test case (DoubleComplex elements)
+TEST_P(GBMV, zgbmv)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GbmvPerformanceTest<DoubleComplex>::runInstance(FN_ZGBMV, &testParams);
+}
diff --git a/src/tests/performance/perf-gemm.cpp b/src/tests/performance/perf-gemm.cpp
new file mode 100644
index 0000000..1a7d3e6
--- /dev/null
+++ b/src/tests/performance/perf-gemm.cpp
@@ -0,0 +1,368 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Gemm performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <gemm.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Check the integer status 'ret' of a performance run:
+ *   - ret < 0  fails ASSERT_GE with the "can not allocate resources or
+ *     perform an OpenCL request" message;
+ *   - ret != 0 fails EXPECT_EQ with the "OpenCL version is slower" message.
+ * 'ret' is expanded twice, so pass a plain variable rather than an
+ * expression with side effects.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the GEMM routines.
+ *
+ * Owns host copies of the matrices A, B and C, a second C-sized buffer
+ * (backC_) and the OpenCL buffer objects created in prepare(). The
+ * constructor is private: instances are created and driven only through
+ * runInstance().
+ */
+template <typename ElemType> class GemmPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~GemmPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for function 'fn' with 'params', run it and report
+     * the outcome through gtest assertions.
+     *
+     * The operation factor passed to run() is 2 for FN_SGEMM/FN_DGEMM and
+     * 8 for every other function. FN_DGEMM and FN_ZGEMM are skipped (with
+     * a warning on stderr) when the device reports no double precision
+     * support. When the resource check fails the run is skipped as well
+     * and 'ret' stays 0, so both assertions below pass.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        GemmPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        const bool isRealFn = (fn == FN_SGEMM || fn == FN_DGEMM);
+        const bool needsDouble = (fn == FN_DGEMM || fn == FN_ZGEMM);
+        const int opFactor = isRealFn ? 2 : 8;
+        int ret = 0;
+
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    GemmPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;             // private copy of the test parameters
+    ElemType alpha_;                // set in prepare() when alpha is in use
+    ElemType beta_;                 // set in prepare() when beta is in use
+    ElemType *A_;                   // host matrix A
+    ElemType *B_;                   // host matrix B
+    ElemType *C_;                   // host matrix C
+    ElemType *backC_;               // C-sized buffer uploaded to mobjC_
+    cl_mem mobjA_;                  // OpenCL buffer created from A_
+    cl_mem mobjB_;                  // OpenCL buffer created from B_
+    cl_mem mobjC_;                  // OpenCL buffer created from backC_
+    ::clMath::BlasBase *base_;      // BlasBase singleton
+};
+
+/*
+ * Construct a GEMM performance case: the problem size handed to
+ * PerformanceTest is M * N * K, and the four host matrices are allocated
+ * from the row/column counts stored in the parameters.
+ *
+ * Members are initialized in declaration order, so params_ is already a
+ * valid copy when the allocations read it. alpha_ and beta_ are left for
+ * prepare() to set.
+ */
+template <typename ElemType>
+GemmPerformanceTest<ElemType>::GemmPerformanceTest(
+    BlasFunction fn,
+    TestParams *params)
+    : PerformanceTest(fn, (problem_size_t)params->M * params->N * params->K),
+      params_(*params),
+      A_(new ElemType[params_.rowsA * params_.columnsA]),
+      B_(new ElemType[params_.rowsB * params_.columnsB]),
+      C_(new ElemType[params_.rowsC * params_.columnsC]),
+      backC_(new ElemType[params_.rowsC * params_.columnsC]),
+      mobjA_(NULL), mobjB_(NULL), mobjC_(NULL),
+      base_(::clMath::BlasBase::getInstance())
+{
+}
+
+/*
+ * Free the host matrices and release the OpenCL buffer objects.
+ *
+ * The buffer objects are created only in prepare(); they are still NULL
+ * when the case is destroyed without being run (for example when the test
+ * is skipped) or when a buffer creation failed. Only valid handles are
+ * passed to clReleaseMemObject().
+ */
+template <typename ElemType>
+GemmPerformanceTest<ElemType>::~GemmPerformanceTest()
+{
+    delete[] A_;
+    delete[] B_;
+    delete[] C_;
+    delete[] backC_;
+
+    if (mobjC_ != NULL) {
+        clReleaseMemObject(mobjC_);
+    }
+    if (mobjB_ != NULL) {
+        clReleaseMemObject(mobjB_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * The per-matrix budget is a fifth of the available global memory when
+ * images are in use and a third otherwise, capped by the maximum
+ * allocation size. With images, K must additionally be smaller than the
+ * scratch image width expressed in elements.
+ */
+template <typename ElemType> bool
+GemmPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    const size_t m = params->M, n = params->N, k = params->K;
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+    size_t maxMatrSize;
+
+    if (base->useImages()) {
+        maxMatrSize = gmemSize / 5;
+        if (!(k < base->scratchImageWidth() *
+                  sizeof(cl_float4) / sizeof(ElemType))) {
+            return false;
+        }
+    }
+    else {
+        maxMatrSize = gmemSize / 3;
+    }
+    maxMatrSize = std::min(maxMatrSize, allocSize);
+
+    // The larger of A and B, and C, must each fit into the budget.
+    if (!(std::max(m, n) * k * sizeof(ElemType) < maxMatrSize)) {
+        return false;
+    }
+    return (m * n * sizeof(ElemType) < maxMatrSize);
+}
+
+/*
+ * Generate random input matrices and create the OpenCL buffers.
+ *
+ * alpha_/beta_ are converted from the test parameters when the base
+ * reports that they are in use. After the random fill, C_ is copied to
+ * backC_: backC_ is what gets uploaded to the device here and in
+ * clblasPerfSingle(), and what etalonPerfSingle() copies back into C_, so
+ * it must hold the generated data rather than uninitialized memory.
+ *
+ * The buffers are created one after another; creation stops at the first
+ * failure. Returns 0 when the last buffer (C) was created, -1 otherwise.
+ */
+template <typename ElemType> int
+GemmPerformanceTest<ElemType>::prepare(void)
+{
+    bool useAlpha = base_->useAlpha();
+    bool useBeta = base_->useBeta();
+
+    if (useAlpha) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+    if (useBeta) {
+        beta_ = convertMultiplier<ElemType>(params_.beta);
+    }
+
+    randomGemmMatrices<ElemType>(params_.order, params_.transA, params_.transB,
+                                 params_.M, params_.N, params_.K, useAlpha,
+                                 &alpha_, A_, params_.lda, B_, params_.ldb,
+                                 useBeta, &beta_, C_, params_.ldc);
+
+    // Keep a pristine copy of C; both buffers hold rowsC * columnsC elements.
+    memcpy(backC_, C_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA *
+                                        sizeof(ElemType),
+                                        params_.offA * sizeof(ElemType),
+                                        CL_MEM_READ_ONLY);
+    if (mobjA_) {
+        mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB *
+                                            sizeof(ElemType),
+                                            params_.offBX * sizeof(ElemType),
+                                            CL_MEM_READ_ONLY);
+    }
+    if (mobjB_) {
+        mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC *
+                                            sizeof(ElemType),
+                                            params_.offCY * sizeof(ElemType),
+                                            CL_MEM_READ_WRITE);
+    }
+
+    return (mobjC_) ? 0 : -1;
+}
+
+/*
+ * Time one run of the reference (host) GEMM implementation.
+ *
+ * Unless PERF_TEST_WITH_ROW_MAJOR is defined, row-major problems are
+ * rejected with NANOTIME_ERR. C_ is restored from backC_ before the run.
+ * The reference call is compiled in only when PERF_TEST_WITH_ACML is
+ * defined; otherwise 0 is returned.
+ */
+template <typename ElemType> nano_time_t
+GemmPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+    size_t lda, ldb, ldc;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    order = params_.order;
+    lda = params_.lda;
+    ldb = params_.ldb;
+    ldc = params_.ldc;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    // A row-major problem is passed on as column-major with the leading
+    // dimensions recomputed from M, N and K.
+    if (order == clblasRowMajor) {
+        order = clblasColumnMajor;
+        lda = (params_.transA == clblasNoTrans) ? params_.M : params_.K;
+        ldb = (params_.transB == clblasNoTrans) ? params_.K : params_.N;
+        ldc = params_.M;
+    }
+
+    time = getCurrentTime();
+    clMath::blas::gemm(order, params_.transA, params_.transB,
+                    params_.M, params_.N, params_.K,
+                    alpha_, A_, lda, B_, ldb, beta_, C_, ldc);
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Time one run of the clBLAS GEMM implementation.
+ *
+ * The device C buffer is first restored from backC_ with a blocking
+ * write. The gemm call is then enqueued and flushed, and the time spent
+ * waiting for the queue to finish is returned. NANOTIME_ERR is returned
+ * when any OpenCL or clBLAS call fails.
+ *
+ * The event returned by the buffer write is released once it has been
+ * waited on; previously its handle was overwritten and leaked.
+ */
+template <typename ElemType> nano_time_t
+GemmPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  params_.rowsC * params_.columnsC *
+                                  sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix C buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);      // the write event is not needed any more
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    event = NULL;
+    status = (cl_int)clMath::clblas::gemm(params_.order,
+        params_.transA, params_.transB, params_.M, params_.N, params_.K, alpha_,
+        mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb,
+        beta_, mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS GEMM function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    // NOTE(review): the gemm event is handed to waitForSuccessfulFinish();
+    // whether that helper releases it is not visible here -- confirm.
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+
+    return time;
+}
+
+} // namespace clMath
+
+// sgemm performance test case (float elements)
+TEST_P(GEMM, sgemm)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GemmPerformanceTest<float>::runInstance(FN_SGEMM, &testParams);
+}
+
+// dgemm performance test case (double elements)
+TEST_P(GEMM, dgemm)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GemmPerformanceTest<double>::runInstance(FN_DGEMM, &testParams);
+}
+
+// cgemm performance test case (FloatComplex elements)
+TEST_P(GEMM, cgemm)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GemmPerformanceTest<FloatComplex>::runInstance(FN_CGEMM, &testParams);
+}
+
+// zgemm performance test case (DoubleComplex elements)
+TEST_P(GEMM, zgemm)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GemmPerformanceTest<DoubleComplex>::runInstance(FN_ZGEMM, &testParams);
+}
diff --git a/src/tests/performance/perf-gemm2.cpp b/src/tests/performance/perf-gemm2.cpp
new file mode 100644
index 0000000..e38c1a9
--- /dev/null
+++ b/src/tests/performance/perf-gemm2.cpp
@@ -0,0 +1,397 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Gemm performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <gemm-2.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+//#define SHUNT_ACML_RUN
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Check the integer status 'ret' of a performance run:
+ *   - ret < 0  fails ASSERT_GE with the "can not allocate resources or
+ *     perform an OpenCL request" message;
+ *   - ret != 0 fails EXPECT_EQ with the "OpenCL version is slower" message.
+ * 'ret' is expanded twice, so pass a plain variable rather than an
+ * expression with side effects.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the GEMM2 routines (the FN_*GEMM_2 functions).
+ *
+ * Owns host copies of the matrices A, B and C, a second C-sized buffer
+ * (backC_) and the OpenCL buffer objects created in prepare(). The
+ * constructor is private: instances are created and driven only through
+ * runInstance().
+ */
+template <typename ElemType> class GemmPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~GemmPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for function 'fn' with 'params', run it and report
+     * the outcome through gtest assertions.
+     *
+     * The operation factor passed to run() is 2 for FN_SGEMM_2/FN_DGEMM_2
+     * and 8 for every other function. FN_DGEMM_2 and FN_ZGEMM_2 are
+     * skipped (with a warning on stderr) when the device reports no double
+     * precision support. When the resource check fails the run is skipped
+     * as well and 'ret' stays 0, so both assertions below pass.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        GemmPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        const bool isRealFn = (fn == FN_SGEMM_2 || fn == FN_DGEMM_2);
+        const bool needsDouble = (fn == FN_DGEMM_2 || fn == FN_ZGEMM_2);
+        const int opFactor = isRealFn ? 2 : 8;
+        int ret = 0;
+
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    GemmPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;             // private copy of the test parameters
+    ElemType alpha_;                // set in prepare() when alpha is in use
+    ElemType beta_;                 // set in prepare() when beta is in use
+    ElemType *A_;                   // host matrix A
+    ElemType *B_;                   // host matrix B
+    ElemType *C_;                   // host matrix C
+    ElemType *backC_;               // C-sized buffer uploaded to mobjC_
+    cl_mem mobjA_;                  // OpenCL buffer created from A_
+    cl_mem mobjB_;                  // OpenCL buffer created from B_
+    cl_mem mobjC_;                  // OpenCL buffer created from backC_
+    ::clMath::BlasBase *base_;      // BlasBase singleton
+};
+
+/*
+ * Construct a GEMM2 performance case: the problem size handed to
+ * PerformanceTest is M * N * K, and the four host matrices are allocated
+ * from the row/column counts stored in the parameters.
+ *
+ * Members are initialized in declaration order, so params_ is already a
+ * valid copy when the allocations read it. alpha_ and beta_ are left for
+ * prepare() to set.
+ */
+template <typename ElemType>
+GemmPerformanceTest<ElemType>::GemmPerformanceTest(
+    BlasFunction fn,
+    TestParams *params)
+    : PerformanceTest(fn, (problem_size_t)params->M * params->N * params->K),
+      params_(*params),
+      A_(new ElemType[params_.rowsA * params_.columnsA]),
+      B_(new ElemType[params_.rowsB * params_.columnsB]),
+      C_(new ElemType[params_.rowsC * params_.columnsC]),
+      backC_(new ElemType[params_.rowsC * params_.columnsC]),
+      mobjA_(NULL), mobjB_(NULL), mobjC_(NULL),
+      base_(::clMath::BlasBase::getInstance())
+{
+}
+
+/*
+ * Free the host matrices and release any OpenCL buffer objects that were
+ * created by prepare().
+ */
+template <typename ElemType>
+GemmPerformanceTest<ElemType>::~GemmPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so no guards are needed here.
+    delete[] A_;
+    delete[] B_;
+    delete[] C_;
+    delete[] backC_;
+
+    // The mem objects stay NULL when prepare() was never reached or failed.
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+    if (mobjB_ != NULL) {
+        clReleaseMemObject(mobjB_);
+    }
+    if (mobjC_ != NULL) {
+        clReleaseMemObject(mobjC_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Returns false when any host matrix is missing. Otherwise the per-matrix
+ * budget is a fifth of the available global memory when images are in use
+ * and a third otherwise, capped by the maximum allocation size. With
+ * images, K must additionally be smaller than the scratch image width
+ * expressed in elements.
+ */
+template <typename ElemType> bool
+GemmPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    const size_t m = params->M, n = params->N, k = params->K;
+
+    if ((A_ == NULL) || (backC_ == NULL) || (C_ == NULL) || (B_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+    size_t maxMatrSize;
+
+    if (base->useImages()) {
+        maxMatrSize = gmemSize / 5;
+        if (!(k < base->scratchImageWidth() *
+                  sizeof(cl_float4) / sizeof(ElemType))) {
+            return false;
+        }
+    }
+    else {
+        maxMatrSize = gmemSize / 3;
+    }
+    maxMatrSize = std::min(maxMatrSize, allocSize);
+
+    // The larger of A and B, and C, must each fit into the budget.
+    if (!(std::max(m, n) * k * sizeof(ElemType) < maxMatrSize)) {
+        return false;
+    }
+    return (m * n * sizeof(ElemType) < maxMatrSize);
+}
+
+/*
+ * Generate random input matrices and create the OpenCL buffers.
+ *
+ * alpha_/beta_ are converted from the test parameters when the base
+ * reports that they are in use. After the random fill, C_ is copied to
+ * backC_: backC_ is what gets uploaded to the device here and in
+ * clblasPerfSingle(), and what etalonPerfSingle() copies back into C_, so
+ * it must hold the generated data rather than uninitialized memory.
+ *
+ * The buffers are created one after another; creation stops at the first
+ * failure. Returns 0 when the last buffer (C) was created, -1 otherwise.
+ */
+template <typename ElemType> int
+GemmPerformanceTest<ElemType>::prepare(void)
+{
+    bool useAlpha = base_->useAlpha();
+    bool useBeta = base_->useBeta();
+
+    if (useAlpha) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+    if (useBeta) {
+        beta_ = convertMultiplier<ElemType>(params_.beta);
+    }
+
+    randomGemmMatrices<ElemType>(params_.order, params_.transA, params_.transB,
+                                 params_.M, params_.N, params_.K, useAlpha,
+                                 &alpha_, A_, params_.lda, B_, params_.ldb,
+                                 useBeta, &beta_, C_, params_.ldc);
+
+    // Keep a pristine copy of C; both buffers hold rowsC * columnsC elements.
+    memcpy(backC_, C_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA *
+                                        sizeof(ElemType),
+                                        params_.offA * sizeof(ElemType),
+                                        CL_MEM_READ_ONLY);
+    if (mobjA_) {
+        mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB *
+                                            sizeof(ElemType),
+                                            params_.offBX * sizeof(ElemType),
+                                            CL_MEM_READ_ONLY);
+    }
+    if (mobjB_) {
+        mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC *
+                                            sizeof(ElemType),
+                                            params_.offCY * sizeof(ElemType),
+                                            CL_MEM_READ_WRITE);
+    }
+
+    return (mobjC_) ? 0 : -1;
+}
+
+/*
+ * Time one run of the reference (host) GEMM implementation.
+ *
+ * Unless PERF_TEST_WITH_ROW_MAJOR is defined, row-major problems are
+ * rejected with NANOTIME_ERR. C_ is restored from backC_ before the run.
+ * The timing code is compiled in only when PERF_TEST_WITH_ACML is defined
+ * (otherwise 0 is returned); defining SHUNT_ACML_RUN additionally removes
+ * the reference call itself from the timed section.
+ */
+template <typename ElemType> nano_time_t
+GemmPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+    size_t lda, ldb, ldc;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    order = params_.order;
+    lda = params_.lda;
+    ldb = params_.ldb;
+    ldc = params_.ldc;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    // A row-major problem is passed on as column-major with the leading
+    // dimensions recomputed from M, N and K.
+    if (order == clblasRowMajor) {
+        order = clblasColumnMajor;
+        lda = (params_.transA == clblasNoTrans) ? params_.M : params_.K;
+        ldb = (params_.transB == clblasNoTrans) ? params_.K : params_.N;
+        ldc = params_.M;
+    }
+
+    time = getCurrentTime();
+    #ifndef SHUNT_ACML_RUN
+    clMath::blas::gemm(order, params_.transA, params_.transB,
+                    params_.M, params_.N, params_.K,
+                    alpha_, A_, lda, B_, ldb, beta_, C_, ldc);
+    #endif
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Time one run of the clBLAS GEMM2 implementation.
+ *
+ * The device C buffer is first restored from backC_ with a blocking
+ * write. The gemm2 call is then enqueued and flushed, and the time spent
+ * waiting for the queue to finish is returned. NANOTIME_ERR is returned
+ * when any OpenCL or clBLAS call fails.
+ *
+ * The event returned by the buffer write is released once it has been
+ * waited on (its handle used to be overwritten and leaked), and the gemm
+ * event handle starts out as NULL instead of an indeterminate value.
+ */
+template <typename ElemType> nano_time_t
+GemmPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_event gemmevent = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  params_.rowsC * params_.columnsC *
+                                  sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix C buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);      // the write event is not needed any more
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = (cl_int)clMath::clblas::gemm2(params_.order,
+        params_.transA, params_.transB, params_.M, params_.N, params_.K, alpha_,
+        mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb,
+        beta_, mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &gemmevent);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS GEMM function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    // NOTE(review): the gemm event is handed to waitForSuccessfulFinish();
+    // whether that helper releases it is not visible here -- confirm.
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &gemmevent);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+
+    return time;
+}
+
+} // namespace clMath
+
+// sgemm (GEMM2 variant) performance test case, float elements
+TEST_P(GEMM2, sgemm)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GemmPerformanceTest<float>::runInstance(FN_SGEMM_2, &testParams);
+}
+
+// dgemm (GEMM2 variant) performance test case, double elements
+TEST_P(GEMM2, dgemm)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GemmPerformanceTest<double>::runInstance(FN_DGEMM_2, &testParams);
+}
+
+// cgemm (GEMM2 variant) performance test case, FloatComplex elements
+TEST_P(GEMM2, cgemm)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GemmPerformanceTest<FloatComplex>::runInstance(FN_CGEMM_2, &testParams);
+}
+
+// zgemm (GEMM2 variant) performance test case, DoubleComplex elements
+TEST_P(GEMM2, zgemm)
+{
+    TestParams testParams;
+
+    // Fetch the parameter set of the current test instance and run it.
+    getParams(&testParams);
+    GemmPerformanceTest<DoubleComplex>::runInstance(FN_ZGEMM_2, &testParams);
+}
diff --git a/src/tests/performance/perf-gemv.cpp b/src/tests/performance/perf-gemv.cpp
new file mode 100644
index 0000000..4926d61
--- /dev/null
+++ b/src/tests/performance/perf-gemv.cpp
@@ -0,0 +1,344 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Gemv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <gemv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Check the result code of a performance run: a negative value fails the
+ * enclosing test immediately (ASSERT_GE), any other non-zero value is
+ * recorded as a failure without aborting it (EXPECT_EQ).
+ * The do { } while (0) wrapper lets the macro be used as one statement.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the GEMV functions, parameterized on the
+ * element type. Instances are created and run through runInstance().
+ */
+template <typename ElemType> class GemvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~GemvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for function 'fn' with the given parameters and
+     * run it. The case is skipped when 'fn' is FN_DGEMV / FN_ZGEMV and the
+     * device has no double precision support, or when the resource check
+     * fails. A negative result of run() fails the test fatally, any other
+     * non-zero result is reported as a non-fatal failure.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        GemvPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        // Stays 0 for a function id this switch does not know, so run()
+        // is never handed an indeterminate value.
+        int opFactor = 0;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        switch (fn) {
+        case FN_SGEMV:
+            opFactor = sizeof(cl_float);
+            break;
+        case FN_DGEMV:
+            opFactor = sizeof(cl_double);
+            break;      // was missing: control fell through into FN_CGEMV
+        case FN_CGEMV:
+            opFactor = sizeof(FloatComplex);
+            break;
+        case FN_ZGEMV:
+            opFactor = sizeof(DoubleComplex);
+            break;
+        default:
+            break;
+        }
+
+        if ((fn == FN_DGEMV || fn == FN_ZGEMV) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            // ret keeps its initial 0, so the checks below pass
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    GemvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;
+    ElemType beta_;
+    ElemType *A_;               // host arrays allocated in the constructor
+    ElemType *B_;
+    ElemType *C_;
+    ElemType *backC_;           // source of the data written to mobjC_
+    cl_mem mobjA_;              // OpenCL buffers created in prepare()
+    cl_mem mobjB_;
+    cl_mem mobjC_;
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Copy the test parameters, leave the OpenCL buffer handles empty and
+ * allocate the host arrays. The problem size reported to the base class
+ * is M * N.
+ */
+template <typename ElemType>
+GemvPerformanceTest<ElemType>::GemvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params)
+    : PerformanceTest(fn, (problem_size_t)params->M * params->N),
+      params_(*params),
+      mobjA_(NULL),
+      mobjB_(NULL),
+      mobjC_(NULL)
+{
+    // Operand storage on the host, sized from the rows/columns parameters
+    A_ = new ElemType[params_.rowsA * params_.columnsA];
+    B_ = new ElemType[params_.rowsB * params_.columnsB];
+
+    // C_ and backC_ have the same extent
+    C_ = new ElemType[params_.rowsC * params_.columnsC];
+    backC_ = new ElemType[params_.rowsC * params_.columnsC];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host arrays and release the OpenCL buffers.
+ */
+template <typename ElemType>
+GemvPerformanceTest<ElemType>::~GemvPerformanceTest()
+{
+    delete[] A_;
+    delete[] B_;
+    delete[] C_;
+    delete[] backC_;
+
+    // The handles are still NULL when prepare() was never called (skipped
+    // test) or failed part-way; only release buffers that were created.
+    if (mobjC_ != NULL) {
+        clReleaseMemObject(mobjC_);
+    }
+    if (mobjB_ != NULL) {
+        clReleaseMemObject(mobjB_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Tell whether the M x N matrix of the test case is small enough for the
+ * device: its size in bytes must be below both a third of the available
+ * global memory and the maximum size of a single allocation.
+ */
+template <typename ElemType> bool
+GemvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    const size_t rows = params->M;
+    const size_t cols = params->N;
+
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const size_t globalMem = (size_t)blasBase->availGlobalMemSize(0);
+    const size_t maxAlloc = (size_t)blasBase->maxMemAllocSize();
+
+    // Upper bound for the matrix, in bytes
+    const size_t limit = std::min(globalMem / 3, maxAlloc);
+
+    return (rows * cols * sizeof(ElemType) < limit);
+}
+
+/*
+ * Generate the host data and create the OpenCL buffers for the test.
+ *
+ * Returns 0 when all three buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+GemvPerformanceTest<ElemType>::prepare(void)
+{
+    size_t lenX, lenY;
+    bool useAlpha = base_->useAlpha();
+    bool useBeta = base_->useBeta();
+
+    if (useAlpha) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+    if (useBeta) {
+        beta_ = convertMultiplier<ElemType>(params_.beta);
+    }
+
+    // The lengths of the two vectors swap when A is transposed
+    if (params_.transA == clblasNoTrans) {
+        lenX = params_.N;
+        lenY = params_.M;
+    }
+    else {
+        lenX = params_.M;
+        lenY = params_.N;
+    }
+    randomGemmxMatrices<ElemType>(params_.order, params_.transA, params_.transB,
+                           params_.transC, lenY, params_.K, lenX, useAlpha,
+                           &alpha_, A_, params_.lda, B_, params_.ldb, useBeta,
+                           &beta_, C_, params_.ldc);
+
+    // backC_ is what gets uploaded to the device (here and before every
+    // clBLAS run) and what etalonPerfSingle() copies back over C_, but it
+    // was never filled in. Seed it from the freshly generated C_ so that
+    // no indeterminate values are read.
+    memcpy(backC_, C_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA *
+                                     sizeof(*A_), params_.offA * sizeof(*A_),
+                                     CL_MEM_READ_ONLY);
+    mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB *
+                                     sizeof(*B_), 0, CL_MEM_READ_ONLY);
+    mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC *
+                                     sizeof(*backC_), 0, CL_MEM_READ_WRITE);
+
+    return ((mobjA_ != NULL) && (mobjB_ != NULL) && (mobjC_ != NULL)) ? 0 : -1;
+}
+
+/*
+ * Reference ("etalon") timing for GEMV.
+ *
+ * No reference implementation is actually called here: the function
+ * returns NANOTIME_MAX when built with PERF_TEST_WITH_ACML and 0
+ * otherwise. Unless PERF_TEST_WITH_ROW_MAJOR is defined, a row major
+ * test case is rejected with NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+GemvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+    size_t lda;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    // Restore the working copy of C from the backup
+    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+    order = params_.order;
+    lda = params_.lda;
+
+#ifdef PERF_TEST_WITH_ACML
+
+// #warning "GEMV performance test not implemented"
+    time = NANOTIME_MAX;
+    // Self-assignments: presumably only here to keep the otherwise unused
+    // locals from triggering compiler warnings
+    order = order;
+    lda = lda;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Time one clBLAS GEMV call.
+ *
+ * The device copy of the output vector is first restored from backC_.
+ * The routine is then enqueued and flushed, and the time spent waiting
+ * for its completion is returned. Any failure yields NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+GemvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  params_.rowsC * params_.columnsC *
+                                  sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector Y buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // Whether or not the wait succeeded, the write event is not needed any
+    // more. It used to be overwritten with NULL without being released,
+    // which leaked one event per call.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = (cl_int)clMath::clblas::gemv(params_.order,
+        params_.transA, params_.M, params_.N, alpha_, mobjA_, params_.offA,
+        params_.lda, mobjB_, params_.offBX, params_.incx, beta_, mobjC_,
+        params_.offCY, params_.incy, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS GEMV function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    // Only the wait for completion is measured
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+
+    return time;
+}
+
+} // namespace clMath
+
+// sgemv performance test: float elements, FN_SGEMV
+TEST_P(GEMV, sgemv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GemvPerformanceTest<float>::runInstance(FN_SGEMV, &testParams);
+}
+
+// dgemv performance test: double elements, FN_DGEMV
+TEST_P(GEMV, dgemv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GemvPerformanceTest<double>::runInstance(FN_DGEMV, &testParams);
+}
+
+// cgemv performance test: FloatComplex elements, FN_CGEMV
+TEST_P(GEMV, cgemv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GemvPerformanceTest<FloatComplex>::runInstance(FN_CGEMV, &testParams);
+}
+
+// zgemv performance test: DoubleComplex elements, FN_ZGEMV
+TEST_P(GEMV, zgemv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GemvPerformanceTest<DoubleComplex>::runInstance(FN_ZGEMV, &testParams);
+}
diff --git a/src/tests/performance/perf-ger.cpp b/src/tests/performance/perf-ger.cpp
new file mode 100644
index 0000000..0a60954
--- /dev/null
+++ b/src/tests/performance/perf-ger.cpp
@@ -0,0 +1,393 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * GER performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <ger.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Check the result code of a performance run: a negative value fails the
+ * enclosing test immediately (ASSERT_GE), any other non-zero value is
+ * recorded as a failure without aborting it (EXPECT_EQ).
+ * The do { } while (0) wrapper lets the macro be used as one statement.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the GER functions, parameterized on the
+ * element type. Instances are created and run through runInstance().
+ */
+template <typename ElemType> class GerPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~GerPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Create a test case for 'fn' and run it with an operation factor
+     * of 1. Nothing is run (and nothing is checked) when 'fn' needs
+     * double precision the device does not offer, or when the resource
+     * check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        GerPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+
+        const bool needsDouble = (fn == FN_DGER) || (fn == FN_ZGERU);
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        const int ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    GerPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;
+    ElemType *A_;               // host arrays allocated in the constructor
+    ElemType *backA_;           // copy of A_ taken in prepare()
+    ElemType *x_;
+    ElemType *y_;
+    cl_mem mobjA_;              // OpenCL buffers created in prepare()
+    cl_mem mobjx_;
+    size_t  lengthA;            // number of elements of A, offset excluded
+    cl_mem mobjy_;
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Copy the test parameters, leave the OpenCL buffer handles empty and
+ * allocate the host arrays for A, its backup and the two vectors.
+ */
+template <typename ElemType>
+GerPerformanceTest<ElemType>::GerPerformanceTest(
+    BlasFunction fn,
+    TestParams *params)
+    : PerformanceTest(fn, (problem_size_t) ( ( (3 * params->M * params->N) + params->M )  * sizeof(ElemType) ) ),
+      params_(*params),
+      mobjA_(NULL),
+      mobjx_(NULL),
+      mobjy_(NULL)
+{
+    // lda is multiplied by the number of columns for column major order
+    // and by the number of rows otherwise
+    lengthA = ((params_.order == clblasColumnMajor) ? params_.N : params_.M) * params_.lda;
+
+    // The matrix and its backup both leave room for the offset
+    A_ = new ElemType[lengthA + params_.offa];
+    backA_ = new ElemType[lengthA + params_.offa];
+
+    // Each vector spans 1 + (count - 1) * |increment| elements plus its offset
+    x_ = new ElemType[(1 + (params->M - 1) * abs(params_.incx)) + params_.offBX];
+    y_ = new ElemType[(1 + (params->N - 1) * abs(params_.incy)) + params_.offCY];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host arrays and release whichever OpenCL buffers exist.
+ */
+template <typename ElemType>
+GerPerformanceTest<ElemType>::~GerPerformanceTest()
+{
+    // delete[] on a null pointer does nothing, so no checks are needed
+    delete[] A_;
+    delete[] x_;
+    delete[] y_;
+    delete[] backA_;
+
+    // A buffer handle is NULL until prepare() has created the buffer
+    if (mobjy_ != NULL) {
+        clReleaseMemObject(mobjy_);
+    }
+    if (mobjx_ != NULL) {
+        clReleaseMemObject(mobjx_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Each of the three operands (A, bounded by max(M, N) * lda elements,
+ * and the vectors x and y) must be smaller than the maximum single
+ * allocation, and all three together must be smaller than the available
+ * global memory. Returns false as well when a host array is missing.
+ */
+template <typename ElemType> bool
+GerPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t m = params->M, n = params->N;
+
+    if ((A_ == NULL) || (backA_ == NULL) || (x_ == NULL) || (y_ == NULL))
+    {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    // Operand sizes in elements
+    const size_t elemsA = std::max(m, n) * params_.lda;
+    const size_t elemsX = 1 + (params_.M - 1) * abs(params_.incx);
+    const size_t elemsY = 1 + (params_.N - 1) * abs(params_.incy);
+
+    bool ret = elemsA * sizeof(ElemType) < allocSize;
+    ret = ret && (elemsX * sizeof(ElemType) < allocSize);
+    ret = ret && (elemsY * sizeof(ElemType) < allocSize);
+
+    // gmemSize is in bytes, so the total has to be converted to bytes too;
+    // the element count alone was compared against it before.
+    ret = ret && ((elemsA + elemsX + elemsY) * sizeof(ElemType) < gmemSize);
+
+    return ret;
+}
+
+/*
+ * Fill the host arrays, keep a copy of A in backA_ and create the three
+ * OpenCL buffers. Returns 0 when all buffers exist, -1 otherwise.
+ */
+template <typename ElemType> int
+GerPerformanceTest<ElemType>::prepare(void)
+{
+    if (base_->useAlpha()) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+
+    int creationFlags = 0 | RANDOM_INIT;
+    if (params_.order == clblasRowMajor) {
+        creationFlags = creationFlags | ROW_MAJOR_ORDER;
+    }
+    BlasRoutineID BlasFn = CLBLAS_GER;
+
+    // Element counts of the three host arrays, offsets included
+    const size_t elemsA = lengthA + params_.offa;
+    const size_t elemsX = 1 + (params_.M-1) * abs(params_.incx) + params_.offBX;
+    const size_t elemsY = 1 + (params_.N-1) * abs(params_.incy) + params_.offCY;
+
+    // A is filled past its offset; the vectors are filled from their start
+    populate( A_ + params_.offa, params_.M, params_.N, params_.lda, BlasFn, creationFlags);
+    populate( x_, elemsX, 1, elemsX, BlasFn, creationFlags );
+    populate( y_, elemsY, 1, elemsY, BlasFn, creationFlags );
+
+    memcpy(backA_, A_, elemsA * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, elemsA * sizeof(*A_), 0, CL_MEM_READ_WRITE);
+    mobjx_ = base_->createEnqueueBuffer(x_, elemsX * sizeof(*x_), 0, CL_MEM_READ_WRITE);
+    mobjy_ = base_->createEnqueueBuffer(y_, elemsY * sizeof(*y_), 0, CL_MEM_READ_WRITE);
+
+    if ((mobjA_ == NULL) || (mobjx_ == NULL) || (mobjy_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time the reference GER implementation.
+ *
+ * The reference is only called when built with PERF_TEST_WITH_ACML;
+ * otherwise 0 is returned. Unless PERF_TEST_WITH_ROW_MAJOR is defined, a
+ * row major test case is rejected with NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+GerPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+#ifdef PERF_TEST_WITH_ACML
+
+    clblasOrder fOrder = params_.order;
+    size_t fM = params_.M;
+    size_t fN = params_.N;
+    int fIncx = params_.incx;
+    int fIncy = params_.incy;
+    ElemType *fX = x_;
+    ElemType *fY = y_;
+    size_t fOffx = params_.offBX;
+    size_t fOffy = params_.offCY;
+
+    // A row major problem is handed over as the column major problem with
+    // the dimensions and the two vectors exchanged
+    if (fOrder != clblasColumnMajor) {
+        fOrder = clblasColumnMajor;
+        fM = params_.N;
+        fN = params_.M;
+        fX = y_;
+        fY = x_;
+        fIncx = params_.incy;
+        fIncy = params_.incx;
+        fOffx = params_.offCY;
+        fOffy = params_.offBX;
+    }
+
+    time = getCurrentTime();
+    // Pass the converted order: the exchanged operands above only describe
+    // the same update together with fOrder, not with the original order.
+    clMath::blas::ger(fOrder, fM, fN, alpha_, fX, fOffx, fIncx, fY, fOffy, fIncy,
+                      A_, params_.offa, params_.lda);
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Time the clBLAS GER call.
+ *
+ * The device copy of A is first restored from backA_. With TIMING defined
+ * (it is defined right here) the routine is enqueued 20 times, the queue
+ * is finished and the average time of one call is returned. Any failure
+ * yields NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+GerPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
+                                  (lengthA + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is not needed any more, whether or not the wait
+    // succeeded; release it instead of just dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    // Drain the queue before starting the clock, so that only the calls
+    // below are measured
+    status = clFinish(queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+
+    status = (cl_int)clMath::clblas::ger(params_.order, params_.M, params_.N, alpha_,
+         mobjx_, params_.offBX, params_.incx, mobjy_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1,
+        &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS GER function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+#ifdef TIMING
+        // Completion is awaited with clFinish() below, so the per-call
+        // event can be released right away rather than leaked.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    status = clFinish(queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// ger performance tests
+
+// sger: float elements, FN_SGER
+TEST_P(GER, sger)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GerPerformanceTest<float>::runInstance(FN_SGER, &testParams);
+}
+
+// dger: double elements, FN_DGER
+TEST_P(GER, dger)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GerPerformanceTest<double>::runInstance(FN_DGER, &testParams);
+}
+
+// cgeru: FloatComplex elements, FN_CGERU
+TEST_P(GER, cgeru)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GerPerformanceTest<FloatComplex>::runInstance(FN_CGERU, &testParams);
+}
+
+// zgeru: DoubleComplex elements, FN_ZGERU
+TEST_P(GER, zgeru)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GerPerformanceTest<DoubleComplex>::runInstance(FN_ZGERU, &testParams);
+}
diff --git a/src/tests/performance/perf-gerc.cpp b/src/tests/performance/perf-gerc.cpp
new file mode 100644
index 0000000..3c4873d
--- /dev/null
+++ b/src/tests/performance/perf-gerc.cpp
@@ -0,0 +1,384 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * GERC performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <gerc.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Check the result code of a performance run: a negative value fails the
+ * enclosing test immediately (ASSERT_GE), any other non-zero value is
+ * recorded as a failure without aborting it (EXPECT_EQ).
+ * The do { } while (0) wrapper lets the macro be used as one statement.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the GERC functions, parameterized on the
+ * element type. Instances are created and run through runInstance().
+ */
+template <typename ElemType> class GercPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~GercPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Create a test case for 'fn' and run it with an operation factor
+     * of 1. Nothing is run (and nothing is checked) when 'fn' is FN_ZGERC
+     * and the device has no double precision support, or when the
+     * resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        GercPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        opFactor = 1;
+
+        if (fn == FN_ZGERC &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    GercPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;
+    ElemType *A_;               // host arrays allocated in the constructor
+    ElemType *backA_;           // copy of A_ taken in prepare()
+    ElemType *x_;
+    ElemType *y_;
+    cl_mem mobjA_;              // OpenCL buffers created in prepare()
+    cl_mem mobjx_;
+    cl_mem mobjy_;
+    // Number of elements of A, offset excluded. Holds the product of a
+    // matrix dimension and lda, which does not fit an int for large
+    // matrices, hence size_t (as in the GER test).
+    size_t lengthA;
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Copy the test parameters, leave the OpenCL buffer handles empty and
+ * allocate the host arrays for A, its backup and the two vectors.
+ */
+template <typename ElemType>
+GercPerformanceTest<ElemType>::GercPerformanceTest(
+    BlasFunction fn,
+    TestParams *params)
+    : PerformanceTest(fn, (problem_size_t) (((2 *  params->M * params->N) +  params->M + params->N ) * sizeof(ElemType) ) ),
+      params_(*params),
+      mobjA_(NULL),
+      mobjx_(NULL),
+      mobjy_(NULL)
+{
+    // lda is multiplied by the number of columns for column major order
+    // and by the number of rows otherwise
+    lengthA = ((params_.order == clblasColumnMajor) ? params_.N : params_.M) * params_.lda;
+
+    // The matrix and its backup both leave room for the offset
+    A_ = new ElemType[(lengthA) + params_.offa];
+    backA_ = new ElemType[lengthA + params_.offa];
+
+    // Each vector spans 1 + (count - 1) * |increment| elements plus its offset
+    x_ = new ElemType[(1 + (params->M - 1) * abs(params_.incx)) + params_.offBX];
+    y_ = new ElemType[(1 + (params->N - 1) * abs(params_.incy)) + params_.offCY];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host arrays and release whichever OpenCL buffers exist.
+ */
+template <typename ElemType>
+GercPerformanceTest<ElemType>::~GercPerformanceTest()
+{
+    // delete[] on a null pointer does nothing, so no checks are needed
+    delete[] A_;
+    delete[] x_;
+    delete[] y_;
+    delete[] backA_;
+
+    // A buffer handle is NULL until prepare() has created the buffer
+    if (mobjy_ != NULL) {
+        clReleaseMemObject(mobjy_);
+    }
+    if (mobjx_ != NULL) {
+        clReleaseMemObject(mobjx_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Each of the three operands (A, bounded by max(M, N) * lda elements,
+ * and the vectors x and y) must be smaller than the maximum single
+ * allocation, and all three together must be smaller than the available
+ * global memory. Returns false as well when a host array is missing.
+ */
+template <typename ElemType> bool
+GercPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t m = params->M, n = params->N;
+
+    if ((A_ == NULL) || (backA_ == NULL) || (x_ == NULL) || (y_ == NULL))
+    {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    // Operand sizes in elements
+    const size_t elemsA = std::max(m, n) * params_.lda;
+    const size_t elemsX = 1 + (params_.M - 1) * abs(params_.incx);
+    const size_t elemsY = 1 + (params_.N - 1) * abs(params_.incy);
+
+    bool ret = elemsA * sizeof(ElemType) < allocSize;
+    ret = ret && (elemsX * sizeof(ElemType) < allocSize);
+    ret = ret && (elemsY * sizeof(ElemType) < allocSize);
+
+    // gmemSize is in bytes, so the total has to be converted to bytes too;
+    // the element count alone was compared against it before.
+    ret = ret && ((elemsA + elemsX + elemsY) * sizeof(ElemType) < gmemSize);
+
+    return ret;
+}
+
+/*
+ * Fill the host arrays, keep a copy of A in backA_ and create the three
+ * OpenCL buffers. Returns 0 when all buffers exist, -1 otherwise.
+ */
+template <typename ElemType> int
+GercPerformanceTest<ElemType>::prepare(void)
+{
+    if (base_->useAlpha()) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+
+    int creationFlags = 0 | RANDOM_INIT;
+    if (params_.order == clblasRowMajor) {
+        creationFlags = creationFlags | ROW_MAJOR_ORDER;
+    }
+    BlasRoutineID funcId = CLBLAS_GER;
+
+    // Element counts of the three host arrays, offsets included
+    const size_t elemsA = lengthA + params_.offa;
+    const size_t elemsX = 1 + (params_.M-1) * abs(params_.incx) + params_.offBX;
+    const size_t elemsY = 1 + (params_.N-1) * abs(params_.incy) + params_.offCY;
+
+    // Only A gets the creation flags; the vectors are populated with flags 0
+    populate( A_ + params_.offa, params_.M, params_.N, params_.lda, funcId, creationFlags);
+    populate( x_ , elemsX, 1, elemsX, funcId, 0 );
+    populate( y_ , elemsY, 1, elemsY, funcId, 0 );
+
+    memcpy(backA_, A_, elemsA * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, elemsA * sizeof(*A_), 0, CL_MEM_READ_WRITE);
+    mobjx_ = base_->createEnqueueBuffer(x_, elemsX * sizeof(*x_), 0, CL_MEM_READ_WRITE);
+    mobjy_ = base_->createEnqueueBuffer(y_, elemsY * sizeof(*y_), 0, CL_MEM_READ_WRITE);
+
+    if ((mobjA_ == NULL) || (mobjx_ == NULL) || (mobjy_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time the reference GERC implementation.
+ *
+ * The reference is only called when built with PERF_TEST_WITH_ACML;
+ * otherwise 0 is returned. Unless PERF_TEST_WITH_ROW_MAJOR is defined, a
+ * row major test case is rejected with NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+GercPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+    size_t lda;
+    //int fIncx, fIncy;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    order = params_.order;
+    lda = params_.lda;
+
+#ifdef PERF_TEST_WITH_ACML
+
+	 clblasOrder fOrder;
+    size_t fN, fM;
+    size_t fOffx, fOffy;
+    int fIncx, fIncy;
+    ElemType  *fX, *fY;
+    fOrder = params_.order;
+    fM = params_.M;
+    fN = params_.N;
+    fIncx = params_.incx;
+    fIncy = params_.incy;
+    fX = x_;
+    fY = y_;
+    fOffx = params_.offBX;
+    fOffy = params_.offCY;
+
+    // Row major: conjugate y in place, exchange the dimensions and the two
+    // vectors, and call ger() as a column major problem instead of gerc().
+    // NOTE(review): y_ is conjugated on every call and never restored here,
+    // so its contents alternate between calls -- confirm this is intended.
+    if (fOrder != clblasColumnMajor) {
+           fOrder = clblasColumnMajor;
+
+		   doConjugate( (y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );
+           fM = params_.N;
+           fN = params_.M;
+           fX = y_;
+           fY = x_;
+           fIncx = params_.incy;
+           fIncy = params_.incx;
+           fOffx = params_.offCY;
+           fOffy = params_.offBX;
+		   // Note this according to the Legacy guide
+		   time = getCurrentTime();
+			clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy,  A_, params_.offa, params_.lda);
+       }
+	else{
+		// Column major: nothing was exchanged, gerc() is timed directly
+		time = getCurrentTime();
+		clMath::blas::gerc(order, fM, fN, alpha_, fX, fOffx, params_.incx, fY, fOffy, params_.incy,  A_, params_.offa, lda);
+	}
+    // 'time' holds the start timestamp set in the branch taken above
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Time the clBLAS GERC call.
+ *
+ * The device copy of A is first restored from backA_. With TIMING defined
+ * (it is defined right here) the routine is enqueued 20 times, the queue
+ * is finished and the average time of one call is returned. Any failure
+ * yields NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+GercPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
+                                  (lengthA + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is not needed any more, whether or not the wait
+    // succeeded; release it instead of just dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    // Drain the queue before starting the clock, so that only the calls
+    // below are measured
+    status = clFinish(queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+
+    status = (cl_int)clMath::clblas::gerc(params_.order, params_.M, params_.N, alpha_,
+         mobjx_, params_.offBX, params_.incx, mobjy_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1,
+        &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS GERC function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+#ifdef TIMING
+        // Completion is awaited with clFinish() below, so the per-call
+        // event can be released right away rather than leaked.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    status = clFinish(queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+
+// cgerc performance test: FloatComplex elements, FN_CGERC
+TEST_P(GERC, cgerc)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GercPerformanceTest<FloatComplex>::runInstance(FN_CGERC, &testParams);
+}
+
+// zgerc performance test: DoubleComplex elements, FN_ZGERC
+TEST_P(GERC, zgerc)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    GercPerformanceTest<DoubleComplex>::runInstance(FN_ZGERC, &testParams);
+}
diff --git a/src/tests/performance/perf-hbmv.cpp b/src/tests/performance/perf-hbmv.cpp
new file mode 100644
index 0000000..57bb352
--- /dev/null
+++ b/src/tests/performance/perf-hbmv.cpp
@@ -0,0 +1,321 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Hbmv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <gbmv.h>
+#include <hbmv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Checks the result code of a performance run: a negative value raises a
+ * fatal gtest failure (ASSERT_GE), any other non-zero value raises a
+ * non-fatal one (EXPECT_EQ). Wrapped in do { } while (0) so that it can be
+ * used as a single statement.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the HBMV routine.
+ *
+ * Owns the host copies of matrix A and vectors X and Y (plus a backup of
+ * Y) together with the matching OpenCL memory objects. Instances are only
+ * created through runInstance().
+ */
+template <typename ElemType> class HbmvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~HbmvPerformanceTest();
+
+    // Fills the host data and creates the OpenCL buffers; 0 on success.
+    virtual int prepare(void);
+    // Times the reference implementation.
+    virtual nano_time_t etalonPerfSingle(void);
+    // Times the clBLAS implementation.
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Builds a test case for function "fn" with the given parameters, runs
+     * it and reports the result through gtest assertions. The case is
+     * skipped when the device lacks double precision support (FN_ZHBMV
+     * only) or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        HbmvPerformanceTest<ElemType> perfCase(fn, params);
+        int opFactor = 1;
+        int ret = 0;
+        BlasBase *base = clMath::BlasBase::getInstance();
+
+        const bool needsDouble = (fn == FN_ZHBMV);
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            // ret stays 0, so the checks below pass for a skipped case.
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    HbmvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    // Compares the required buffer sizes against the device limits.
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;     // private copy of the test parameters
+    ElemType alpha;         // scalar multipliers converted from params_
+    ElemType beta;
+    ElemType *A_;           // host matrix A
+    ElemType *X_;           // host vector X
+    ElemType *Y_;           // host vector Y
+    ElemType *backY_;       // backup of the initial contents of Y
+    cl_mem mobjA_;          // OpenCL buffers for A, X and Y
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Allocates the host buffers for A, X, Y and the backup of Y, and converts
+ * the alpha/beta multipliers to the element type.
+ *
+ * The vector lengths are computed from the absolute value of the
+ * increments, exactly as prepare() does, so that negative increments do
+ * not wrap around when converted to size_t.
+ */
+template <typename ElemType>
+HbmvPerformanceTest<ElemType>::HbmvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,
+    (problem_size_t)( ( (2 * (params->N) * (params->K + 1)   // A-access
+                          - (2 * params->K *  (params->K+1)) )       // Subtract hole-part for A & X
+                        +( ((2*params->K + 1) * params->N + 2*params->N))   // X & Y access
+                      ) * sizeof(ElemType) ) ),
+    params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    const size_t lenA = (params_.N) * (params_.lda) + params_.offA;
+    const size_t lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+    const size_t lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
+
+    A_ = new ElemType[ lenA ];
+    X_ = new ElemType[ lenX ];
+    Y_ = new ElemType[ lenY ];
+    backY_ = new ElemType[ lenY ];
+
+    alpha = convertMultiplier<ElemType>(params_.alpha);
+    beta  = convertMultiplier<ElemType>(params_.beta);
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Frees the host buffers and releases any OpenCL memory objects that
+ * were created.
+ */
+template <typename ElemType>
+HbmvPerformanceTest<ElemType>::~HbmvPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so no explicit guards here.
+    delete[] A_;
+    delete[] X_;
+    delete[] backY_;
+    delete[] Y_;
+
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Returns true when matrix A fits into a single allocation and A, X and Y
+ * together fit into the available global memory. Vector sizes are based
+ * on the absolute value of the increments so that negative increments do
+ * not wrap around in the unsigned arithmetic.
+ */
+template <typename ElemType> bool
+HbmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    if ((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) {
+        return false;
+    }
+
+    const size_t n = params->N;
+    const size_t bytesA = (n * params->lda + params->offA) * sizeof(ElemType);
+    const size_t bytesX =
+        ((n - 1) * abs(params->incx) + 1 + params->offBX) * sizeof(ElemType);
+    const size_t bytesY =
+        ((n - 1) * abs(params->incy) + 1 + params->offCY) * sizeof(ElemType);
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+
+    return (bytesA < allocSize) && ((bytesA + bytesX + bytesY) < gmemSize);
+}
+
+/*
+ * Generates the input data, saves a backup of Y and creates the OpenCL
+ * buffers for A, X and Y.
+ *
+ * Returns 0 when all three buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+HbmvPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t elemsA = (params_.N ) * params_.lda + params_.offA;
+    const size_t elemsX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX;
+    const size_t elemsY = (params_.N - 1)*abs(params_.incy) + 1 + params_.offCY;
+
+    randomGbmvMatrices(params_.order, clblasNoTrans, params_.N, params_.N, &alpha, &beta,
+                       (A_+params_.offA), params_.lda,
+                       (X_+params_.offBX), params_.incx,
+                       (Y_+params_.offCY), params_.incy );
+
+    // Keep the initial Y so that every timed run starts from the same data.
+    memcpy(backY_, Y_, elemsY * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, elemsA * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, elemsX * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(backY_, elemsY * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL) || (mobjY_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Restores Y from its backup and, when PERF_TEST_WITH_ACML is defined,
+ * times one call of the reference hbmv implementation.
+ *
+ * Returns the elapsed time, or 0 when the reference implementation is
+ * not compiled in. The length of Y uses the absolute value of incy so
+ * that the copy size matches the allocation made for negative increments.
+ */
+template <typename ElemType> nano_time_t
+HbmvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder fOrder;
+    clblasUplo fUplo;
+    size_t lda, lenY;
+    size_t fN = params_.N, fK = params_.K;
+
+    lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
+
+    memcpy(Y_, backY_, lenY * sizeof(ElemType));
+    fOrder = params_.order;
+    fUplo = params_.uplo;
+    lda = params_.lda;
+
+    // The reference routine is always called in column major order; the
+    // triangle selector is flipped to compensate for a row major input.
+    if (fOrder != clblasColumnMajor)
+    {
+        fOrder = clblasColumnMajor;
+        fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower;
+    }
+
+
+#ifdef PERF_TEST_WITH_ACML
+
+    time = getCurrentTime();
+    clMath::blas::hbmv(fOrder, fUplo, fN, fK, alpha, A_, params_.offA, lda,
+                       X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Restores the device copy of Y from the backup and times the clBLAS
+ * hbmv call, averaged over a fixed number of iterations.
+ *
+ * Returns the average elapsed time per call, or NANOTIME_ERR when an
+ * OpenCL or clBLAS request fails. Every event handed back by OpenCL or
+ * clBLAS is released here, since nothing else consumes it.
+ */
+template <typename ElemType> nano_time_t
+HbmvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    // |incy| keeps the size consistent with the buffer created in prepare().
+    const size_t lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  lenY * sizeof(ElemType), backY_, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "Vector Y buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is not needed any more, whatever the wait returned.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    const int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+        status = clMath::clblas::hbmv(params_.order, params_.uplo, params_.N, params_.K,
+                                        alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
+                                        beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event);
+
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS HBMV function failed, status = " <<
+                    status << endl;
+            return NANOTIME_ERR;
+        }
+
+        // Each call hands back a new event; release it so that it is not
+        // overwritten and leaked by the next iteration.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    }
+    clFinish( queue );
+    time = getCurrentTime() - time;
+    time /= iter;
+
+    return time;
+}
+
+} // namespace clMath
+
+// chbmv performance test case (FloatComplex elements)
+TEST_P(HBMV, chbmv)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    HbmvPerformanceTest<FloatComplex>::runInstance(FN_CHBMV, &testArgs);
+}
+
+// zhbmv performance test case (DoubleComplex elements)
+TEST_P(HBMV, zhbmv)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    HbmvPerformanceTest<DoubleComplex>::runInstance(FN_ZHBMV, &testArgs);
+}
diff --git a/src/tests/performance/perf-hemm.cpp b/src/tests/performance/perf-hemm.cpp
new file mode 100644
index 0000000..6c28ecf
--- /dev/null
+++ b/src/tests/performance/perf-hemm.cpp
@@ -0,0 +1,371 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <hemm.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+//#define SHUNT_ACML_RUN
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Checks the result code of a performance run: a negative value raises a
+ * fatal gtest failure (ASSERT_GE), any other non-zero value raises a
+ * non-fatal one (EXPECT_EQ). Wrapped in do { } while (0) so that it can be
+ * used as a single statement.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the HEMM routine.
+ *
+ * Owns the host copies of matrices A, B and C (plus a backup of C)
+ * together with the matching OpenCL memory objects. Instances are only
+ * created through runInstance().
+ */
+template <typename ElemType> class HemmPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~HemmPerformanceTest();
+
+    // Fills the host data and creates the OpenCL buffers.
+    virtual int prepare(void);
+    // Times the reference implementation.
+    virtual nano_time_t etalonPerfSingle(void);
+    // Times the clBLAS implementation.
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Builds a test case for function "fn" with the given parameters, runs
+     * it and reports the result through gtest assertions. The case is
+     * skipped when the device lacks double precision support (FN_ZHEMM
+     * only) or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        HemmPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor = 8;
+        BlasBase *base = clMath::BlasBase::getInstance();
+
+        const bool needsDouble = (fn == FN_ZHEMM);
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    HemmPerformanceTest(BlasFunction fn, TestParams *params);
+
+    // Compares the required buffer sizes against the device limits.
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;     // private copy of the test parameters
+    ElemType alpha_;        // scalar multipliers passed to hemm
+    ElemType beta_;
+    ElemType *A_;           // host matrices
+    ElemType *B_;
+    ElemType *C_;
+    ElemType *backC_;       // backup of the initial contents of C
+    cl_mem mobjA_;          // OpenCL buffers for A, B and C
+    cl_mem mobjB_;
+    cl_mem mobjC_;
+    size_t ka, kbc;         // second dimension used to size A and B/C
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Derives the matrix dimensions from side/order, converts the alpha/beta
+ * multipliers to the element type and allocates the host buffers for A,
+ * B, C and the backup of C.
+ *
+ * alpha_ and beta_ are initialized here because both timing methods read
+ * them and nothing else in the class assigns them.
+ */
+template <typename ElemType>
+HemmPerformanceTest<ElemType>::HemmPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,
+        (problem_size_t) ( params->M * params->N * ( (params->side == clblasLeft)? params->M : params->N ) ) ),
+    params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL)
+{
+    // A is a square ka x ka matrix whose order depends on the side.
+    ka = (params_.side == clblasLeft) ? params_.M : params_.N;
+    // Number of columns (column major) or rows (row major) of B and C.
+    kbc = (params_.order == clblasColumnMajor) ? params_.N : params_.M;
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    beta_  = convertMultiplier<ElemType>(params_.beta);
+
+    A_ = new ElemType[params_.lda * ka + params_.offA];
+    B_ = new ElemType[params_.ldb * kbc + params_.offBX];
+    C_ = new ElemType[params_.ldc * kbc + params_.offCY];
+    backC_ = new ElemType[params_.ldc * kbc + params_.offCY];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Frees the host buffers and releases any OpenCL memory objects that
+ * were created.
+ */
+template <typename ElemType>
+HemmPerformanceTest<ElemType>::~HemmPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so no explicit guards here.
+    delete[] A_;
+    delete[] B_;
+    delete[] C_;
+    delete[] backC_;
+
+    if (mobjC_ != NULL) {
+        clReleaseMemObject(mobjC_);
+    }
+    if (mobjB_ != NULL) {
+        clReleaseMemObject(mobjB_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Each matrix is estimated as max(M, N) * ld elements. Returns true when
+ * every matrix fits into a single allocation and all three together fit
+ * into the available global memory. All comparisons are made in bytes.
+ */
+template <typename ElemType> bool
+HemmPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    if ((A_ == NULL) || (backC_ == NULL) || (C_ == NULL) || (B_ == NULL)) {
+        return false;
+    }
+
+    const size_t m = params->M, n = params->N;
+    const size_t maxDim = std::max(m, n);
+    const size_t bytesA = maxDim * params->lda * sizeof(ElemType);
+    const size_t bytesB = maxDim * params->ldb * sizeof(ElemType);
+    const size_t bytesC = maxDim * params->ldc * sizeof(ElemType);
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+
+    return (bytesA < allocSize) &&
+           (bytesB < allocSize) &&
+           (bytesC < allocSize) &&
+           ((bytesA + bytesB + bytesC) < gmemSize);
+}
+
+/*
+ * Populates A, B and C with random data, saves a backup of C and creates
+ * the OpenCL buffers for A, B and C.
+ *
+ * Returns 0 when all three buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+HemmPerformanceTest<ElemType>::prepare(void)
+{
+    int creationFlags = RANDOM_INIT;
+    if (params_.order == clblasRowMajor) {
+        creationFlags |= ROW_MAJOR_ORDER;
+    }
+    // Only the triangle selected by uplo is populated for matrix A.
+    const int AcreationFlags = creationFlags |
+        ((params_.uplo == clblasLower) ? LOWER_HALF_ONLY : UPPER_HALF_ONLY);
+    BlasRoutineID BlasFn = CLBLAS_HEMM;
+
+    populate( A_ + params_.offA, ka, ka, params_.lda, BlasFn, AcreationFlags);
+    populate( B_ + params_.offBX, params_.M, params_.N, params_.ldb, BlasFn, creationFlags );
+    populate( C_ + params_.offCY, params_.M, params_.N, params_.ldc, BlasFn, creationFlags );
+
+    // Keep the initial C so that every timed run starts from the same data.
+    memcpy( backC_, C_, (kbc * params_.ldc + params_.offCY) * sizeof(ElemType) );
+
+    mobjA_ = base_->createEnqueueBuffer(A_, (params_.lda * ka  + params_.offA) * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjB_ = base_->createEnqueueBuffer(B_, (params_.ldb * kbc + params_.offBX) * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjC_ = base_->createEnqueueBuffer(backC_, (params_.ldc * kbc + params_.offCY) * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+
+    // A failure to create any of the buffers makes the test unusable.
+    return ((mobjA_ != NULL) && (mobjB_ != NULL) && (mobjC_ != NULL)) ? 0 : -1;
+}
+
+/*
+ * Times one call of the reference hemm implementation when
+ * PERF_TEST_WITH_ACML is defined.
+ *
+ * Returns the elapsed time, 0 when the reference implementation is not
+ * compiled in, or NANOTIME_ERR for row major input unless
+ * PERF_TEST_WITH_ROW_MAJOR is defined.
+ */
+template <typename ElemType> nano_time_t
+HemmPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    clblasOrder order = params_.order;
+    clblasUplo fUplo = params_.uplo;
+    clblasSide fSide = params_.side;
+    size_t lda = params_.lda;
+    size_t ldb = params_.ldb;
+    size_t ldc = params_.ldc;
+    size_t fM = params_.M;
+    size_t fN = params_.N;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    // The reference routine is called in column major order: swap the
+    // dimensions and flip side and uplo for a row major input.
+    if (order != clblasColumnMajor) {
+        order = clblasColumnMajor;
+        fM = params_.N;
+        fN = params_.M;
+        fSide = (params_.side == clblasLeft)? clblasRight: clblasLeft;
+        fUplo = (params_.uplo == clblasUpper)? clblasLower: clblasUpper;
+    }
+
+    elapsed = getCurrentTime();
+    #ifndef SHUNT_ACML_RUN
+    clMath::blas::hemm(order, fSide, fUplo, fM, fN, alpha_,
+                       A_, params_.offA, lda, B_, params_.offBX, ldb, beta_, C_, params_.offCY, ldc);
+    #endif
+    elapsed = getCurrentTime() - elapsed;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Restores the device copy of C from the backup and times the clBLAS
+ * hemm call.
+ *
+ * With TIMING defined the call is repeated and the average time per
+ * iteration is returned; otherwise a single call is issued and only the
+ * wait for its completion is timed. Returns NANOTIME_ERR when an OpenCL
+ * or clBLAS request fails.
+ */
+template <typename ElemType> nano_time_t
+HemmPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  (params_.ldc * kbc + params_.offCY) * sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix C buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is not needed any more, whatever the wait returned;
+    // release it instead of just dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+//#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::hemm(params_.order,
+        params_.side, params_.uplo, params_.M, params_.N, alpha_,
+        mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, beta_, mobjC_, params_.offCY, params_.ldc, 1,
+        &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HEMM function failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+#ifdef TIMING
+        // Each iteration hands back a new event; release it so that it is
+        // not overwritten and leaked by the next one.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    // Only the wait for completion is measured in this mode.
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+// HEMM performance case instantiated for FloatComplex elements.
+TEST_P(HEMM, chemm)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    HemmPerformanceTest<FloatComplex>::runInstance(FN_CHEMM, &testArgs);
+}
+
+
+// HEMM performance case instantiated for DoubleComplex elements.
+TEST_P(HEMM, zhemm)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    HemmPerformanceTest<DoubleComplex>::runInstance(FN_ZHEMM, &testArgs);
+}
diff --git a/src/tests/performance/perf-hemv.cpp b/src/tests/performance/perf-hemv.cpp
new file mode 100644
index 0000000..b3fb6d7
--- /dev/null
+++ b/src/tests/performance/perf-hemv.cpp
@@ -0,0 +1,347 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Hemv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <hemv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Checks the result code of a performance run: a negative value raises a
+ * fatal gtest failure (ASSERT_GE), any other non-zero value raises a
+ * non-fatal one (EXPECT_EQ). Wrapped in do { } while (0) so that it can be
+ * used as a single statement.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the HEMV routine.
+ *
+ * Owns the host copies of matrix A and vectors X and Y (plus a backup of
+ * Y) together with the matching OpenCL memory objects. Instances are only
+ * created through runInstance().
+ */
+template <typename ElemType> class HemvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~HemvPerformanceTest();
+
+    // Fills the host data and creates the OpenCL buffers; 0 on success.
+    virtual int prepare(void);
+    // Times the reference implementation.
+    virtual nano_time_t etalonPerfSingle(void);
+    // Times the clBLAS implementation.
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Builds a test case for function "fn" with the given parameters, runs
+     * it and reports the result through gtest assertions. The case is
+     * skipped when the device lacks double precision support (FN_ZHEMV
+     * only) or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        HemvPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor = 1; //FIX-ME
+        BlasBase *base = clMath::BlasBase::getInstance();
+
+        const bool needsDouble = (fn == FN_ZHEMV);
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    HemvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    // Compares the required buffer sizes against the device limits.
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;     // private copy of the test parameters
+    ElemType *A_;           // host matrix A
+    ElemType *X_;           // host vector X
+    ElemType *Y_;           // host vector Y
+    ElemType *backY_;       // backup of the initial contents of Y
+    cl_mem mobjA_;          // OpenCL buffers for A, X and Y
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    ElemType alpha, beta;   // scalar multipliers converted from params_
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Allocates the host buffers for A, X, Y and the backup of Y, and converts
+ * the alpha/beta multipliers to the element type. The OpenCL memory
+ * objects start out as NULL and are created in prepare().
+ */
+template <typename ElemType>
+HemvPerformanceTest<ElemType>::HemvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( ((2 * (( params->N * (params->N)) + params->N)) ) * sizeof(ElemType) ) ) ),
+    params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    const size_t elemsA = params_.N * params_.lda + params_.offA;
+    const size_t elemsX = 1 + (params_.N-1) * abs(params_.incx)  + params_.offBX;
+    const size_t elemsY = 1 + (params_.N-1) * abs(params_.incy)  + params_.offCY;
+
+    A_ = new ElemType[elemsA];
+    X_ = new ElemType[elemsX];
+    Y_ = new ElemType[elemsY];
+    backY_ = new ElemType[elemsY];
+
+    alpha = convertMultiplier<ElemType>(params_.alpha);
+    beta  = convertMultiplier<ElemType>(params_.beta);
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Frees the host buffers and releases any OpenCL memory objects that
+ * were created.
+ */
+template <typename ElemType>
+HemvPerformanceTest<ElemType>::~HemvPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so no explicit guards here.
+    delete[] A_;
+    delete[] X_;
+    delete[] backY_;
+    delete[] Y_;
+
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * The sizes mirror the buffers created in prepare(), offsets included.
+ * Returns true when A, X and Y each fit into a single allocation and all
+ * three together fit into the available global memory.
+ */
+template <typename ElemType> bool
+HemvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    if ((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) {
+        return false;
+    }
+
+    const size_t n = params->N;
+    const size_t bytesA = (n * params->lda + params->offA) * sizeof(ElemType);
+    const size_t bytesX =
+        (1 + (n - 1) * abs(params->incx) + params->offBX) * sizeof(ElemType);
+    const size_t bytesY =
+        (1 + (n - 1) * abs(params->incy) + params->offCY) * sizeof(ElemType);
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+
+    // Limits for individual allocations.
+    const bool eachFits = (bytesA < allocSize) &&
+                          (bytesX < allocSize) &&
+                          (bytesY < allocSize);
+    // Limit for the total global memory footprint.
+    const bool allFit = (bytesA + bytesX + bytesY) < gmemSize;
+
+    return eachFits && allFit;
+}
+
+/*
+ * Generates the input data, saves a backup of Y and creates the OpenCL
+ * buffers for A, X and Y.
+ *
+ * Returns 0 when all three buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+HemvPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t N = params_.N;
+    const size_t lenX = 1 + (N-1) * abs(params_.incx);
+    const size_t lenY = 1 + (N-1) * abs(params_.incy);
+
+    const size_t bytesA = (params_.N * params_.lda + params_.offA)* sizeof(*A_);
+    const size_t bytesX = (lenX + params_.offBX )* sizeof(*X_);
+    const size_t bytesY = (lenY + params_.offCY )* sizeof(*Y_);
+
+    randomHemvMatrices(params_.order, params_.uplo, N, true, &alpha,
+                       (A_ + params_.offA), params_.lda,
+                       (X_ + params_.offBX), params_.incx, true, &beta,
+                       (Y_ + params_.offCY), params_.incy);
+
+    // Keep the initial Y so that every timed run starts from the same data.
+    memcpy(backY_, Y_, (lenY+ params_.offCY )* sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, bytesA, 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, bytesX, 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(Y_, bytesY, 0, CL_MEM_READ_WRITE);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL) || (mobjY_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Times one call of the reference hemv implementation when
+ * PERF_TEST_WITH_ACML is defined.
+ *
+ * Returns the elapsed time, 0 when the reference implementation is not
+ * compiled in, or NANOTIME_ERR for row major input unless
+ * PERF_TEST_WITH_ROW_MAJOR is defined.
+ */
+template <typename ElemType> nano_time_t
+HemvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    clblasOrder order = params_.order;
+    clblasUplo fUplo = params_.uplo;
+    size_t lda = params_.lda;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    // The reference routine is called in column major order: flip uplo
+    // and conjugate A for a row major input.
+    // NOTE(review): doConjugate() appears to modify A_ in place on every
+    // call of this method - confirm this is intended for repeated runs.
+    if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+        fUplo =  (params_.uplo == clblasUpper)? clblasLower : clblasUpper;
+        doConjugate( (A_ + params_.offA), params_.N, params_.N, params_.lda );
+    }
+
+    elapsed = getCurrentTime();
+    clMath::blas::hemv(order, fUplo, params_.N, alpha, A_, params_.offA, lda,
+                       X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
+    elapsed = getCurrentTime() - elapsed;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Restores the device copy of Y from the backup and times the clBLAS
+ * hemv call.
+ *
+ * With TIMING defined (the default, see below) the call is repeated and
+ * the average time per iteration is returned; otherwise a single call is
+ * issued and only the wait for its completion is timed. Returns
+ * NANOTIME_ERR when an OpenCL or clBLAS request fails.
+ */
+template <typename ElemType> nano_time_t
+HemvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    const size_t lenY = 1 + (params_.N-1) * abs(params_.incy);
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  (lenY + params_.offCY )* sizeof(ElemType), backY_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector Y buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is not needed any more, whatever the wait returned;
+    // release it instead of just dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+#define TIMING
+#ifdef TIMING
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::hemv(params_.order, params_.uplo, params_.N, alpha, mobjA_, params_.offA, params_.lda,
+                    mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy,
+                    1, &queue, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HEMV function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+        // Each iteration hands back a new event; release it so that it is
+        // not overwritten and leaked by the next one.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    // Only the wait for completion is measured in this mode.
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+
+// HEMV performance case instantiated for FloatComplex elements.
+TEST_P(HEMV, chemv)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    HemvPerformanceTest<FloatComplex>::runInstance(FN_CHEMV, &testArgs);
+}
+
+// HEMV performance case instantiated for DoubleComplex elements.
+TEST_P(HEMV, zhemv)
+{
+    TestParams testArgs;
+    getParams(&testArgs);
+
+    HemvPerformanceTest<DoubleComplex>::runInstance(FN_ZHEMV, &testArgs);
+}
+
diff --git a/src/tests/performance/perf-her.cpp b/src/tests/performance/perf-her.cpp
new file mode 100644
index 0000000..d7a39ab
--- /dev/null
+++ b/src/tests/performance/perf-her.cpp
@@ -0,0 +1,324 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * HER performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <her.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome of a performance run through gtest: a negative value is
+ * asserted as a fatal error, any other non-zero value is recorded as a
+ * failed expectation.
+ */
+#define CHECK_RESULT(ret)                                                   \
+    do {                                                                    \
+        ASSERT_GE(ret, 0)                                                   \
+            << "Fatal error: can not allocate resources or "                \
+               "perform an OpenCL request!" << endl;                        \
+        EXPECT_EQ(0, ret)                                                   \
+            << "The OpenCL version is slower in the case" << endl;          \
+    } while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the HER routine, parameterized by element type.
+ */
+template <typename ElemType> class HerPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~HerPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for the given function and parameters, run it unless
+     * the device capabilities or its memory rule that out, and report the
+     * result through gtest assertions.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        HerPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *const device = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+
+        // The ZHER variant is skipped on devices without double precision.
+        const bool wantsDouble = (fn == FN_ZHER);
+        if (wantsDouble && !device->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        // A run skipped for lack of resources keeps ret at 0, so the checks
+        // below pass.
+        int ret = 0;
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        CHECK_RESULT(ret);
+    }
+
+private:
+    HerPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;            // scalar multiplier, set in prepare()
+    ElemType *A_;               // host matrix A
+    ElemType *X_;               // host vector X
+    ElemType *backA_;           // copy of A_ taken in prepare()
+    cl_mem mobjA_;              // device buffer for A
+    cl_mem mobjX_;              // device buffer for X
+    ::clMath::BlasBase *base_;  // shared BlasBase instance
+};
+
+/*
+ * Copy the test parameters and allocate the host buffers for matrix A, its
+ * backup copy and vector X. The device buffers stay NULL until prepare().
+ */
+template <typename ElemType>
+HerPerformanceTest<ElemType>::HerPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2 ) * sizeof(ElemType))),
+                          params_(*params), mobjA_(NULL), mobjX_(NULL)
+{
+    // Element counts include the leading offsets into each buffer.
+    const size_t matrixElems = params_.N * params_.lda + params_.offa;
+    const size_t vectorElems = 1 + (params_.N-1) * abs(params_.incx)  + params_.offBX;
+
+    A_ = new ElemType[matrixElems];
+    X_ = new ElemType[vectorElems];
+    backA_ = new ElemType[matrixElems];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host buffers and release the device buffers, if any were created.
+ */
+template <typename ElemType>
+HerPerformanceTest<ElemType>::~HerPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so the host buffers need no guard.
+    delete[] A_;
+    delete[] backA_;
+    delete[] X_;
+
+    // Device handles are released only when they were actually created.
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check whether the available OpenCL resources are sufficient to run the
+ * test case: every buffer must fit into a single allocation and the overall
+ * footprint must fit into the available global memory.
+ */
+template <typename ElemType> bool
+HerPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    if ((A_ == NULL) || (backA_ == NULL) || (X_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *device = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)device->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)device->maxMemAllocSize();
+
+    const size_t n = params->N;
+    const size_t lenX = 1 + (n-1)*abs(params->incx);
+    const size_t bytesA = sizeof(ElemType)*n*params->lda;
+    const size_t bytesX = lenX*sizeof(ElemType);
+
+    // Individual allocations.
+    if (!(bytesA < allocSize) || !(bytesX < allocSize)) {
+        return false;
+    }
+
+    // Total global allocations (the vector length is counted twice).
+    const size_t bytesTotal = (n*params->lda + lenX*2)*sizeof(ElemType);
+    return bytesTotal < gmemSize;
+}
+
+/*
+ * Generate random input data, keep a backup of A and create the device
+ * buffers. Returns 0 when both buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+HerPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t lenX = 1 + (params_.N-1) * abs(params_.incx);
+    const size_t bytesA = (params_.N * params_.lda + params_.offa) * sizeof(ElemType);
+    const size_t bytesX = (lenX + params_.offBX) * sizeof(ElemType);
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    randomHerMatrices( params_.order, params_.uplo, params_.N, &alpha_, (A_ + params_.offa), params_.lda, (X_ + params_.offBX), params_.incx );
+
+    // backA_ holds the generated A; clblasPerfSingle() uploads it to mobjA_.
+    memcpy(backA_, A_, bytesA);
+
+    mobjA_ = base_->createEnqueueBuffer(A_, bytesA, 0, CL_MEM_READ_WRITE);
+    mobjX_ = base_->createEnqueueBuffer(X_, bytesX, 0, CL_MEM_READ_ONLY);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time one call of the reference HER implementation. The reference is only
+ * invoked when PERF_TEST_WITH_ACML is defined; otherwise 0 is returned.
+ * Row-major input is rejected unless PERF_TEST_WITH_ROW_MAJOR is defined.
+ */
+template <typename ElemType> nano_time_t
+HerPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+#ifdef PERF_TEST_WITH_ACML
+
+    const size_t lda = params_.lda;
+    clblasOrder fOrder = params_.order;
+    clblasUplo fUplo = params_.uplo;
+
+    // Row-major input is passed to the reference as column-major with X
+    // conjugated and the triangle swapped.
+    if (fOrder != clblasColumnMajor)
+    {
+        doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
+        fOrder = clblasColumnMajor;
+        fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::her(fOrder, fUplo, params_.N, CREAL(alpha_), X_, params_.offBX, params_.incx, A_, params_.offa, lda);
+    elapsed = getCurrentTime() - start;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS HER implementation.
+ *
+ * The device copy of A is first rewritten from backA_. With TIMING defined
+ * (always the case here, see the #define below) the routine is enqueued
+ * 20 times between two clFinish() calls and the average time per call is
+ * returned. Returns NANOTIME_ERR when any OpenCL or clBLAS call fails.
+ */
+template <typename ElemType> nano_time_t
+HerPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
+                                  ((params_.N * params_.lda) + params_.offa) *
+                                  sizeof(ElemType), backA_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+
+    // The write event is not needed past this point; release it on both the
+    // success and the failure path so that it does not leak.
+    clReleaseEvent(event);
+    event = NULL;
+
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::her(params_.order, params_.uplo, params_.N, CREAL(alpha_), mobjX_, params_.offBX, params_.incx,
+				mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HER function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    // In this mode synchronization relies on the queue-wide clFinish() below,
+    // so the per-call event can be released right away instead of being
+    // overwritten (and leaked) by the next iteration.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+
+    // A failed clFinish() means the measured interval is meaningless.
+    status = clFinish( queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(HER, cher)
+{
+    // Run the HER performance case instantiated for FloatComplex elements.
+    TestParams testParams;
+
+    getParams(&testParams);
+    HerPerformanceTest<FloatComplex>::runInstance(FN_CHER, &testParams);
+}
+
+TEST_P(HER, zher)
+{
+    // Run the HER performance case instantiated for DoubleComplex elements.
+    TestParams testParams;
+
+    getParams(&testParams);
+    HerPerformanceTest<DoubleComplex>::runInstance(FN_ZHER, &testParams);
+}
diff --git a/src/tests/performance/perf-her2.cpp b/src/tests/performance/perf-her2.cpp
new file mode 100644
index 0000000..ababf1e
--- /dev/null
+++ b/src/tests/performance/perf-her2.cpp
@@ -0,0 +1,348 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Her2 performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <her2.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome of a performance run through gtest: a negative value is
+ * asserted as a fatal error, any other non-zero value is recorded as a
+ * failed expectation.
+ */
+#define CHECK_RESULT(ret)                                                   \
+    do {                                                                    \
+        ASSERT_GE(ret, 0)                                                   \
+            << "Fatal error: can not allocate resources or "                \
+               "perform an OpenCL request!" << endl;                        \
+        EXPECT_EQ(0, ret)                                                   \
+            << "The OpenCL version is slower in the case" << endl;          \
+    } while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the HER2 routine, parameterized by element type.
+ */
+template <typename ElemType> class Her2PerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~Her2PerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for the given function and parameters, run it unless
+     * the device capabilities or its memory rule that out, and report the
+     * result through gtest assertions.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        Her2PerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *const device = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+
+        // The ZHER2 variant is skipped on devices without double precision.
+        const bool wantsDouble = (fn == FN_ZHER2);
+        if (wantsDouble && !device->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        // A run skipped for lack of resources keeps ret at 0, so the checks
+        // below pass.
+        int ret = 0;
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        CHECK_RESULT(ret);
+    }
+
+private:
+    Her2PerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;            // scalar multiplier, set in prepare()
+    ElemType *A_;               // host matrix A
+    ElemType *X_;               // host vector X
+    ElemType *Y_;               // host vector Y
+    ElemType *backA_;           // copy of A_ taken in prepare()
+    cl_mem mobjA_;              // device buffer for A
+    cl_mem mobjX_;              // device buffer for X
+    cl_mem mobjY_;              // device buffer for Y
+    ::clMath::BlasBase *base_;  // shared BlasBase instance
+};
+
+/*
+ * Copy the test parameters and allocate the host buffers for matrix A, its
+ * backup copy and the vectors X and Y. The device buffers stay NULL until
+ * prepare().
+ */
+template <typename ElemType>
+Her2PerformanceTest<ElemType>::Her2PerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + (params->N)) * 3 ) * sizeof(ElemType))),
+	params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    // Element counts include the leading offsets into each buffer.
+    const size_t matrixElems = params_.N * params_.lda + params_.offa;
+    const size_t xElems = 1 + (params_.N-1) * abs(params_.incx) + params_.offBX;
+    const size_t yElems = 1 + (params_.N-1) * abs(params_.incy) + params_.offCY;
+
+    A_ = new ElemType[matrixElems];
+    X_ = new ElemType[xElems];
+    Y_ = new ElemType[yElems];
+    backA_ = new ElemType[matrixElems];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host buffers and release the device buffers, if any were created.
+ */
+template <typename ElemType>
+Her2PerformanceTest<ElemType>::~Her2PerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so the host buffers need no guard.
+    delete[] A_;
+    delete[] backA_;
+    delete[] X_;
+    delete[] Y_;
+
+    // Device handles are released only when they were actually created.
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check whether the available OpenCL resources are sufficient to run the
+ * test case: every buffer must fit into a single allocation and the overall
+ * footprint must fit into the available global memory.
+ */
+template <typename ElemType> bool
+Her2PerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    if ((A_ == NULL) || (backA_ == NULL) || (X_ == NULL) || (Y_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *device = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)device->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)device->maxMemAllocSize();
+
+    const size_t n = params->N;
+    const size_t lenX = 1 + (n-1)*abs(params->incx);
+    const size_t lenY = 1 + (n-1)*abs(params->incy);
+    const size_t bytesA = sizeof(ElemType)*n*params->lda;
+    const size_t bytesX = lenX*sizeof(ElemType);
+    const size_t bytesY = lenY*sizeof(ElemType);
+
+    // Individual allocations.
+    if (!(bytesA < allocSize) || !(bytesX < allocSize) || !(bytesY < allocSize)) {
+        return false;
+    }
+
+    // Total global allocations.
+    const size_t bytesTotal = (n*params->lda + lenX + lenY)*sizeof(ElemType);
+    return bytesTotal < gmemSize;
+}
+
+/*
+ * Generate random input data, keep a backup of A and create the device
+ * buffers. Returns 0 when all three buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+Her2PerformanceTest<ElemType>::prepare(void)
+{
+    const size_t lenX = 1 + (params_.N-1) * abs(params_.incx);
+    const size_t lenY = 1 + (params_.N-1) * abs(params_.incy);
+    const size_t bytesA = (params_.N * params_.lda + params_.offa) * sizeof(ElemType);
+    const size_t bytesX = (lenX + params_.offBX) * sizeof(ElemType);
+    const size_t bytesY = (lenY + params_.offCY) * sizeof(ElemType);
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+
+    randomHer2Matrices<ElemType>(params_.order, params_.uplo, params_.N, &alpha_, (A_ + params_.offa), params_.lda,
+                            (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy);
+
+    // backA_ holds the generated A; clblasPerfSingle() uploads it to mobjA_.
+    memcpy(backA_, A_, bytesA);
+
+    mobjA_ = base_->createEnqueueBuffer(A_, bytesA, 0, CL_MEM_READ_WRITE);
+    mobjX_ = base_->createEnqueueBuffer(X_, bytesX, 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(Y_, bytesY, 0, CL_MEM_READ_ONLY);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL) || (mobjY_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time one call of the reference HER2 implementation. The reference is only
+ * invoked when PERF_TEST_WITH_ACML is defined; otherwise 0 is returned.
+ * Row-major input is rejected unless PERF_TEST_WITH_ROW_MAJOR is defined.
+ */
+template <typename ElemType> nano_time_t
+Her2PerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+#ifdef PERF_TEST_WITH_ACML
+
+    const size_t lda = params_.lda;
+    clblasOrder order = params_.order;
+    clblasUplo fUplo = params_.uplo;
+
+    // Column-major input is passed through unchanged.
+    ElemType *fX = X_;
+    ElemType *fY = Y_;
+    size_t fOffx = params_.offBX;
+    size_t fOffy = params_.offCY;
+    int fIncx = params_.incx;
+    int fIncy = params_.incy;
+
+    // Row-major input is passed to the reference as column-major with both
+    // vectors conjugated, the triangle swapped and X and Y exchanged.
+    if (order != clblasColumnMajor)
+    {
+        doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
+        doConjugate( (Y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );
+        order = clblasColumnMajor;
+        fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
+
+        fX = Y_;
+        fOffx = params_.offCY;
+        fIncx = params_.incy;
+
+        fY = X_;
+        fOffy = params_.offBX;
+        fIncy = params_.incx;
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::her2(order, fUplo, params_.N, alpha_, fX, fOffx, fIncx, fY,
+                    fOffy, fIncy, A_, params_.offa, lda);
+    elapsed = getCurrentTime() - start;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS HER2 implementation.
+ *
+ * The device copy of A is first rewritten from backA_. With TIMING defined
+ * (always the case here, see the #define below) the routine is enqueued
+ * 20 times between two clFinish() calls and the average time per call is
+ * returned. Returns NANOTIME_ERR when any OpenCL or clBLAS call fails.
+ */
+template <typename ElemType> nano_time_t
+Her2PerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
+                                  ((params_.N * params_.lda) + params_.offa) *
+                                  sizeof(ElemType), backA_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+
+    // The write event is not needed past this point; release it on both the
+    // success and the failure path so that it does not leak.
+    clReleaseEvent(event);
+    event = NULL;
+
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::her2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx,
+				mobjY_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HER2 function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    // In this mode synchronization relies on the queue-wide clFinish() below,
+    // so the per-call event can be released right away instead of being
+    // overwritten (and leaked) by the next iteration.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+
+    // A failed clFinish() means the measured interval is meaningless.
+    status = clFinish( queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(HER2, cher2)
+{
+    // Run the HER2 performance case instantiated for FloatComplex elements.
+    TestParams testParams;
+
+    getParams(&testParams);
+    Her2PerformanceTest<FloatComplex>::runInstance(FN_CHER2, &testParams);
+}
+
+TEST_P(HER2, zher2)
+{
+    // Run the HER2 performance case instantiated for DoubleComplex elements.
+    TestParams testParams;
+
+    getParams(&testParams);
+    Her2PerformanceTest<DoubleComplex>::runInstance(FN_ZHER2, &testParams);
+}
diff --git a/src/tests/performance/perf-her2k.cpp b/src/tests/performance/perf-her2k.cpp
new file mode 100644
index 0000000..d959cb0
--- /dev/null
+++ b/src/tests/performance/perf-her2k.cpp
@@ -0,0 +1,353 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <her2k.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome of a performance run through gtest: a negative value is
+ * asserted as a fatal error, any other non-zero value is recorded as a
+ * failed expectation.
+ */
+#define CHECK_RESULT(ret)                                                   \
+    do {                                                                    \
+        ASSERT_GE(ret, 0)                                                   \
+            << "Fatal error: can not allocate resources or "                \
+               "perform an OpenCL request!" << endl;                        \
+        EXPECT_EQ(0, ret)                                                   \
+            << "The OpenCL version is slower in the case" << endl;          \
+    } while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the HER2K routine, parameterized by element type.
+ */
+template <typename ElemType> class Her2kPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~Her2kPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for the given function and parameters, run it unless
+     * the device capabilities or its memory rule that out, and report the
+     * result through gtest assertions.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        Her2kPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *const device = clMath::BlasBase::getInstance();
+        const int opFactor = 8;
+
+        // The ZHER2K variant is skipped on devices without double precision.
+        const bool wantsDouble = (fn == FN_ZHER2K);
+        if (wantsDouble && !device->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        // A run skipped for lack of resources keeps ret at 0, so the checks
+        // below pass.
+        int ret = 0;
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        CHECK_RESULT(ret);
+    }
+
+private:
+    Her2kPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;            // scalar multiplier, set in prepare()
+    ElemType beta_;             // scalar multiplier, set in prepare()
+    ElemType *A_;               // host matrix A
+    ElemType *B_;               // host matrix B
+    ElemType *C_;               // host working copy of C for the reference run
+    ElemType *backC_;           // host matrix C as generated in prepare()
+    cl_mem mobjA_;              // device buffer for A
+    cl_mem mobjB_;              // device buffer for B
+    cl_mem mobjC_;              // device buffer for C
+    ::clMath::BlasBase *base_;  // shared BlasBase instance
+};
+
+/*
+ * Copy the test parameters and allocate the host buffers for A, B, C and the
+ * backup of C. All three device buffer handles start out as NULL: the
+ * destructor and prepare() test them, so none may be left uninitialized.
+ */
+template <typename ElemType>
+Her2kPerformanceTest<ElemType>::Her2kPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)((params->N * params->N * params->K)) ),
+                        params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL)
+{
+    A_ = new ElemType[params_.rowsA * params_.columnsA];
+    B_ = new ElemType[params_.rowsB * params_.columnsB];
+    C_ = new ElemType[params_.rowsC * params_.columnsC];
+    backC_ = new ElemType[params_.rowsC * params_.columnsC];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host buffers and release the device buffers whose handles are set.
+ */
+template <typename ElemType>
+Her2kPerformanceTest<ElemType>::~Her2kPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so the host buffers need no guard.
+    delete[] A_;
+    delete[] B_;
+    delete[] C_;
+    delete[] backC_;
+
+    // Device handles are released only when they are non-NULL.
+    if (mobjC_ != NULL) {
+        clReleaseMemObject(mobjC_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+    if (mobjB_ != NULL) {
+        clReleaseMemObject(mobjB_);
+    }
+}
+
+/*
+ * Check whether the available OpenCL resources are sufficient to run the
+ * test case: the combined size of the two N x K operands and the N x N
+ * result must stay below both half of the available global memory and the
+ * maximum single allocation size.
+ */
+template <typename ElemType> bool
+Her2kPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    const size_t n = params->N;
+    const size_t k = params->K;
+
+    if ((A_ == NULL) || (B_ == NULL) || (backC_ == NULL) || (C_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *device = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)device->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)device->maxMemAllocSize();
+
+    // Limit is the smaller of half the global memory and one allocation.
+    const size_t halfGmem = gmemSize / 2;
+    const size_t maxMatrSize = (allocSize < halfGmem) ? allocSize : halfGmem;
+
+    const size_t bytesOperands = 2 * n * k * sizeof(ElemType);
+    const size_t bytesResult = n * n * sizeof(ElemType);
+
+    return (bytesOperands + bytesResult) < maxMatrSize;
+}
+
+/*
+ * Generate random input data and create the device buffers for A, B and C.
+ *
+ * Returns 0 when all three buffers were created and -1 as soon as one of
+ * them could not be. Bailing out early guarantees that a buffer handle is
+ * never tested before it has been assigned in this function.
+ */
+template <typename ElemType> int
+Her2kPerformanceTest<ElemType>::prepare(void)
+{
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    beta_ = convertMultiplier<ElemType>(params_.beta);
+
+    // B is generated with the opposite (conjugate) transposition of A.
+    clblasTranspose ftransB = (params_.transA==clblasNoTrans)? clblasConjTrans: clblasNoTrans;
+
+    randomGemmMatrices<ElemType>(params_.order, params_.transA, ftransB,
+                                params_.N, params_.N, params_.K, true, &alpha_, A_, params_.lda,
+                                B_, params_.ldb, true, &beta_, backC_, params_.ldc);
+
+
+    mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA *
+                                        sizeof(ElemType),
+                                        params_.offA * sizeof(ElemType),
+                                        CL_MEM_READ_ONLY);
+    if (mobjA_ == NULL) {
+        return -1;
+    }
+
+    mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB *
+                                        sizeof(ElemType),
+                                        params_.offBX * sizeof(ElemType),
+                                        CL_MEM_READ_ONLY);
+    if (mobjB_ == NULL) {
+        return -1;
+    }
+
+    mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC *
+                                        sizeof(ElemType),
+                                        params_.offCY * sizeof(ElemType),
+                                        CL_MEM_READ_WRITE);
+
+    return (mobjC_ != NULL) ? 0 : -1;
+}
+
+/*
+ * Time one call of the reference HER2K implementation. C_ is restored from
+ * backC_ first. The reference is only invoked when PERF_TEST_WITH_ACML is
+ * defined; otherwise 0 is returned. Row-major input is rejected unless
+ * PERF_TEST_WITH_ROW_MAJOR is defined.
+ */
+template <typename ElemType> nano_time_t
+Her2kPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    clblasOrder order = params_.order;
+    clblasUplo fUplo = params_.uplo;
+    clblasTranspose fTransA = params_.transA;
+    ElemType fAlpha = alpha_;
+
+    // Row-major input is handed to the column-major reference with alpha
+    // conjugated and both the transposition and the triangle flipped.
+    if (order != clblasColumnMajor)
+    {
+        CIMAG( fAlpha ) *= -1.0;
+        fTransA = (params_.transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;
+        fUplo   = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
+    }
+
+
+#ifdef PERF_TEST_WITH_ACML
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::her2k(clblasColumnMajor, fUplo, fTransA, params_.N, params_.K, fAlpha,
+                        A_, 0, params_.lda, B_, 0, params_.ldb, CREAL( beta_), C_, 0, params_.ldc);
+    elapsed = getCurrentTime() - start;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS HER2K implementation.
+ *
+ * The device copy of C is first rewritten from backC_. With TIMING defined
+ * (always the case here, see the #define below) the routine is enqueued
+ * 20 times between two clFinish() calls and the average time per call is
+ * returned. Returns NANOTIME_ERR when any OpenCL or clBLAS call fails.
+ */
+template <typename ElemType> nano_time_t
+Her2kPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  params_.rowsC * params_.columnsC *
+                                  sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix C buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+
+    // The write event is not needed past this point; release it on both the
+    // success and the failure path so that it does not leak.
+    clReleaseEvent(event);
+    event = NULL;
+
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    #define TIMING
+    #ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+    #endif
+
+    status = (cl_int)clMath::clblas::her2k(params_.order, params_.uplo, params_.transA, params_.N, params_.K, alpha_,
+                        mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, CREAL(beta_), mobjC_,
+                        params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HER2K function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    #ifdef TIMING
+    // In this mode synchronization relies on the queue-wide clFinish() below,
+    // so the per-call event can be released right away instead of being
+    // overwritten (and leaked) by the next iteration.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+
+    // A failed clFinish() means the measured interval is meaningless.
+    status = clFinish( queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+    #else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+
+    #endif
+
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(HER2K, cher2k)
+{
+    // Run the HER2K performance case instantiated for FloatComplex elements.
+    TestParams testParams;
+
+    getParams(&testParams);
+    Her2kPerformanceTest<FloatComplex>::runInstance(FN_CHER2K, &testParams);
+}
+
+TEST_P(HER2K, zher2k)
+{
+    // Run the HER2K performance case instantiated for DoubleComplex elements.
+    TestParams testParams;
+
+    getParams(&testParams);
+    Her2kPerformanceTest<DoubleComplex>::runInstance(FN_ZHER2K, &testParams);
+}
diff --git a/src/tests/performance/perf-herk.cpp b/src/tests/performance/perf-herk.cpp
new file mode 100644
index 0000000..ede835e
--- /dev/null
+++ b/src/tests/performance/perf-herk.cpp
@@ -0,0 +1,345 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <herk.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * HERK performance test case; runInstance() is what the TEST_P cases call.
+ */
+template <typename ElemType> class HerkPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~HerkPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for fn/params and run it unless the device lacks
+     * double precision (FN_ZHERK only) or memory is insufficient.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        HerkPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *dev = clMath::BlasBase::getInstance();
+        const int opFactor = 8;
+        int ret = 0;
+
+        if ((fn == FN_ZHERK) && !dev->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // ret stays 0 when the case was skipped, so both checks pass.
+        CHECK_RESULT(ret);
+    }
+
+private:
+    HerkPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType beta_;
+    ElemType *A_;       // host copy of matrix A
+    ElemType *C_;       // host working copy of matrix C
+    ElemType *backC_;   // host backup that C_ and mobjC_ are restored from
+    cl_mem mobjA_;
+    cl_mem mobjC_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+HerkPerformanceTest<ElemType>::HerkPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)((params->N * params->N
+                                            * params->K) / 2) ),   // problem size passed to the base: N*N*K/2
+                        params_(*params), mobjA_(NULL), mobjC_(NULL)
+{
+    A_ = new ElemType[params_.rowsA * params_.columnsA];
+    C_ = new ElemType[params_.rowsC * params_.columnsC];
+    backC_ = new ElemType[params_.rowsC * params_.columnsC];
+    // alpha_ and beta_ are not set here; prepare() assigns them.
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Destructor: free the host matrices and release any device buffers
+ * that prepare() managed to create. Buffers that were never created
+ * are still NULL and are skipped.
+ */
+template <typename ElemType>
+HerkPerformanceTest<ElemType>::~HerkPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so the host arrays need no
+    // guards.
+    delete[] A_;
+    delete[] C_;
+    delete[] backC_;
+
+    // The cl_mem handles are NULL-initialized by the constructor and only
+    // set by prepare(); release just the ones that exist.
+    if (mobjC_ != NULL) {
+        clReleaseMemObject(mobjC_);
+    }
+
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+HerkPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    const size_t n = params->N;
+    const size_t k = params->K;
+
+    // Nothing can run without the host-side matrices.
+    if ((A_ == NULL) || (backC_ == NULL) || (C_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+
+    // Budget: half of availGlobalMemSize(0), capped by maxMemAllocSize().
+    const size_t maxMatrSize = std::min(gmemSize / 2, allocSize);
+
+    // Combined footprint, in bytes, of A (counted as n x k elements) and
+    // C (n x n elements). The sum, not each matrix separately, is compared
+    // against the budget.
+    const size_t bytesA = n * k * sizeof(ElemType);
+    const size_t bytesC = n * n * sizeof(ElemType);
+
+    return (bytesA + bytesC) < maxMatrSize;
+
+}
+
+// Generate A and C, back up C, create the device buffers. Returns 0 or -1.
+template <typename ElemType> int
+HerkPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t bytesA = params_.rowsA * params_.columnsA * sizeof(ElemType);
+    const size_t bytesC = params_.rowsC * params_.columnsC * sizeof(ElemType);
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    beta_ = convertMultiplier<ElemType>(params_.beta);
+    randomGemmMatrices<ElemType>(params_.order, params_.transA, clblasNoTrans,
+        params_.N, params_.N, params_.K, true, &alpha_, A_, params_.lda,
+        NULL, 0, true, &beta_, C_, params_.ldc);
+    // backC_ seeds mobjC_ below and is copied over C_ and mobjC_ before each
+    // run, so it must hold the generated matrix, not uninitialized memory.
+    memcpy(backC_, C_, bytesC);
+
+    mobjA_ = base_->createEnqueueBuffer(A_, bytesA, params_.offA * sizeof(ElemType),
+                                        CL_MEM_READ_ONLY);
+    if (mobjA_) {  // otherwise mobjC_ stays NULL and -1 is returned
+        mobjC_ = base_->createEnqueueBuffer(backC_, bytesC, params_.offCY * sizeof(ElemType),
+                                            CL_MEM_READ_WRITE);
+    }
+    return (mobjC_) ? 0 : -1;
+}
+
+/*
+ * Time one host-side reference run (clMath::blas::herk) on C_. Returns 0
+ * when built without PERF_TEST_WITH_ACML.
+ */
+template <typename ElemType> nano_time_t
+HerkPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    // Start every measurement from the same C contents.
+    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+#ifdef PERF_TEST_WITH_ACML
+    clblasUplo fUplo = params_.uplo;
+    clblasTranspose fTransA = params_.transA;
+
+    // The reference is always called as column-major; for any other order
+    // the triangle is swapped and NoTrans becomes ConjTrans (else NoTrans).
+    if (params_.order != clblasColumnMajor) {
+        fUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
+        fTransA = (params_.transA == clblasNoTrans) ? clblasConjTrans
+                                                    : clblasNoTrans;
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::herk(clblasColumnMajor, fUplo, fTransA, params_.N, params_.K,
+                       CREAL(alpha_), A_, params_.lda, CREAL(beta_), C_,
+                       params_.ldc);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+// Restore C on the device, then time HERK; with TIMING defined the result is
+// the average over 20 back-to-back calls. Returns NANOTIME_ERR on failure.
+template <typename ElemType> nano_time_t
+HerkPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  params_.rowsC * params_.columnsC *
+                                  sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix C buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The event returned by the write is owned by this function; release it
+    // rather than just overwriting the handle, which would leak it.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    #define TIMING
+    #ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+    #endif
+    status = (cl_int)clMath::clblas::herk(params_.order,
+        params_.uplo, params_.transA, params_.N, params_.K, CREAL(alpha_),
+        mobjA_, params_.offA, params_.lda, CREAL(beta_), mobjC_, params_.offCY,
+        params_.ldc, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HERK function failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    #ifdef TIMING
+    // clFinish() below waits for completion, so the per-call event is
+    // released here instead of being overwritten by the next iteration.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+    #else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+    #endif
+
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(HERK, cherk)
+{
+    TestParams params;
+    // Run the HERK performance case for FloatComplex with the parameters from getParams().
+    getParams(&params);
+    HerkPerformanceTest<FloatComplex>::runInstance(FN_CHERK, &params);
+}
+
+TEST_P(HERK, zherk)
+{
+    TestParams params;
+    // Run the HERK performance case for DoubleComplex with the parameters from getParams().
+    getParams(&params);
+    HerkPerformanceTest<DoubleComplex>::runInstance(FN_ZHERK, &params);
+}
diff --git a/src/tests/performance/perf-hpmv.cpp b/src/tests/performance/perf-hpmv.cpp
new file mode 100644
index 0000000..e32e136
--- /dev/null
+++ b/src/tests/performance/perf-hpmv.cpp
@@ -0,0 +1,346 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Hpmv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <hpmv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * HPMV performance test case; runInstance() is what the TEST_P cases call.
+ */
+template <typename ElemType> class HpmvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~HpmvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for fn/params and run it. The case is skipped when
+     * the device lacks double precision (FN_ZHPMV only) or when
+     * areResourcesSufficient() reports too little memory.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        HpmvPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *dev = clMath::BlasBase::getInstance();
+        const int opFactor = 1; //FIX-ME
+
+        if ((fn == FN_ZHPMV) && !dev->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        const int ret = perfCase.run(opFactor);
+        // ret < 0 is fatal; ret > 0 is reported as the OpenCL version being slower.
+        CHECK_RESULT(ret);
+    }
+
+private:
+    HpmvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    // Host data: AP_ is the packed matrix, backY_ the backup of Y_.
+    ElemType *AP_;
+    ElemType *X_;
+    ElemType *Y_;
+    ElemType *backY_;
+    // Device buffers created by prepare().
+    cl_mem mobjAP_;
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    ElemType alpha, beta;
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Allocate the host buffers (packed A, x, y and a backup of y) and convert
+ * alpha/beta to ElemType. All device buffer handles start out NULL.
+ */
+template <typename ElemType>
+HpmvPerformanceTest<ElemType>::HpmvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( ((2 * (( params->N * (params->N)) + params->N)) ) * sizeof(ElemType) ) ) ),
+    params_(*params), mobjAP_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    const size_t lenX = 1 + (params_.N-1) * abs(params_.incx) + params_.offBX;
+    const size_t lenY = 1 + (params_.N-1) * abs(params_.incy) + params_.offCY;
+    AP_ = new ElemType[((params_.N * (params_.N + 1)) / 2 ) + params_.offA];
+    X_ = new ElemType[lenX];
+    Y_ = new ElemType[lenY];
+    backY_ = new ElemType[lenY];
+    alpha = convertMultiplier<ElemType>(params_.alpha);
+    beta  = convertMultiplier<ElemType>(params_.beta);
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Destructor: free the host buffers and release whichever device buffers
+ * were created; handles that were never created are still NULL.
+ */
+template <typename ElemType>
+HpmvPerformanceTest<ElemType>::~HpmvPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so the host arrays need no guards.
+    delete[] AP_;
+    delete[] X_;
+    delete[] backY_;
+    delete[] Y_;
+
+    // The constructor sets all three handles to NULL; prepare() fills them
+    // in, so only the ones that were actually created are released.
+    if (mobjAP_ != NULL) {
+        clReleaseMemObject(mobjAP_);
+    }
+
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+HpmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    const size_t n = params->N;
+    const size_t bytesAP = sizeof(ElemType) * ((n * (n + 1)) / 2);
+    const size_t bytesX = (1 + (n - 1) * abs(params->incx)) * sizeof(ElemType);
+    const size_t bytesY = (1 + (n - 1) * abs(params->incy)) * sizeof(ElemType);
+    if ((AP_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+
+    // Each device buffer must fit in a single allocation. y is included
+    // because prepare() creates it as a buffer of its own, like AP and x.
+    const bool eachFits = (bytesAP < allocSize) && (bytesX < allocSize) && (bytesY < allocSize);
+    // Together the three buffers must also fit in the available global memory.
+    return eachFits && ((bytesAP + bytesX + bytesY) < gmemSize);
+}
+
+// Fill AP_/X_/Y_ with random data, back Y_ up and create the three device
+// buffers. Returns 0 if all buffers were created, -1 otherwise.
+template <typename ElemType> int
+HpmvPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t N = params_.N;
+    const size_t bytesAP = (((N * (N + 1)) / 2 ) + params_.offA) * sizeof(*AP_);
+    const size_t bytesX = (1 + (N-1) * abs(params_.incx) + params_.offBX) * sizeof(*X_);
+    const size_t bytesY = (1 + (N-1) * abs(params_.incy) + params_.offCY) * sizeof(*Y_);
+
+    randomHemvMatrices(params_.order, params_.uplo, N, true, &alpha, (AP_ + params_.offA), params_.lda,
+                        (X_ + params_.offBX), params_.incx, true, &beta, (Y_ + params_.offCY), params_.incy);
+    memcpy(backY_, Y_, bytesY);
+
+    mobjAP_ = base_->createEnqueueBuffer(AP_, bytesAP, 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, bytesX, 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(Y_, bytesY, 0, CL_MEM_READ_WRITE);
+    return ( (mobjAP_ != NULL) &&  (mobjX_ != NULL) && (mobjY_ != NULL) ) ? 0 : -1;
+}
+
+// Time one host-side reference run; returns 0 without PERF_TEST_WITH_ACML.
+template <typename ElemType> nano_time_t
+HpmvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+#ifdef PERF_TEST_WITH_ACML
+    clblasOrder order = params_.order;
+    clblasUplo fUplo = params_.uplo;
+
+    if (order != clblasColumnMajor) {
+        // Run the reference as column-major: swap the triangle and conjugate
+        // the packed matrix. AP_ holds N*(N+1)/2 elements, so it is conjugated
+        // exactly once, as a flat vector.
+        // NOTE(review): assumes doConjugate(ptr, rows, cols, ld) visits
+        // rows x cols elements - confirm against its definition.
+        order = clblasColumnMajor;
+        fUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
+        doConjugate((AP_ + params_.offA), ((params_.N * (params_.N + 1)) / 2), 1, 1);
+    }
+    time = getCurrentTime();
+    clMath::blas::hpmv(order, fUplo, params_.N, alpha, AP_, params_.offA,
+                       X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
+    time = getCurrentTime() - time;
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+// Restore y on the device, then time HPMV; with TIMING defined the result is
+// the average over 20 back-to-back calls. Returns NANOTIME_ERR on failure.
+template <typename ElemType> nano_time_t
+HpmvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    // size_t, as in prepare(), so a long vector is not narrowed to int.
+    const size_t lenY = 1 + (params_.N-1) * abs(params_.incy);
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  (lenY + params_.offCY )* sizeof(ElemType), backY_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector Y buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The event returned by the write is owned by this function; release it
+    // rather than just overwriting the handle, which would leak it.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+#define TIMING
+#ifdef TIMING
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::hpmv(params_.order, params_.uplo, params_.N, alpha, mobjAP_, params_.offA,
+                    mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy,
+                    1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HPMV function failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    // clFinish() below waits for completion, so the per-call event is
+    // released here instead of being overwritten by the next iteration.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+
+TEST_P(HPMV, chpmv)
+{
+    TestParams params;
+    // Run the HPMV performance case for FloatComplex with the parameters from getParams().
+    getParams(&params);
+    HpmvPerformanceTest<FloatComplex>::runInstance(FN_CHPMV, &params);
+}
+
+TEST_P(HPMV, zhpmv)
+{
+    TestParams params;
+    // Run the HPMV performance case for DoubleComplex with the parameters from getParams().
+    getParams(&params);
+    HpmvPerformanceTest<DoubleComplex>::runInstance(FN_ZHPMV, &params);
+}
+
diff --git a/src/tests/performance/perf-hpr.cpp b/src/tests/performance/perf-hpr.cpp
new file mode 100644
index 0000000..ea990e2
--- /dev/null
+++ b/src/tests/performance/perf-hpr.cpp
@@ -0,0 +1,319 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <hpr.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * HPR performance test case; runInstance() is what the TEST_P cases call.
+ */
+template <typename ElemType> class HprPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~HprPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for fn/params and run it unless the device lacks
+     * double precision (FN_ZHPR only) or memory is insufficient.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        HprPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *dev = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+        int ret = 0;
+
+        if ((fn == FN_ZHPR) && !dev->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // ret stays 0 when the case was skipped, so both checks pass.
+        CHECK_RESULT(ret);
+    }
+
+private:
+    HprPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType *AP_;       // host copy of the packed matrix
+    ElemType *X_;        // host copy of vector x
+    ElemType *backAP_;   // host backup that mobjAP_ is restored from
+    cl_mem mobjAP_;
+    cl_mem mobjX_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+HprPerformanceTest<ElemType>::HprPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2 ) * sizeof(ElemType))),
+                          params_(*params), mobjAP_(NULL), mobjX_(NULL)
+{
+    const size_t lenAP = ( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa;  // packed length plus offset
+    AP_ = new ElemType[lenAP];
+    X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx)  + params_.offBX];
+    backAP_ = new ElemType[lenAP];
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Destructor: free the host buffers and release whichever device buffers
+ * were created; handles that were never created are still NULL.
+ */
+template <typename ElemType>
+HprPerformanceTest<ElemType>::~HprPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so the host arrays need no
+    // guards.
+    delete[] AP_;
+    delete[] backAP_;
+    delete[] X_;
+
+    // The constructor sets both handles to NULL; prepare() fills them in,
+    // so only the ones that were actually created are released.
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+
+    if (mobjAP_ != NULL) {
+        clReleaseMemObject(mobjAP_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+HprPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    const size_t n = params->N;
+    const size_t elemsAP = (n * (n + 1)) / 2;
+    const size_t elemsX = 1 + (n - 1) * abs(params->incx);
+
+    if ((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+
+    // The packed matrix and the x vector must each fit in one allocation ...
+    const bool eachFits = (sizeof(ElemType) * elemsAP < allocSize) &&
+                          (elemsX * sizeof(ElemType) < allocSize);
+    // ... and the packed matrix plus twice the x vector must fit in global memory.
+    return eachFits && (((elemsAP + elemsX * 2) * sizeof(ElemType)) < gmemSize);
+}
+
+// Randomize AP_/X_, back AP_ up and create both device buffers (0 on success, -1 on failure).
+template <typename ElemType> int
+HprPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t bytesAP = ((( params_.N*( params_.N + 1 ) )/2 ) + params_.offa) * sizeof(ElemType);
+    const size_t bytesX = (1 + (params_.N-1) * abs(params_.incx) + params_.offBX) * sizeof(ElemType);
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    randomHerMatrices( params_.order, params_.uplo, params_.N, &alpha_, (AP_ + params_.offa), 0, (X_ + params_.offBX), params_.incx );
+    memcpy(backAP_, AP_, bytesAP);
+    mobjAP_ = base_->createEnqueueBuffer(AP_, bytesAP, 0, CL_MEM_READ_WRITE);
+    mobjX_ = base_->createEnqueueBuffer(X_, bytesX, 0, CL_MEM_READ_ONLY);
+    return ( (mobjAP_ != NULL) &&  (mobjX_ != NULL) ) ? 0 : -1;
+}
+
+/*
+ * Time one host-side reference run (clMath::blas::hpr). Returns 0 when
+ * built without PERF_TEST_WITH_ACML, or NANOTIME_ERR for row-major input
+ * when PERF_TEST_WITH_ROW_MAJOR is not defined.
+ */
+template <typename ElemType> nano_time_t
+HprPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+#ifdef PERF_TEST_WITH_ACML
+    clblasOrder fOrder = params_.order;
+    clblasUplo fUplo = params_.uplo;
+
+    // The reference is driven as column-major: for any other order
+    // doConjugate() is applied to x and the opposite triangle is selected.
+    // X_ itself is passed to doConjugate(), so this is repeated on every
+    // call of this method.
+    if (params_.order != clblasColumnMajor) {
+        doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
+        fOrder = clblasColumnMajor;
+        fUplo = (params_.uplo == clblasLower) ? clblasUpper : clblasLower;
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::hpr(fOrder, fUplo, params_.N, CREAL(alpha_), X_, params_.offBX,
+                      params_.incx, AP_, params_.offa);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+// Restore AP on the device, then time HPR; with TIMING defined the result is
+// the average over 20 back-to-back calls. Returns NANOTIME_ERR on failure.
+template <typename ElemType> nano_time_t
+HprPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0,
+                                  ((( params_.N*( params_.N + 1 ) )/2 ) + params_.offa) *
+                                  sizeof(ElemType), backAP_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The event returned by the write is owned by this function; release it
+    // rather than just overwriting the handle, which would leak it.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::hpr(params_.order, params_.uplo, params_.N, CREAL(alpha_), mobjX_, params_.offBX, params_.incx,
+                mobjAP_, params_.offa, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HPR function failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    // Completion is awaited by clFinish() below; release the per-call event now.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(HPR, chpr)
+{
+    TestParams params;
+    // Run the HPR performance case for FloatComplex with the parameters from getParams().
+    getParams(&params);
+    HprPerformanceTest<FloatComplex>::runInstance(FN_CHPR, &params);
+}
+
+TEST_P(HPR, zhpr)
+{
+    TestParams params;
+    // Run the HPR performance case for DoubleComplex with the parameters from getParams().
+    getParams(&params);
+    HprPerformanceTest<DoubleComplex>::runInstance(FN_ZHPR, &params);
+}
diff --git a/src/tests/performance/perf-hpr2.cpp b/src/tests/performance/perf-hpr2.cpp
new file mode 100644
index 0000000..a1d6ad7
--- /dev/null
+++ b/src/tests/performance/perf-hpr2.cpp
@@ -0,0 +1,350 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Hpr2 performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <new>                  // std::bad_alloc
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <hpr2.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)   /* ret < 0 aborts the test; ret != 0 records a failure */ \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * HPR2 performance test case. Instances are created only through
+ * runInstance(), which also decides whether the case can run at all.
+ */
+template <typename ElemType> class Hpr2PerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~Hpr2PerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    // Runs one HPR2 case for `fn`; returns early with a notice when the case has to be skipped.
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        Hpr2PerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        int opFactor = 1;
+
+        if ((fn == FN_ZHPR2) &&
+            !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        int ret = perfCase.run(opFactor);
+
+        // A negative result is fatal for the test; any other non-zero result is a recorded failure.
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    Hpr2PerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // copy of the parameters handed to runInstance()
+    ElemType alpha_;            // scalar multiplier, assigned in prepare()
+    ElemType *AP_;              // host packed matrix
+    ElemType *X_;               // host vector X
+    ElemType *Y_;               // host vector Y
+    ElemType *backAP_;          // pristine copy of AP_, used to reset the device buffer
+    cl_mem mobjAP_;             // device buffers backing AP_, X_ and Y_
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+Hpr2PerformanceTest<ElemType>::Hpr2PerformanceTest(BlasFunction fn, TestParams *params) :
+    PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + (params->N)) * 3 ) * sizeof(ElemType))),
+    params_(*params), AP_(NULL), X_(NULL), Y_(NULL), backAP_(NULL),
+    mobjAP_(NULL), mobjX_(NULL), mobjY_(NULL), base_(::clMath::BlasBase::getInstance())
+{
+    try {
+        AP_ = new ElemType[((params_.N * (params_.N + 1))/2) + params_.offa];
+        X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX];
+        Y_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY];
+        backAP_ = new ElemType[((params_.N * (params_.N + 1))/2) + params_.offa];
+    } catch (const std::bad_alloc &) { /* unreached pointers stay NULL; areResourcesSufficient() then rejects the run */ }
+}
+
+/*
+ * Destructor: gives back the host arrays and the OpenCL buffers owned by
+ * the test case.
+ */
+template <typename ElemType>
+Hpr2PerformanceTest<ElemType>::~Hpr2PerformanceTest()
+{
+    /*
+     * Host-side arrays. delete[] on a null pointer is a no-op, so they
+     * can be freed unconditionally.
+     */
+    delete[] AP_;
+    delete[] backAP_;
+    delete[] X_;
+    delete[] Y_;
+
+    /*
+     * Device-side buffers. Each handle is NULL until prepare() assigns it,
+     * so release only the ones that are set.
+     */
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+
+    if (mobjAP_ != NULL) {
+        clReleaseMemObject(mobjAP_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+Hpr2PerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // A NULL host array means it was never successfully allocated; nothing can be run then.
+    if ((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL) || (Y_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize(0);
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+    const size_t n = params->N;
+
+    // Byte sizes of the three device buffers, leading offsets included, matching what prepare() creates.
+    const size_t sizeAP = (((n * (n + 1)) / 2) + params->offa) * sizeof(ElemType);
+    const size_t sizeX = (1 + (n - 1) * abs(params->incx) + params->offBX) * sizeof(ElemType);
+    const size_t sizeY = (1 + (n - 1) * abs(params->incy) + params->offCY) * sizeof(ElemType);
+
+    // Each buffer must fit a single allocation, and all three together the global memory.
+    return (sizeAP < allocSize) && (sizeX < allocSize) && (sizeY < allocSize) &&
+           ((sizeAP + sizeX + sizeY) < gmemSize);
+}
+
+template <typename ElemType> int
+Hpr2PerformanceTest<ElemType>::prepare(void)
+{
+    // Element counts: elemsAP includes its offset, the two vector lengths do not.
+    const size_t elemsAP = ((params_.N * (params_.N + 1)) / 2) + params_.offa;
+    const size_t lenX = 1 + (params_.N - 1) * abs(params_.incx);
+    const size_t lenY = 1 + (params_.N - 1) * abs(params_.incy);
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    randomHer2Matrices<ElemType>(params_.order, params_.uplo, params_.N, &alpha_, (AP_ + params_.offa), params_.lda,
+                                 (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy);
+
+    // backAP_ keeps the generated matrix so clblasPerfSingle() can re-upload it.
+    memcpy(backAP_, AP_, elemsAP * sizeof(ElemType));
+    mobjAP_ = base_->createEnqueueBuffer(AP_, elemsAP * sizeof(*AP_), 0, CL_MEM_READ_WRITE);
+    mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX) * sizeof(*X_), 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(Y_, (lenY + params_.offCY) * sizeof(*Y_), 0, CL_MEM_READ_ONLY);
+
+    return ((mobjAP_ == NULL) || (mobjX_ == NULL) || (mobjY_ == NULL)) ? -1 : 0;
+}
+
+template <typename ElemType> nano_time_t
+Hpr2PerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+	clblasOrder order;
+    clblasUplo fUplo;
+    nano_time_t time = 0;       // returned unchanged when PERF_TEST_WITH_ACML is not defined
+    // Row-major parameter sets are rejected unless PERF_TEST_WITH_ROW_MAJOR is defined.
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    order = params_.order;
+    fUplo = params_.uplo;
+
+#ifdef PERF_TEST_WITH_ACML
+    // Reference timing; this whole section is compiled only with PERF_TEST_WITH_ACML.
+    ElemType *fX, *fY;
+    int fIncx, fIncy;
+    size_t fOffx, fOffy;
+	fX = X_;    fOffx = params_.offBX;  fIncx = params_.incx;
+	fY = Y_;    fOffy = params_.offCY;  fIncy = params_.incy;
+    // A non-column-major case is turned into a column-major call: doConjugate() is applied to X and Y, the
+    // triangle is flipped, and X and Y trade places. NOTE(review): doConjugate() presumably works in place - confirm.
+	if (order != clblasColumnMajor)
+    {
+		doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
+        doConjugate( (Y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );
+        order = clblasColumnMajor;
+        fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
+	    fX = Y_;    fOffx = params_.offCY;  fIncx = params_.incy;
+	    fY = X_;    fOffy = params_.offBX;  fIncy = params_.incx;
+    }
+
+   	time = getCurrentTime();
+   	clMath::blas::hpr2(order, fUplo, params_.N, alpha_, fX, fOffx, fIncx, fY,
+					    fOffy, fIncy, AP_, params_.offa);
+	time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+// Times `iter` back-to-back hpr2 calls on the first command queue and returns the mean time per call.
+template <typename ElemType> nano_time_t
+Hpr2PerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    // Reset the device copy of AP from backAP_ before timing.
+    status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0,
+                                  (((params_.N * (params_.N + 1))/2) + params_.offa) *
+                                    sizeof(ElemType), backAP_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " <<
+                 status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);      // this event is no longer needed, whatever the wait returned
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::hpr2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx,
+				mobjY_, params_.offCY, params_.incy, mobjAP_, params_.offa, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS HPR2 function failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    if (event != NULL) {
+        clReleaseEvent(event);  // clFinish() below synchronises; do not leak one event per call
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(HPR2, chpr2)
+{
+    TestParams params;
+    getParams(&params);
+    // The function id must name the routine being timed: CHPR2, not CHER2.
+    Hpr2PerformanceTest<FloatComplex>::runInstance(FN_CHPR2, &params);
+}
+
+TEST_P(HPR2, zhpr2)
+{
+    // Double-precision complex HPR2 performance case.
+    TestParams testParams;
+    getParams(&testParams);
+    Hpr2PerformanceTest<DoubleComplex>::runInstance(FN_ZHPR2, &testParams);
+}
diff --git a/src/tests/performance/perf-iamax.cpp b/src/tests/performance/perf-iamax.cpp
new file mode 100644
index 0000000..33202bf
--- /dev/null
+++ b/src/tests/performance/perf-iamax.cpp
@@ -0,0 +1,303 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <iamax.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)   /* ret < 0 aborts the test; ret != 0 records a failure */ \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * iAMAX performance test case. The constructor is private, so instances
+ * are created through runInstance() only.
+ */
+template <typename ElemType> class iAmaxPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~iAmaxPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    // Runs one case for `fn`, or prints a skip notice and returns when the case cannot run.
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        iAmaxPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        int opFactor = 1;
+
+        const bool needsDouble = (fn == FN_iDAMAX) || (fn == FN_iZAMAX);
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        // A negative result is fatal for the test; any other non-zero result is a recorded failure.
+        int ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    iAmaxPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    // Test state.
+    TestParams params_;             // copy of the parameters passed in
+    ElemType *blasX_;               // host vector X (NULL if allocation failed)
+    cl_mem mobjX_;                  // device copy of X
+    cl_mem mobjiAMAX_;              // device buffer passed to iamax with offa (presumably the result)
+    cl_mem scratchBuff;             // device scratch buffer handed to iamax
+    size_t  lengthX;                // elements spanned by X, excluding offBX
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Sizes and allocates the host vector. If the allocation throws, blasX_
+ * stays NULL and areResourcesSufficient() rejects the run.
+ */
+template <typename ElemType>
+iAmaxPerformanceTest<ElemType>::iAmaxPerformanceTest(BlasFunction fn, TestParams *params) :
+    PerformanceTest(fn, (problem_size_t)((params->N) * sizeof(ElemType))),
+    params_(*params),
+    blasX_(NULL),
+    mobjX_(NULL),
+    mobjiAMAX_(NULL),
+    scratchBuff(NULL),
+    lengthX(1 + (params->N - 1) * abs(params_.incx)),
+    base_(::clMath::BlasBase::getInstance())
+{
+    try {
+        blasX_ = new ElemType[lengthX + params_.offBX];
+    }
+    catch (const bad_alloc &) {
+        // Nothing was allocated; the device handles keep their NULL initial values.
+        blasX_ = NULL;
+    }
+}
+
+/*
+ * Frees the host vector and releases the device buffers that are set.
+ */
+template <typename ElemType>
+iAmaxPerformanceTest<ElemType>::~iAmaxPerformanceTest()
+{
+    // delete[] is a no-op on NULL, so the host array needs no guard.
+    delete[] blasX_;
+
+    // Release order: X, iAMAX buffer, scratch. Handles that are still NULL are skipped.
+    cl_mem handles[] = { mobjX_, mobjiAMAX_, scratchBuff };
+    const size_t handleCount = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t idx = 0; idx < handleCount; idx++) {
+        if (handles[idx] == NULL) {
+            continue;
+        }
+
+        clReleaseMemObject(handles[idx]);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+iAmaxPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // The constructor leaves blasX_ NULL when the host allocation failed.
+    if (blasX_ == NULL) {
+        return false;
+    }
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+    // Byte sizes of the three device buffers that prepare() creates.
+    const size_t sizeX = (lengthX + params->offBX) * sizeof(ElemType);
+    const size_t sizeiAMAX = (1 + params->offa) * sizeof(ElemType);
+    const size_t sizeScratchBuff = (params->N * 2) * sizeof(ElemType);
+
+    // Every buffer has to fit a single allocation ...
+    if ((sizeX >= allocSize) || (sizeiAMAX >= allocSize) || (sizeScratchBuff >= allocSize)) {
+        return false;
+    }
+
+    // ... and all of them together have to fit the device's global memory.
+    return (sizeX + sizeiAMAX + sizeScratchBuff) < gmemSize;
+}
+
+template <typename ElemType> int
+iAmaxPerformanceTest<ElemType>::prepare(void)
+{
+    // Random data for X; the trailing (NULL, 0) arguments presumably mean "no second vector".
+    randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (ElemType*)NULL, 0);
+
+    const size_t bytesX = (lengthX + params_.offBX) * sizeof(ElemType);
+    mobjX_ = base_->createEnqueueBuffer(blasX_, bytesX, 0, CL_MEM_READ_WRITE);
+    mobjiAMAX_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+    scratchBuff = base_->createEnqueueBuffer(NULL, ((params_.N * 2) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+    return ((mobjX_ == NULL) || (mobjiAMAX_ == NULL) || (scratchBuff == NULL)) ? -1 : 0;
+}
+
+template <typename ElemType> nano_time_t
+iAmaxPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    // Stays 0 unless the reference implementation is compiled in.
+    nano_time_t elapsed = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    const nano_time_t started = getCurrentTime();
+    clMath::blas::iamax(params_.N, blasX_, params_.offBX, params_.incx);
+    elapsed = getCurrentTime() - started;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+// Times `iter` back-to-back iamax calls on the first command queue and returns the mean time per call.
+template <typename ElemType> nano_time_t
+iAmaxPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    // Map ElemType onto a DataType tag; anything but float, double or FloatComplex becomes TYPE_COMPLEX_DOUBLE.
+    DataType type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+										( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    clFinish( queue);
+    time = getCurrentTime();
+
+#define TIMING
+#ifdef TIMING
+    int iter = 100;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::iamax( type, params_.N, mobjiAMAX_, params_.offa,
+                                                mobjX_, params_.offBX, params_.incx,
+                                                scratchBuff, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS iAMAX function failed, status = " <<
+                    status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        if (event != NULL) {
+            clReleaseEvent(event);  // clFinish() below synchronises; do not leak one event per call
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(iAMAX, isamax)
+{
+    // iAMAX performance case for float elements.
+    TestParams testParams;
+    getParams(&testParams);
+    iAmaxPerformanceTest<float>::runInstance(FN_iSAMAX, &testParams);
+}
+
+
+TEST_P(iAMAX, idamax)
+{
+    // iAMAX performance case for double elements.
+    TestParams testParams;
+    getParams(&testParams);
+    iAmaxPerformanceTest<double>::runInstance(FN_iDAMAX, &testParams);
+}
+
+TEST_P(iAMAX, icamax)
+{
+    // iAMAX performance case for FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    iAmaxPerformanceTest<FloatComplex>::runInstance(FN_iCAMAX, &testParams);
+}
+
+
+TEST_P(iAMAX, izamax)
+{
+    // iAMAX performance case for DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    iAmaxPerformanceTest<DoubleComplex>::runInstance(FN_iZAMAX, &testParams);
+}
diff --git a/src/tests/performance/perf-nrm2.cpp b/src/tests/performance/perf-nrm2.cpp
new file mode 100644
index 0000000..7105a4a
--- /dev/null
+++ b/src/tests/performance/perf-nrm2.cpp
@@ -0,0 +1,302 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <nrm2.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)   /* ret < 0 aborts the test; ret != 0 records a failure */ \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * NRM2 performance test case. The constructor is private, so instances
+ * are created through runInstance() only.
+ */
+template <typename ElemType> class Nrm2PerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~Nrm2PerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    // Runs one case for `fn`, or prints a skip notice and returns when the case cannot run.
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        Nrm2PerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        int opFactor = 1;
+
+        const bool needsDouble = (fn == FN_DNRM2) || (fn == FN_DZNRM2);
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        // A negative result is fatal for the test; any other non-zero result is a recorded failure.
+        int ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    Nrm2PerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    // Test state.
+    TestParams params_;             // copy of the parameters passed in
+    ElemType *blasX_;               // host vector X (NULL if allocation failed)
+    cl_mem mobjX_;                  // device copy of X
+    cl_mem mobjNRM2_;               // device buffer passed to nrm2 with offa (presumably the result)
+    cl_mem scratchBuff;             // device scratch buffer handed to nrm2
+    size_t  lengthX;                // elements spanned by X, excluding offBX
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Sizes and allocates the host vector. If the allocation throws, blasX_
+ * stays NULL and areResourcesSufficient() rejects the run.
+ */
+template <typename ElemType>
+Nrm2PerformanceTest<ElemType>::Nrm2PerformanceTest(BlasFunction fn, TestParams *params) :
+    PerformanceTest(fn, (problem_size_t)((params->N) * sizeof(ElemType))),
+    params_(*params),
+    blasX_(NULL),
+    mobjX_(NULL),
+    mobjNRM2_(NULL),
+    scratchBuff(NULL),
+    lengthX(1 + (params->N - 1) * abs(params_.incx)),
+    base_(::clMath::BlasBase::getInstance())
+{
+    try {
+        blasX_ = new ElemType[lengthX + params_.offBX];
+    }
+    catch (const bad_alloc &) {
+        // Nothing was allocated; the device handles keep their NULL initial values.
+        blasX_ = NULL;
+    }
+}
+
+/*
+ * Frees the host vector and releases the device buffers that are set.
+ */
+template <typename ElemType>
+Nrm2PerformanceTest<ElemType>::~Nrm2PerformanceTest()
+{
+    // delete[] is a no-op on NULL, so the host array needs no guard.
+    delete[] blasX_;
+
+    // Release order: X, NRM2 buffer, scratch. Handles that are still NULL are skipped.
+    cl_mem handles[] = { mobjX_, mobjNRM2_, scratchBuff };
+    const size_t handleCount = sizeof(handles) / sizeof(handles[0]);
+
+    for (size_t idx = 0; idx < handleCount; idx++) {
+        if (handles[idx] == NULL) {
+            continue;
+        }
+
+        clReleaseMemObject(handles[idx]);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+Nrm2PerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // The constructor leaves blasX_ NULL when the host allocation failed.
+    if (blasX_ == NULL) {
+        return false;
+    }
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+    // Byte sizes of the three device buffers that prepare() creates.
+    const size_t sizeX = (lengthX + params->offBX) * sizeof(ElemType);
+    const size_t sizeScratch = (lengthX * 2) * sizeof(ElemType);
+    const size_t sizeNRM2 = (1 + params->offa) * sizeof(ElemType);
+
+    // Every buffer has to fit a single allocation ...
+    if ((sizeX >= allocSize) || (sizeNRM2 >= allocSize) || (sizeScratch >= allocSize)) {
+        return false;
+    }
+
+    // ... and all of them together have to fit the device's global memory.
+    return (sizeX + sizeNRM2 + sizeScratch) < gmemSize;
+}
+
+template <typename ElemType> int
+Nrm2PerformanceTest<ElemType>::prepare(void)
+{
+    // Random data for X; the (NULL, 0) arguments presumably mean "no second vector".
+    randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (ElemType*)NULL, 0, true);
+
+    const size_t bytesX = (lengthX + params_.offBX) * sizeof(ElemType);
+    mobjX_ = base_->createEnqueueBuffer(blasX_, bytesX, 0, CL_MEM_READ_WRITE);
+    mobjNRM2_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+    scratchBuff = base_->createEnqueueBuffer(NULL, ((2 * lengthX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+    return ((mobjX_ == NULL) || (mobjNRM2_ == NULL) || (scratchBuff == NULL)) ? -1 : 0;
+}
+
+template <typename ElemType> nano_time_t
+Nrm2PerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    // Stays 0 unless the reference implementation is compiled in.
+    nano_time_t elapsed = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    const nano_time_t started = getCurrentTime();
+    clMath::blas::nrm2(params_.N, blasX_, params_.offBX, params_.incx);
+    elapsed = getCurrentTime() - started;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+// Times `iter` back-to-back nrm2 calls on the first command queue and returns the mean time per call.
+template <typename ElemType> nano_time_t
+Nrm2PerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    // Map ElemType onto a DataType tag; anything but float, double or FloatComplex becomes TYPE_COMPLEX_DOUBLE.
+    DataType type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+										( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    clFinish( queue);
+    time = getCurrentTime();
+
+#define TIMING
+#ifdef TIMING
+    int iter = 100;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::nrm2( type, params_.N, mobjNRM2_, params_.offa, mobjX_, params_.offBX, params_.incx,
+                                                scratchBuff, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS NRM2 function failed, status = " <<
+                    status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        if (event != NULL) {
+            clReleaseEvent(event);  // clFinish() below synchronises; do not leak one event per call
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(NRM2, snrm2)
+{
+    // NRM2 performance case for float elements.
+    TestParams testParams;
+    getParams(&testParams);
+    Nrm2PerformanceTest<float>::runInstance(FN_SNRM2, &testParams);
+}
+
+
+TEST_P(NRM2, dnrm2)
+{
+    // NRM2 performance case for double elements.
+    TestParams testParams;
+    getParams(&testParams);
+    Nrm2PerformanceTest<double>::runInstance(FN_DNRM2, &testParams);
+}
+
+TEST_P(NRM2, scnrm2)
+{
+    // NRM2 performance case for FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    Nrm2PerformanceTest<FloatComplex>::runInstance(FN_SCNRM2, &testParams);
+}
+
+
+TEST_P(NRM2, dznrm2)
+{
+    // NRM2 performance case for DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    Nrm2PerformanceTest<DoubleComplex>::runInstance(FN_DZNRM2, &testParams);
+}
diff --git a/src/tests/performance/perf-rot.cpp b/src/tests/performance/perf-rot.cpp
new file mode 100644
index 0000000..f940f95
--- /dev/null
+++ b/src/tests/performance/perf-rot.cpp
@@ -0,0 +1,364 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * ROT performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <rot.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+// ROT performance test case; ElemType is the element type of the X and Y vectors.
+template <typename ElemType> class RotPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~RotPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build one test case from params, skip it when the device or the
+     * available memory cannot run it, and otherwise run and check it.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        RotPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *const blasBase = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+        const bool needsDouble = (fn == FN_DROT) || (fn == FN_ZDROT);
+
+        // The double-precision variants need native double support on the device.
+        if (needsDouble && !blasBase->isDevSupportDoublePrecision())
+        {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        // Skip, rather than fail, when areResourcesSufficient() rejects the case.
+        if (!perfCase.areResourcesSufficient(params))
+        {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        // A negative result is fatal; any other non-zero result fails the expectation below.
+        const int ret = perfCase.run(opFactor);
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    RotPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    // Copy of the test parameters, host data (with backups), and the matching device buffers.
+    TestParams params_;
+    ElemType *X_, *Y_, *back_X_, *back_Y_, alpha, beta;
+    size_t lengthx, lengthy;
+    cl_mem mobjX_, mobjY_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+RotPerformanceTest<ElemType>::RotPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ((4 * params->N ) * sizeof(ElemType))), params_(*params)
+{
+    // Copies *params and allocates host storage for X, Y and their backups; on failure every pointer ends up NULL.
+    X_ = Y_ =  NULL;
+    back_X_ = back_Y_ = NULL;
+    mobjX_= mobjY_ = NULL;
+
+    lengthx = 1 + (params_.N - 1) * abs(params_.incx);     // elements spanned by N entries with stride incx
+    lengthy = 1 + (params_.N - 1) * abs(params_.incy);     // elements spanned by N entries with stride incy
+
+    try
+    {
+        X_ = new ElemType[lengthx + params_.offa];
+        back_X_ = new ElemType[lengthx + params_.offa];
+        Y_ = new ElemType[lengthy + params_.offb];
+        back_Y_ = new ElemType[lengthy + params_.offb];
+    }
+    catch(const bad_alloc&)
+    {
+        delete[] X_; delete[] back_X_; delete[] Y_; delete[] back_Y_;   // free whatever was allocated before the failure
+        X_ = back_X_ = Y_ = back_Y_ = NULL;     // areResourcesSufficient() will handle the rest and return
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType>
+RotPerformanceTest<ElemType>::~RotPerformanceTest()
+{   // Frees the host arrays and releases the OpenCL buffers owned by this test case.
+	if(X_ != NULL)              // host X; NULL if the constructor could not allocate it
+    {
+        delete[] X_;
+	}
+	if(back_X_ != NULL)         // backup copy of X filled in prepare()
+    {
+        delete[] back_X_;
+	}
+    if( mobjX_ != NULL )        // device buffer for X; stays NULL unless prepare() created it
+    {
+		clReleaseMemObject(mobjX_);
+    }
+    // The same three resources for Y.
+    if(Y_ != NULL)
+    {
+        delete[] Y_;
+	}
+	if(back_Y_ != NULL)
+    {
+        delete[] back_Y_;
+	}
+    if( mobjY_ != NULL )
+    {
+		clReleaseMemObject(mobjY_);
+    }
+}
+
+/*
+ * Decide whether this test case can run: the host arrays must have been
+ * allocated, each vector must be below maxMemAllocSize(), and both
+ * together below availGlobalMemSize(0), as reported by BlasBase.
+ */
+template <typename ElemType> bool
+RotPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // A failed host allocation in the constructor leaves these pointers NULL.
+    if (!X_ || !back_X_ || !Y_ || !back_Y_)
+    {
+        return false;
+    }
+
+    const size_t offsetX = params->offa;
+    const size_t offsetY = params->offb;
+    const size_t bytesX = (lengthx + offsetX) * sizeof(ElemType);
+    const size_t bytesY = (lengthy + offsetY) * sizeof(ElemType);
+
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const size_t globalMem = (size_t)blasBase->availGlobalMemSize(0);
+    const size_t maxAlloc = (size_t)blasBase->maxMemAllocSize();
+
+    // Strict comparisons: a size exactly equal to a limit is rejected.
+    if ((bytesX >= maxAlloc) || (bytesY >= maxAlloc))
+    {
+        return false;
+    }
+
+    const size_t bytesTotal = bytesX + bytesY;
+    return bytesTotal < globalMem;
+}
+
+template <typename ElemType> int
+RotPerformanceTest<ElemType>::prepare(void)
+{
+    // Byte sizes of the host arrays, leading offsets included.
+    const size_t bytesX = (lengthx + params_.offa) * sizeof(ElemType);
+    const size_t bytesY = (lengthy + params_.offb) * sizeof(ElemType);
+
+    // Generate the inputs past the offsets and convert the two multipliers to ElemType.
+    randomVectors(params_.N, (X_ + params_.offa), params_.incx, (Y_ + params_.offb), params_.incy);
+    alpha = convertMultiplier<ElemType>(params_.alpha);
+    beta = convertMultiplier<ElemType>(params_.beta);
+
+    // Keep untouched copies for the reference run, which works on back_X_/back_Y_.
+    memcpy(back_X_, X_, bytesX);
+    memcpy(back_Y_, Y_, bytesY);
+
+    // createEnqueueBuffer() is given each host array; a NULL handle is treated as failure.
+    mobjX_ = base_->createEnqueueBuffer(X_, bytesX, 0, CL_MEM_READ_WRITE);
+    mobjY_ = base_->createEnqueueBuffer(Y_, bytesY, 0, CL_MEM_READ_WRITE);
+    return ((mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+RotPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    // Without PERF_TEST_WITH_ACML there is no reference run and 0 is returned.
+    nano_time_t elapsed = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+    // Time one reference ROT call on the backup copies of X and Y.
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::rot(params_.N, back_X_, params_.offa, params_.incx, back_Y_, params_.offb, params_.incy,
+                        alpha, beta);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+template <typename ElemType> nano_time_t
+RotPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    // Returns the average time of one clBLAS ROT call, measured with getCurrentTime()
+    // over 50 back-to-back calls, or NANOTIME_ERR if any OpenCL/clBLAS call fails.
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    // Upload X and Y; only the last write requests an event, since it is the only one waited on.
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lengthx + params_.offa) * sizeof(ElemType), X_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector X buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lengthy + params_.offb) * sizeof(ElemType), Y_, 0, NULL, &event);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector Y buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);      // done with the upload event whether or not the wait succeeded
+    event = NULL;
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);           // drain the queue first so that only the ROT calls are timed
+    time = getCurrentTime();
+    int iter = 50;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::rot(params_.N, mobjX_, params_.offa, params_.incx, mobjY_, params_.offb, params_.incy,
+                                             alpha, beta, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS)
+        {
+            cerr << "The CLBLAS ROT function failed, status = " << status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        if (event != NULL) { clReleaseEvent(event); event = NULL; }   // each call returns its own event; release it
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS)
+    {
+        time = getCurrentTime() - time;
+    }
+    else
+    {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// rot performance test
+TEST_P(ROT, srot)
+{
+    // ROT performance case instantiated for float elements.
+    TestParams testParams;
+    getParams(&testParams);
+    RotPerformanceTest<float>::runInstance(FN_SROT, &testParams);
+}
+
+
+TEST_P(ROT, drot)
+{
+    // ROT performance case instantiated for double elements.
+    TestParams testParams;
+    getParams(&testParams);
+    RotPerformanceTest<double>::runInstance(FN_DROT, &testParams);
+}
+
+TEST_P(ROT, csrot)
+{
+    // ROT performance case instantiated for FloatComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    RotPerformanceTest<FloatComplex>::runInstance(FN_CSROT, &testParams);
+}
+
+
+TEST_P(ROT, zdrot)
+{
+    // ROT performance case instantiated for DoubleComplex elements.
+    TestParams testParams;
+    getParams(&testParams);
+    RotPerformanceTest<DoubleComplex>::runInstance(FN_ZDROT, &testParams);
+}
+
diff --git a/src/tests/performance/perf-rotg.cpp b/src/tests/performance/perf-rotg.cpp
new file mode 100644
index 0000000..943e040
--- /dev/null
+++ b/src/tests/performance/perf-rotg.cpp
@@ -0,0 +1,418 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * ROTG performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <rotg.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+// ROTG performance test case; ElemType1 is the general element type, ElemType2 the type of C (float or double only).
+template <typename ElemType1, typename ElemType2> class RotgPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~RotgPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build one test case from params, skip it when the device or the
+     * available memory cannot run it, and otherwise run and check it.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        RotgPerformanceTest<ElemType1, ElemType2> perfCase(fn, params);
+        BlasBase *const blasBase = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+        const bool needsDouble = (fn == FN_DROTG) || (fn == FN_ZROTG);
+
+        // The double-precision variants need native double support on the device.
+        if (needsDouble && !blasBase->isDevSupportDoublePrecision())
+        {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        // Skip, rather than fail, when areResourcesSufficient() rejects the case.
+        if (!perfCase.areResourcesSufficient(params))
+        {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        // A negative result is fatal; any other non-zero result fails the expectation below.
+        const int ret = perfCase.run(opFactor);
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    RotgPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    // Copy of the test parameters, host data (with backups), and the matching device buffers.
+    TestParams params_;
+    ElemType1 *SA_, *SB_, *S_, *back_SA_, *back_SB_, *back_S_;
+    ElemType2 *C_, *back_C_;
+    cl_mem mobjSA_, mobjSB_, mobjC_, mobjS_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType1, typename ElemType2>
+RotgPerformanceTest<ElemType1, ElemType2>::RotgPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) (5 * sizeof(ElemType1) + sizeof(ElemType2))), params_(*params)
+{
+    // Copies *params and allocates one element (plus offset) per operand and backup; on failure every pointer ends up NULL.
+    SA_ = SB_ = S_ = NULL;
+    back_SA_ = back_SB_ = back_S_ = NULL;
+    C_ = back_C_ = NULL;
+    mobjSA_= mobjSB_ = mobjC_ = mobjS_ = NULL;
+
+    try
+    {
+        SA_ = new ElemType1[1 + params_.offBX];
+        back_SA_ = new ElemType1[1 + params_.offBX];
+        SB_ = new ElemType1[1 + params_.offCY];
+        back_SB_ = new ElemType1[1 + params_.offCY];
+        C_ = new ElemType2[1 + params_.offa];
+        back_C_ = new ElemType2[1 + params_.offa];
+        S_ = new ElemType1[1 + params_.offb];
+        back_S_ = new ElemType1[1 + params_.offb];
+    }
+    catch(const bad_alloc&)
+    {
+        delete[] SA_; delete[] back_SA_; delete[] SB_; delete[] back_SB_;   // free whatever was allocated before the failure
+        delete[] C_;  delete[] back_C_;  delete[] S_;  delete[] back_S_;
+        SA_ = back_SA_ = SB_ = back_SB_ = S_ = back_S_ = NULL;     // areResourcesSufficient() will handle the rest and return
+        C_ = back_C_ = NULL;
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType1, typename ElemType2>
+RotgPerformanceTest<ElemType1, ElemType2>::~RotgPerformanceTest()
+{   // Frees the host arrays and releases the OpenCL buffers owned by this test case.
+	if(SA_ != NULL)             // host SA; NULL if the constructor could not allocate it
+    {
+        delete[] SA_;
+	}
+	if(back_SA_ != NULL)        // backup copy of SA filled in prepare()
+    {
+        delete[] back_SA_;
+	}
+    if( mobjSA_ != NULL )       // device buffer for SA; stays NULL unless prepare() created it
+    {
+		clReleaseMemObject(mobjSA_);
+    }
+    // The same three resources for SB.
+    if(SB_ != NULL)
+    {
+        delete[] SB_;
+	}
+	if(back_SB_ != NULL)
+    {
+        delete[] back_SB_;
+	}
+    if( mobjSB_ != NULL )
+    {
+		clReleaseMemObject(mobjSB_);
+    }
+    // The same three resources for C.
+    if(C_ != NULL)
+    {
+        delete[] C_;
+	}
+	if(back_C_ != NULL)
+    {
+        delete[] back_C_;
+	}
+    if( mobjC_ != NULL )
+    {
+		clReleaseMemObject(mobjC_);
+    }
+    // The same three resources for S.
+    if(S_ != NULL)
+    {
+        delete[] S_;
+	}
+	if(back_S_ != NULL)
+    {
+        delete[] back_S_;
+	}
+    if( mobjS_ != NULL )
+    {
+		clReleaseMemObject(mobjS_);
+    }
+}
+
+/*
+ * Decide whether this test case can run: every host array must have been
+ * allocated, and the combined size of the four operands must be below both
+ * maxMemAllocSize() and availGlobalMemSize(0) as reported by BlasBase.
+ */
+template <typename ElemType1, typename ElemType2> bool
+RotgPerformanceTest<ElemType1, ElemType2>::areResourcesSufficient(TestParams *params)
+{
+    // A failed host allocation in the constructor leaves these pointers NULL.
+    if (!SA_ || !back_SA_ || !SB_ || !back_SB_ ||
+        !C_ || !back_C_ || !S_ || !back_S_)
+    {
+        return false;
+    }
+
+    const size_t offsetSA = params->offBX;
+    const size_t offsetSB = params->offCY;
+    const size_t offsetC = params->offa;
+    const size_t offsetS = params->offb;
+
+    // One element per operand, placed after that operand's offset.
+    const size_t bytesType1 = ((1 + offsetSA) + (1 + offsetSB) + (1 + offsetS)) * sizeof(ElemType1);
+    const size_t bytesType2 = (1 + offsetC) * sizeof(ElemType2);
+    const size_t bytesTotal = bytesType1 + bytesType2;
+
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const size_t globalMem = (size_t)blasBase->availGlobalMemSize( 0 );
+    const size_t maxAlloc = (size_t)blasBase->maxMemAllocSize();
+
+    // Strict comparisons: a size exactly equal to a limit is rejected.
+    return (bytesTotal < maxAlloc) && (bytesTotal < globalMem);
+}
+
+template <typename ElemType1, typename ElemType2> int
+RotgPerformanceTest<ElemType1, ElemType2>::prepare(void)
+{
+    ElemType1 &sa = SA_[params_.offBX];     // the single element of each operand
+    ElemType1 &sb = SB_[params_.offCY];     // sits right after that operand's offset
+    ElemType2 &c = C_[params_.offa];
+    ElemType1 &s = S_[params_.offb];
+
+    randomVectors(1, &sa, 1, &sb, 1);                   // random A and B
+    c = back_C_[params_.offa] = ZERO<ElemType2>();      // C and S start at zero, as do their backups
+    s = back_S_[params_.offb] = ZERO<ElemType1>();
+    back_SA_[params_.offBX] = sa;                       // backups of A and B for the reference run
+    back_SB_[params_.offCY] = sb;
+
+    // Print the inputs now, because they change after processing.
+    ::std::cerr << "A = ";
+    printElement<ElemType1>(sa);
+    ::std::cerr << "\tB = ";
+    printElement<ElemType1>(sb);
+    ::std::cerr << "\tC = ";
+    printElement<ElemType2>(c);
+    ::std::cerr << "\tS = ";
+    printElement<ElemType1>(s);
+    ::std::cout << std::endl << std::endl;
+
+    // createEnqueueBuffer() is given each host array; a NULL handle is treated as failure.
+    mobjSA_ = base_->createEnqueueBuffer(SA_, (1 + params_.offBX) * sizeof(ElemType1), 0, CL_MEM_READ_WRITE);
+    mobjSB_ = base_->createEnqueueBuffer(SB_, (1 + params_.offCY) * sizeof(ElemType1), 0, CL_MEM_READ_WRITE);
+    mobjC_  = base_->createEnqueueBuffer(C_,  (1 + params_.offa ) * sizeof(ElemType2), 0, CL_MEM_WRITE_ONLY);
+    mobjS_  = base_->createEnqueueBuffer(S_,  (1 + params_.offb ) * sizeof(ElemType1), 0, CL_MEM_WRITE_ONLY);
+    return ((mobjSA_ != NULL) && (mobjSB_ != NULL) && (mobjC_ != NULL) && (mobjS_ != NULL)) ? 0 : -1;
+}
+
+template <typename ElemType1, typename ElemType2> nano_time_t
+RotgPerformanceTest<ElemType1, ElemType2>::etalonPerfSingle(void)
+{
+    // Without PERF_TEST_WITH_ACML there is no reference run and 0 is returned.
+    nano_time_t elapsed = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+    // Time one reference ROTG call on the backup copies of the four operands.
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::rotg(back_SA_, params_.offBX, back_SB_, params_.offCY, back_C_, params_.offa, back_S_, params_.offb);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+template <typename ElemType1, typename ElemType2> nano_time_t
+RotgPerformanceTest<ElemType1, ElemType2>::clblasPerfSingle(void)
+{
+    // Returns the average time of one clBLAS ROTG call, measured with getCurrentTime()
+    // over 50 back-to-back calls, or NANOTIME_ERR if any OpenCL/clBLAS call fails.
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    const DataType type = ( typeid(ElemType1) == typeid(float))? TYPE_FLOAT:( typeid(ElemType1) == typeid(double))? TYPE_DOUBLE:
+                                        ( typeid(ElemType1) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    status = clEnqueueWriteBuffer(queue, mobjSA_, CL_TRUE, 0, (1 + params_.offBX) * sizeof(ElemType1), SA_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector SA buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjSB_, CL_TRUE, 0, (1 + params_.offCY) * sizeof(ElemType1), SB_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector SB buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, (1 + params_.offa) * sizeof(ElemType2), C_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector C buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjS_, CL_TRUE, 0, (1 + params_.offb) * sizeof(ElemType1), S_, 0, NULL, &event);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector S buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    // Only the last upload requested an event, since it is the only one waited on; release it afterwards.
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);           // drain the queue first so that only the ROTG calls are timed
+    time = getCurrentTime();
+    int iter = 50;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::rotg(type, mobjSA_, params_.offBX, mobjSB_, params_.offCY, mobjC_, params_.offa, mobjS_, params_.offb,
+                                                1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS)
+        {
+            cerr << "The CLBLAS ROTG function failed, status = " << status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        if (event != NULL) { clReleaseEvent(event); event = NULL; }   // each call returns its own event; release it
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS)
+    {
+        time = getCurrentTime() - time;
+    }
+    else
+    {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// rotg performance test
+TEST_P(ROTG, srotg)
+{
+    // ROTG performance case instantiated for float operands.
+    TestParams testParams;
+    getParams(&testParams);
+    RotgPerformanceTest<float, float>::runInstance(FN_SROTG, &testParams);
+}
+
+
+TEST_P(ROTG, drotg)
+{
+    // ROTG performance case instantiated for double operands.
+    TestParams testParams;
+    getParams(&testParams);
+    RotgPerformanceTest<double, double>::runInstance(FN_DROTG, &testParams);
+}
+
+TEST_P(ROTG, crotg)
+{
+    // ROTG performance case instantiated for FloatComplex operands with a float C.
+    TestParams testParams;
+    getParams(&testParams);
+    RotgPerformanceTest<FloatComplex, float>::runInstance(FN_CROTG, &testParams);
+}
+
+
+TEST_P(ROTG, zrotg)
+{
+    // ROTG performance case instantiated for DoubleComplex operands with a double C.
+    TestParams testParams;
+    getParams(&testParams);
+    RotgPerformanceTest<DoubleComplex, double>::runInstance(FN_ZROTG, &testParams);
+}
diff --git a/src/tests/performance/perf-rotm.cpp b/src/tests/performance/perf-rotm.cpp
new file mode 100644
index 0000000..47141bc
--- /dev/null
+++ b/src/tests/performance/perf-rotm.cpp
@@ -0,0 +1,377 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * ROTM performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <rotm.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+// ROTM performance test case; ElemType is the element type of X, Y and the PARAM array.
+template <typename ElemType> class RotmPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~RotmPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build one test case from params, skip it when the device or the
+     * available memory cannot run it, and otherwise run and check it.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        RotmPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *const blasBase = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+        const bool needsDouble = (fn == FN_DROTM);
+
+        // The double-precision variant needs native double support on the device.
+        if (needsDouble && !blasBase->isDevSupportDoublePrecision())
+        {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        // Skip, rather than fail, when areResourcesSufficient() rejects the case.
+        if (!perfCase.areResourcesSufficient(params))
+        {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        // A negative result is fatal; any other non-zero result fails the expectation below.
+        const int ret = perfCase.run(opFactor);
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    RotmPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    // Copy of the test parameters, host data (with backups), and the matching device buffers.
+    TestParams params_;
+    ElemType *X_, *Y_, *PARAM_, *back_X_, *back_Y_, *back_PARAM_;
+    size_t lengthx, lengthy;
+    cl_mem mobjX_, mobjY_, mobjParam_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+RotmPerformanceTest<ElemType>::RotmPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ((4 * params->N + 5) * sizeof(ElemType))), params_(*params)
+{
+    // Copies *params and allocates host storage for X, Y, PARAM and their backups; on failure every pointer ends up NULL.
+    X_ = Y_ = PARAM_ = NULL;
+    back_X_ = back_Y_ = back_PARAM_ = NULL;
+    mobjX_= mobjY_ = mobjParam_ = NULL;
+
+    lengthx = 1 + (params_.N - 1) * abs(params_.incx);     // elements spanned by N entries with stride incx
+    lengthy = 1 + (params_.N - 1) * abs(params_.incy);     // elements spanned by N entries with stride incy
+
+    try
+    {
+        X_ = new ElemType[lengthx + params_.offa];
+        back_X_ = new ElemType[lengthx + params_.offa];
+        Y_ = new ElemType[lengthy + params_.offb];
+        back_Y_ = new ElemType[lengthy + params_.offb];
+        PARAM_ = new ElemType[5 + params_.offc];
+        back_PARAM_ = new ElemType[5 + params_.offc];
+    }
+    catch(const bad_alloc&)
+    {
+        delete[] X_; delete[] back_X_; delete[] Y_; delete[] back_Y_;   // free whatever was allocated before the failure
+        delete[] PARAM_; delete[] back_PARAM_;
+        X_ = back_X_ = Y_ = back_Y_ = PARAM_ = back_PARAM_ = NULL;     // areResourcesSufficient() will handle the rest and return
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType>
+RotmPerformanceTest<ElemType>::~RotmPerformanceTest()
+{   // Frees the host arrays and releases the OpenCL buffers owned by this test case.
+	if(X_ != NULL)              // host X; NULL if the constructor could not allocate it
+    {
+        delete[] X_;
+	}
+	if(back_X_ != NULL)         // backup copy of X filled in prepare()
+    {
+        delete[] back_X_;
+	}
+    if( mobjX_ != NULL )        // device buffer for X; stays NULL unless prepare() created it
+    {
+		clReleaseMemObject(mobjX_);
+    }
+    // The same three resources for Y.
+    if(Y_ != NULL)
+    {
+        delete[] Y_;
+	}
+	if(back_Y_ != NULL)
+    {
+        delete[] back_Y_;
+	}
+    if( mobjY_ != NULL )
+    {
+		clReleaseMemObject(mobjY_);
+    }
+    // The same three resources for PARAM.
+    if(PARAM_ != NULL)
+    {
+        delete[] PARAM_;
+	}
+	if(back_PARAM_ != NULL)
+    {
+        delete[] back_PARAM_;
+	}
+    if( mobjParam_ != NULL )
+    {
+		clReleaseMemObject(mobjParam_);
+    }
+}
+
+/*
+ * Decide whether this test case can run: every host array must have been
+ * allocated, each of the X, Y and PARAM buffers must be below
+ * maxMemAllocSize(), and their sum below availGlobalMemSize(0).
+ */
+template <typename ElemType> bool
+RotmPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // A failed host allocation in the constructor leaves these pointers NULL.
+    if (!X_ || !back_X_ || !Y_ || !back_Y_ || !PARAM_ || !back_PARAM_)
+    {
+        return false;
+    }
+
+    const size_t offsetX = params->offa;
+    const size_t offsetY = params->offb;
+    const size_t offsetParam = params->offc;
+
+    const size_t bytesX = (lengthx + offsetX) * sizeof(ElemType);
+    const size_t bytesY = (lengthy + offsetY) * sizeof(ElemType);
+    const size_t bytesParam = (5 + offsetParam) * sizeof(ElemType);     // PARAM holds 5 elements after its offset
+
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const size_t globalMem = (size_t)blasBase->availGlobalMemSize(0);
+    const size_t maxAlloc = (size_t)blasBase->maxMemAllocSize();
+
+    // PARAM gets its own device buffer in prepare(), so it is checked like X and Y.
+    if ((bytesX >= maxAlloc) || (bytesY >= maxAlloc) || (bytesParam >= maxAlloc))
+    {
+        return false;
+    }
+
+    // Strict comparison: a total exactly equal to the limit is rejected.
+    return (bytesX + bytesY + bytesParam) < globalMem;
+}
+
+template <typename ElemType> int
+RotmPerformanceTest<ElemType>::prepare(void)
+{
+    // Fill X and Y, and the last four PARAM elements, with random values. Returns 0 on success, -1 if a buffer is missing.
+    randomVectors(params_.N, (X_ + params_.offa), params_.incx, (Y_ + params_.offb), params_.incy);
+    randomVectors(4, (PARAM_ + params_.offc + 1), 1); //1st element is initialized separately
+
+    ElemType sflagParam = convertMultiplier<ElemType>(params_.alpha);
+    PARAM_[params_.offc] = sflagParam; // the first PARAM element is taken from params_.alpha
+    // Back up all inputs for the reference run; PARAM needs its 5 elements copied as well as the offset prefix.
+    memcpy(back_X_, X_, (lengthx + params_.offa)*sizeof(ElemType));
+    memcpy(back_Y_, Y_, (lengthy + params_.offb)*sizeof(ElemType));
+    memcpy(back_PARAM_, PARAM_, (5 + params_.offc)*sizeof(ElemType));
+
+    // createEnqueueBuffer() is given each host array; a NULL handle is treated as failure.
+    mobjX_ = base_->createEnqueueBuffer(X_, (lengthx + params_.offa) * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+    mobjY_ = base_->createEnqueueBuffer(Y_, (lengthy + params_.offb) * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+    mobjParam_  = base_->createEnqueueBuffer(PARAM_,  (5 + params_.offc) * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+
+    if((mobjX_ == NULL) || (mobjY_ == NULL) || (mobjParam_ == NULL))
+    {
+        return -1;
+    }
+    return 0;
+}
+
+template <typename ElemType> nano_time_t
+RotmPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    // Without PERF_TEST_WITH_ACML there is no reference run and 0 is returned.
+    nano_time_t elapsed = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+    // Time one reference ROTM call on the backup copies of X, Y and PARAM.
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::rotm(params_.N, back_X_, params_.offa, params_.incx, back_Y_, params_.offb, params_.incy,
+                        back_PARAM_, params_.offc);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+template <typename ElemType> nano_time_t
+RotmPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    // Returns the average time of one clBLAS ROTM call, measured with getCurrentTime()
+    // over 50 back-to-back calls, or NANOTIME_ERR if any OpenCL/clBLAS call fails.
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    const DataType type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT: TYPE_DOUBLE;
+
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lengthx + params_.offa) * sizeof(ElemType), X_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector X buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lengthy + params_.offb) * sizeof(ElemType), Y_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector Y buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjParam_, CL_TRUE, 0, (5 + params_.offc) * sizeof(ElemType), PARAM_, 0, NULL, &event);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector PARAM buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    // Only the last upload requested an event, since it is the only one waited on; release it afterwards.
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);           // drain the queue first so that only the ROTM calls are timed
+    time = getCurrentTime();
+    int iter = 50;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::rotm(type, params_.N, mobjX_, params_.offa, params_.incx, mobjY_, params_.offb, params_.incy,
+                                             mobjParam_, params_.offc, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS)
+        {
+            cerr << "The CLBLAS ROTM function failed, status = " << status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        if (event != NULL) { clReleaseEvent(event); event = NULL; }   // each call returns its own event; release it
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS)
+    {
+        time = getCurrentTime() - time;
+    }
+    else
+    {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// rotm performance test
+TEST_P(ROTM, srotm)
+{
+    // ROTM performance case instantiated for float elements.
+    TestParams testParams;
+    getParams(&testParams);
+    RotmPerformanceTest<float>::runInstance(FN_SROTM, &testParams);
+}
+
+
+TEST_P(ROTM, drotm)
+{
+    // ROTM performance case instantiated for double elements.
+    TestParams testParams;
+    getParams(&testParams);
+    RotmPerformanceTest<double>::runInstance(FN_DROTM, &testParams);
+}
+
diff --git a/src/tests/performance/perf-rotmg.cpp b/src/tests/performance/perf-rotmg.cpp
new file mode 100644
index 0000000..c332573
--- /dev/null
+++ b/src/tests/performance/perf-rotmg.cpp
@@ -0,0 +1,420 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * ROTMG performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <rotmg.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+// ElemType is the element type of D1, D2, X, Y and PARAM; the tests below instantiate it with float and double
+template <typename ElemType> class RotmgPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~RotmgPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        RotmgPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        opFactor =1;
+
+        if (((fn == FN_DROTMG)) &&
+            !base->isDevSupportDoublePrecision())
+        {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params))
+        {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else
+        {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    RotmgPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType *D1_, *D2_, *X_, *Y_, *PARAM_, *back_D1_, *back_D2_, *back_X_, *back_Y_, *back_PARAM_;
+    cl_mem mobjD1_, mobjD2_, mobjX_, mobjY_, mobjParam_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+RotmgPerformanceTest<ElemType>::RotmgPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ((4 + 3 + 10) * sizeof(ElemType))), params_(*params)
+                                // D1,D2,X and Param are Read/Write and Y is Read only
+{
+    // Allocates a working and a backup host array per operand: 1 + offset elements (5 + offset for PARAM).
+    D1_ = D2_ = X_ = Y_ = PARAM_ = NULL;
+    back_D1_ = back_D2_ = back_X_ = back_Y_ = back_PARAM_ = NULL;
+    mobjD1_ = mobjD2_ = mobjX_= mobjY_ = mobjParam_ = NULL;
+
+    try
+    {
+        D1_         = new ElemType[1 + params_.offa];
+        back_D1_    = new ElemType[1 + params_.offa];
+        D2_         = new ElemType[1 + params_.offb];
+        back_D2_    = new ElemType[1 + params_.offb];
+        X_          = new ElemType[1 + params_.offBX];
+        back_X_     = new ElemType[1 + params_.offBX];
+        Y_          = new ElemType[1 + params_.offCY];
+        back_Y_     = new ElemType[1 + params_.offCY];
+        PARAM_      = new ElemType[5 + params_.offc];
+        back_PARAM_ = new ElemType[5 + params_.offc];
+    }
+    catch(bad_alloc&)
+    {
+        // Free what did get allocated (delete[] NULL is a no-op); the NULLs make areResourcesSufficient() skip the run.
+        delete[] D1_; delete[] back_D1_; delete[] D2_; delete[] back_D2_; delete[] X_;
+        delete[] back_X_; delete[] Y_; delete[] back_Y_; delete[] PARAM_; delete[] back_PARAM_;
+        D1_ = back_D1_ = D2_ = back_D2_ = X_ = back_X_ = Y_ = back_Y_ = PARAM_ = back_PARAM_ = NULL;
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType>
+RotmgPerformanceTest<ElemType>::~RotmgPerformanceTest()
+{
+    if(D1_ != NULL)
+    {
+        delete[] D1_;
+	}
+	if(back_D1_ != NULL)
+    {
+        delete[] back_D1_;
+	}
+    if( mobjD1_ != NULL )
+    {
+		clReleaseMemObject(mobjD1_);
+    }
+
+    if(D2_ != NULL)
+    {
+        delete[] D2_;
+	}
+	if(back_D2_ != NULL)
+    {
+        delete[] back_D2_;
+	}
+    if( mobjD2_ != NULL )
+    {
+		clReleaseMemObject(mobjD2_);
+    }
+
+	if(X_ != NULL)
+    {
+        delete[] X_;
+	}
+	if(back_X_ != NULL)
+    {
+        delete[] back_X_;
+	}
+    if( mobjX_ != NULL )
+    {
+		clReleaseMemObject(mobjX_);
+    }
+
+    if(Y_ != NULL)
+    {
+        delete[] Y_;
+	}
+	if(back_Y_ != NULL)
+    {
+        delete[] back_Y_;
+	}
+    if( mobjY_ != NULL )
+    {
+		clReleaseMemObject(mobjY_);
+    }
+
+    if(PARAM_ != NULL)
+    {
+        delete[] PARAM_;
+	}
+	if(back_PARAM_ != NULL)
+    {
+        delete[] back_PARAM_;
+	}
+    if( mobjParam_ != NULL )
+    {
+		clReleaseMemObject(mobjParam_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+RotmgPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t offx = params->offBX;
+    size_t offy = params->offCY;
+    size_t offD1 = params->offa;
+    size_t offD2 = params->offb;
+    size_t offParam = params->offc;
+
+    // Device bytes prepare() will request: one element (plus offset) for D1, D2, X and Y, and
+    // five (plus offset) for PARAM -- the same sizes the constructor and prepare() use.
+    size_t sizeRequired = ((1 + offx) + (1 + offy) + (1 + offD1) + (1 + offD2) + (5 + offParam)) * sizeof(ElemType);
+
+    // A NULL host array means an allocation failed in the constructor; skip the run.
+    if((D1_ == NULL) || (back_D1_ == NULL) ||(X_ == NULL) || (back_X_ == NULL) || (Y_ == NULL) || (back_Y_ == NULL) ||
+        (D2_ == NULL) || (back_D2_ == NULL) || (PARAM_ == NULL) || (back_PARAM_ == NULL))
+    {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    // The combined footprint must be below both the largest single allocation and the available global memory.
+    return (sizeRequired < allocSize) && (sizeRequired < gmemSize);
+}
+
+template <typename ElemType> int
+RotmgPerformanceTest<ElemType>::prepare(void)
+{
+    // randomRotmg() is handed the D1, D2, X, Y and PARAM host arrays, each past its offset.
+    randomRotmg( (D1_ + params_.offa), (D2_ + params_.offb),
+                (X_ + params_.offBX), (Y_ + params_.offCY), (PARAM_ + params_.offc) );
+
+    ElemType sflagParam = convertMultiplier<ElemType>(params_.alpha);
+    PARAM_[params_.offc] = sflagParam; // the first PARAM element is taken from the test's alpha
+    // Snapshot every operand; the reference run in etalonPerfSingle() works on these copies.
+    memcpy(back_D1_, D1_, (1 + params_.offa)*sizeof(ElemType));
+    memcpy(back_D2_, D2_, (1 + params_.offb)*sizeof(ElemType));
+    memcpy(back_X_, X_, (1 + params_.offBX)*sizeof(ElemType));
+    memcpy(back_Y_, Y_, (1 + params_.offCY)*sizeof(ElemType));
+    memcpy(back_PARAM_, PARAM_, (5 + params_.offc)*sizeof(ElemType));
+
+    // Allocate buffers. PARAM is an output of ROTMG, so like D1, D2 and X it must be device-writable.
+    mobjX_ = base_->createEnqueueBuffer(X_, (1 + params_.offBX) * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+    mobjY_ = base_->createEnqueueBuffer(Y_, (1 + params_.offCY) * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjD1_ = base_->createEnqueueBuffer(D1_, (1 + params_.offa) * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+    mobjD2_ = base_->createEnqueueBuffer(D2_, (1 + params_.offb) * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+    mobjParam_  = base_->createEnqueueBuffer(PARAM_,  (5 + params_.offc) * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+    // Returns 0 on success, -1 when any buffer could not be created (the destructor releases the rest).
+    if((mobjD1_ == NULL) || (mobjD2_ == NULL) || (mobjX_ == NULL) || (mobjY_ == NULL) || (mobjParam_ == NULL))
+    {
+        return -1;
+    }
+    return 0;
+}
+
+template <typename ElemType> nano_time_t
+RotmgPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+
+#ifdef PERF_TEST_WITH_ACML
+
+		time = getCurrentTime();
+		clMath::blas::rotmg(back_D1_, params_.offa, back_D2_, params_.offb, back_X_, params_.offBX,
+                         back_Y_, params_.offCY, back_PARAM_, params_.offc);
+		time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+template <typename ElemType> nano_time_t
+RotmgPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    // Re-uploads all operands, then times `iter` back-to-back ROTMG calls; returns the mean or NANOTIME_ERR.
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    DataType type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT: TYPE_DOUBLE;
+
+    // The writes are blocking (CL_TRUE), so only the last one needs an event; the others pass NULL.
+    status = clEnqueueWriteBuffer(queue, mobjD1_, CL_TRUE, 0, (1 + params_.offa) * sizeof(ElemType), D1_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector D1 buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    status = clEnqueueWriteBuffer(queue, mobjD2_, CL_TRUE, 0, (1 + params_.offb) * sizeof(ElemType), D2_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector D2 buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (1 + params_.offBX) * sizeof(ElemType), X_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector X buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (1 + params_.offCY) * sizeof(ElemType), Y_, 0, NULL, NULL);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector Y buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjParam_, CL_TRUE, 0, (5 + params_.offc) * sizeof(ElemType), PARAM_, 0, NULL, &event);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector PARAM buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);              // drop our reference whether or not the wait succeeded
+    event = NULL;
+    if (status != CL_SUCCESS)
+    {
+        cout << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime();
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    int iter = 50;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::rotmg(type, mobjD1_, params_.offa, mobjD2_, params_.offb, mobjX_, params_.offBX,
+                                           mobjY_, params_.offCY, mobjParam_, params_.offc, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS)
+        {
+            cerr << "The CLBLAS ROTMG function failed, status = " << status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        clReleaseEvent(event);          // completion is awaited via clFinish(); don't leak one event per call
+        event = NULL;
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS)
+    {
+        time = getCurrentTime() - time;
+    }
+    else
+    {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// rotmg performance test
+TEST_P(ROTMG, srotmg)
+{
+    TestParams params;
+
+    getParams(&params);
+    RotmgPerformanceTest<float>::runInstance(FN_SROTMG, &params);
+}
+
+
+TEST_P(ROTMG, drotmg)
+{
+    TestParams params;
+
+    getParams(&params);
+    RotmgPerformanceTest<double>::runInstance(FN_DROTMG, &params);
+}
+
diff --git a/src/tests/performance/perf-sbmv.cpp b/src/tests/performance/perf-sbmv.cpp
new file mode 100644
index 0000000..af1385d
--- /dev/null
+++ b/src/tests/performance/perf-sbmv.cpp
@@ -0,0 +1,328 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Sbmv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <gbmv.h>
+#include <sbmv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class SbmvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~SbmvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        SbmvPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor = 1;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        if ((fn == FN_DSBMV) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    SbmvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha;
+    ElemType beta;
+    ElemType *A_;
+    ElemType *X_;
+    ElemType *Y_;
+    ElemType *backY_;
+    cl_mem mobjA_;
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+SbmvPerformanceTest<ElemType>::SbmvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,
+    (problem_size_t)( ( (2 * (params->N) * (params->K + 1)   // A-access
+                          - (2 * params->K *  (params->K+1)) )       // Substract hole-part for A & X
+                        +( ((2*params->K + 1) * params->N + 2*params->N))   // X & Y access
+                                                                                                              ) * sizeof(ElemType) ) ),
+                          params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    // Allocates the host arrays. Lengths are in elements and must match prepare(), which
+    // generates and uploads (N - 1) * |inc| + 1 + offset elements of X and Y.
+    size_t lenA = params_.N  * (params_.lda) + params_.offA;
+    size_t lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+    size_t lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
+
+    A_ = new ElemType[ lenA ];
+    X_ = new ElemType[ lenX ];
+    Y_ = new ElemType[ lenY ];
+    backY_ = new ElemType[ lenY ];
+    alpha = convertMultiplier<ElemType>(params_.alpha);
+    beta  = convertMultiplier<ElemType>(params_.beta);
+
+    base_ = ::clMath::BlasBase::getInstance();
+
+    // The device buffers stay NULL (set in the initializer list) until prepare() creates them.
+}
+
+template <typename ElemType>
+SbmvPerformanceTest<ElemType>::~SbmvPerformanceTest()
+{
+    if(A_ != NULL)
+    {
+        delete[] A_;
+    }
+	if(X_ != NULL)
+	{
+        delete[] X_;
+	}
+	if(backY_ != NULL)
+	{
+		delete[] backY_;
+	}
+	if(Y_ != NULL)
+	{
+	    delete[] Y_;
+	}
+
+    if ( mobjA_ != NULL )
+		clReleaseMemObject(mobjA_);
+	if ( mobjX_ != NULL )
+	    clReleaseMemObject(mobjX_);
+	if ( mobjY_ != NULL )
+		clReleaseMemObject(mobjY_);
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+SbmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t n = params->N, lda = params->lda;
+    // Byte sizes of the three device buffers: the whole element count (offset included) times the element size.
+    size_t lenA = ((n * lda) + params->offA) * sizeof(ElemType);
+    size_t lenX = ((n - 1) * abs(params->incx) + 1 + params->offBX) * sizeof(ElemType);
+    size_t lenY = ((n - 1) * abs(params->incy) + 1 + params->offCY) * sizeof(ElemType);
+
+    if((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL))
+    {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    // A must be below the largest single allocation, and all three together below the available global memory.
+    return (lenA < allocSize) && ( (lenA + lenX + lenY) < gmemSize );
+}
+
+template <typename ElemType> int
+SbmvPerformanceTest<ElemType>::prepare(void)
+{
+    size_t lenX, lenY, lenA;
+    // Generates the inputs, snapshots Y and creates the three device buffers; returns 0 on success, -1 otherwise.
+    lenA = (params_.N * params_.lda) + params_.offA;
+    // NOTE(review): both branches below compute identical lengths, so transA has no effect here.
+    if (params_.transA == clblasNoTrans) {
+        lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+        lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
+    }
+    else {
+        lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX;
+        lenY = (params_.N - 1)*abs(params_.incy) + 1 + params_.offCY;
+    }
+    // The GBMV generator is reused, called with N for both dimensions and clblasNoTrans.
+    randomGbmvMatrices(params_.order, clblasNoTrans , params_.N, params_.N, &alpha, &beta,
+                        (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx, (Y_+params_.offCY), params_.incy );
+    // Keep a copy of Y: etalonPerfSingle() restores Y_ from it and clblasPerfSingle() uploads it.
+    memcpy(backY_, Y_, lenY * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(backY_, lenY * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+    // Any NULL handle is reported as failure.
+    return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+SbmvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder fOrder;
+    clblasUplo fUplo;
+    size_t lda, lenY;
+    size_t fN = params_.N, fK = params_.K;
+    // |incy|: a negative stride must not wrap the unsigned length (prepare() sizes Y the same way).
+    lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
+    // Restore Y from the snapshot taken in prepare().
+    memcpy(Y_, backY_, lenY * sizeof(ElemType));
+    fOrder = params_.order;
+    fUplo = params_.uplo;
+    lda = params_.lda;
+    // The reference call is always made column-major; any other order is mapped by flipping uplo.
+    if (fOrder != clblasColumnMajor)
+    {
+        fOrder = clblasColumnMajor;
+        fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower;
+        fN = params_.N;
+    }
+    // Time one reference SBMV call on the host arrays.
+#ifdef PERF_TEST_WITH_ACML
+
+    time = getCurrentTime();
+    clMath::blas::sbmv(fOrder, fUplo, fN, fK , alpha, A_, params_.offA, lda,
+                            X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+    // Without PERF_TEST_WITH_ACML no reference is run and 0 is returned.
+    return time;
+}
+
+
+template <typename ElemType> nano_time_t
+SbmvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    // Uploads the saved Y, then times `iter` back-to-back SBMV calls; returns the mean or NANOTIME_ERR.
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    // |incy|: a negative stride must not wrap the unsigned length (prepare() sizes Y the same way).
+    size_t lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  lenY * sizeof(ElemType), backY_, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "Vector Y buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);              // drop our reference whether or not the wait succeeded
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+        status = clMath::clblas::sbmv(params_.order, params_.uplo, params_.N, params_.K,
+                                        alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
+                                        beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS SBMV function failed, status = " << status << endl;
+            return NANOTIME_ERR;
+        }
+        clReleaseEvent(event);          // completion is awaited via clFinish(); don't leak one event per call
+        event = NULL;
+    }
+    clFinish( queue );
+    time = getCurrentTime() - time;
+    time /= iter;
+
+    return time;
+}
+
+} // namespace clMath
+
+// ssbmv performance test
+TEST_P(SBMV, ssbmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    SbmvPerformanceTest<float>::runInstance(FN_SSBMV, &params);
+}
+
+// dsbmv performance test case
+TEST_P(SBMV, dsbmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    SbmvPerformanceTest<double>::runInstance(FN_DSBMV, &params);
+}
diff --git a/src/tests/performance/perf-scal.cpp b/src/tests/performance/perf-scal.cpp
new file mode 100644
index 0000000..391afcd
--- /dev/null
+++ b/src/tests/performance/perf-scal.cpp
@@ -0,0 +1,336 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * SCAL performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <scal.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class ScalPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~ScalPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        ScalPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        opFactor =1;
+
+        if (((fn == FN_DSCAL) || (fn == FN_ZSCAL) || (fn == FN_ZDSCAL)) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    ScalPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType *X_;
+    ElemType *backX_;
+    cl_mem mobjX_;
+    size_t  lengthX;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+ScalPerformanceTest<ElemType>::ScalPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N)  * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL)
+{
+    // Allocates X_ and backX_: offBX leading elements plus the strided vector of 1 + (N - 1) * |incx| elements.
+    X_ = backX_ = NULL;
+    mobjX_= NULL;
+    lengthX = 1 + (params->N - 1) * abs(params_.incx);
+    try {
+        X_ = new ElemType[lengthX + params_.offBX];
+        backX_ = new ElemType[lengthX + params_.offBX];
+    }
+    catch(bad_alloc&) {
+        // X_ may already be allocated when backX_ fails; free it (delete[] NULL is a no-op).
+        delete[] X_;
+        X_ = backX_ = NULL;     // areResourcesSufficient() will handle the rest and return
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType>
+ScalPerformanceTest<ElemType>::~ScalPerformanceTest()
+{
+    // delete[] accepts NULL, so the host arrays need no guard.
+    delete[] X_;
+    delete[] backX_;
+
+    // The device buffer is released only if one was created.
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+ScalPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // A NULL host array means the constructor could not allocate it,
+    // so there is nothing to run.
+    if ((X_ == NULL) || (backX_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *blas = clMath::BlasBase::getInstance();
+    const size_t globalBytes = (size_t)blas->availGlobalMemSize( 0 );
+    const size_t maxAllocBytes = (size_t)blas->maxMemAllocSize();
+
+    // Bytes of the one device buffer prepare() creates for X, offset included.
+    const size_t bytesX = (lengthX + params->offBX) * sizeof(ElemType);
+
+    // It has to be strictly smaller than both the largest single allocation
+    // and the available global memory.
+    return (bytesX < maxAllocBytes) && (bytesX < globalBytes);
+}
+
+template <typename ElemType> int
+ScalPerformanceTest<ElemType>::prepare(void)
+{
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    randomVectors(params_.N, (X_ + params_.offBX), params_.incx);
+    memcpy(backX_, X_, (lengthX + params_.offBX)* sizeof(ElemType));
+	mobjX_ = base_->createEnqueueBuffer(X_, ((lengthX + params_.offBX) * sizeof(*X_)), 0, CL_MEM_READ_WRITE);
+
+    return (mobjX_ != NULL)? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+ScalPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    bool is_css_zds = (params_.K == 1)? true: false;        // K indicates csscal/zdscal
+
+#ifdef PERF_TEST_WITH_ACML
+
+		time = getCurrentTime();
+		clMath::blas::scal(is_css_zds, params_.N, alpha_, X_, params_.offBX, params_.incx);
+		time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+template <typename ElemType> nano_time_t
+ScalPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    // Uploads the saved X, then times `iter` back-to-back SCAL calls; returns the mean or NANOTIME_ERR.
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    bool is_css_zds = (params_.K == 1)? true: false;        // K indicates csscal/zdscal
+
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  (lengthX + params_.offBX) * sizeof(ElemType), backX_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector X buffer object enqueuing error, status = " <<
+                 status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);              // drop our reference whether or not the wait succeeded
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    int iter = 50;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::scal(is_css_zds, params_.N, alpha_, mobjX_, params_.offBX, params_.incx,
+                            1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS SCAL function failed, status = " <<
+                    status << endl;
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+        clReleaseEvent(event);          // completion is awaited via clFinish(); don't leak one event per call
+        event = NULL;
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// scal performance test
+TEST_P(SCAL, sscal)
+{
+    TestParams params;
+
+    getParams(&params);
+    params.K = 0;                           // K is reused as a flag: 1 selects csscal/zdscal, 0 the plain scal
+    ScalPerformanceTest<float>::runInstance(FN_SSCAL, &params);
+}
+
+
+TEST_P(SCAL, dscal)
+{
+    TestParams params;
+
+    getParams(&params);
+    params.K = 0;                           // K is reused as a flag: 1 selects csscal/zdscal, 0 the plain scal
+    ScalPerformanceTest<double>::runInstance(FN_DSCAL, &params);
+}
+
+TEST_P(SCAL, cscal)
+{
+    TestParams params;
+
+    getParams(&params);
+    params.K = 0;                           // K is reused as a flag: 1 selects csscal/zdscal, 0 the plain scal
+    ScalPerformanceTest<FloatComplex>::runInstance(FN_CSCAL, &params);
+}
+
+
+TEST_P(SCAL, zscal)
+{
+    TestParams params;
+
+    getParams(&params);
+    params.K = 0;                           // K is reused as a flag: 1 selects csscal/zdscal, 0 the plain scal
+    ScalPerformanceTest<DoubleComplex>::runInstance(FN_ZSCAL, &params);
+}
+
+TEST_P(SCAL, csscal)
+{
+    TestParams params;
+
+    getParams(&params);
+    params.K = 1;                           // K is reused as a flag: 1 selects csscal/zdscal, 0 the plain scal
+    ScalPerformanceTest<FloatComplex>::runInstance(FN_CSSCAL, &params);
+}
+
+
+TEST_P(SCAL, zdscal)
+{
+    TestParams params;
+
+    getParams(&params);
+    params.K = 1;                           // K is reused as a flag: 1 selects csscal/zdscal, 0 the plain scal
+    ScalPerformanceTest<DoubleComplex>::runInstance(FN_ZDSCAL, &params);
+}
diff --git a/src/tests/performance/perf-spmv.cpp b/src/tests/performance/perf-spmv.cpp
new file mode 100644
index 0000000..3513854
--- /dev/null
+++ b/src/tests/performance/perf-spmv.cpp
@@ -0,0 +1,344 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Spmv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <spmv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {  /* ret < 0: fatal assertion; any other non-zero ret: soft failure */  \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+// Performance test case for the packed symmetric matrix-vector routine (SPMV).
+template <typename ElemType>
+class SpmvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~SpmvPerformanceTest();
+
+    // PerformanceTest hooks, defined out of line below.
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    // Builds a test case for `fn` and runs it. The case is skipped, with a
+    // message on stderr, when a double precision routine is requested on a
+    // device without double support or when the resource check fails.
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        SpmvPerformanceTest<ElemType> testCase(fn, params);
+        BlasBase *blasBase = clMath::BlasBase::getInstance();
+        int opFactor = 1; //FIX-ME
+
+        const bool wantsDouble = (fn == FN_DSPMV);
+        if (wantsDouble && !blasBase->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!testCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        // A negative result is fatal; any other non-zero result is reported
+        // as the OpenCL version being slower.
+        const int ret = testCase.run(opFactor);
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    SpmvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType *AP_;              // host arrays
+    ElemType *X_;
+    ElemType *Y_;
+    ElemType *backY_;           // copy of Y_ taken in prepare()
+    cl_mem mobjAP_;             // OpenCL buffers created in prepare()
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    ElemType alpha, beta;
+    ::clMath::BlasBase *base_;
+};
+
+// Allocates the host arrays and converts the multipliers; the OpenCL buffer
+// handles start out NULL and are only created in prepare().
+template <typename ElemType>
+SpmvPerformanceTest<ElemType>::SpmvPerformanceTest(BlasFunction fn, TestParams *params)
+    : PerformanceTest(fn, (problem_size_t)((2 * ((params->N * params->N) + params->N)) * sizeof(ElemType))),
+      params_(*params), mobjAP_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    const size_t packedLen = ((params_.N * (params_.N + 1)) / 2) + params_.offA;
+    const size_t xLen = 1 + (params_.N - 1) * abs(params_.incx) + params_.offBX;
+    const size_t yLen = 1 + (params_.N - 1) * abs(params_.incy) + params_.offCY;
+
+    AP_ = new ElemType[packedLen];
+    X_ = new ElemType[xLen];
+    Y_ = new ElemType[yLen];
+    backY_ = new ElemType[yLen];
+
+    alpha = convertMultiplier<ElemType>(params_.alpha);
+    beta = convertMultiplier<ElemType>(params_.beta);
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Releases everything the test case owns: the four host arrays and any
+ * OpenCL buffer whose handle is non-NULL (the handles are assigned in
+ * prepare()).
+ */
+template <typename ElemType>
+SpmvPerformanceTest<ElemType>::~SpmvPerformanceTest()
+{
+    // delete[] accepts a null pointer, so the host arrays need no guards.
+    delete[] AP_;
+    delete[] X_;
+    delete[] backY_;
+    delete[] Y_;
+
+    // Only release buffer handles that were actually set.
+    if (mobjAP_ != NULL) {
+        clReleaseMemObject(mobjAP_);
+    }
+
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to run the test case:
+ * AP, X and Y must each fit in one allocation and together in global memory.
+ */
+template <typename ElemType> bool
+SpmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    if ((AP_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) {
+        return false;
+    }
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+
+    const size_t n = params->N;
+    const size_t elemsAP = (n * (n + 1)) / 2;
+    const size_t elemsX = 1 + (n - 1) * abs(params->incx);
+    const size_t elemsY = 1 + (n - 1) * abs(params->incy);
+
+    // Y is a separate buffer, so it needs its own per-allocation check too.
+    const bool eachFits = (elemsAP * sizeof(ElemType) < allocSize) &&
+                          (elemsX * sizeof(ElemType) < allocSize) &&
+                          (elemsY * sizeof(ElemType) < allocSize);
+    return eachFits && ((elemsAP + elemsX + elemsY) * sizeof(ElemType) < gmemSize);
+}
+
+// Populates the host operands, snapshots Y and creates the device buffers; returns 0 on success, -1 otherwise.
+template <typename ElemType> int
+SpmvPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t n = params_.N;
+    const size_t apElems = ((n * (n + 1)) / 2) + params_.offA;
+    const size_t xElems = 1 + (n - 1) * abs(params_.incx) + params_.offBX;
+    const size_t yElems = 1 + (n - 1) * abs(params_.incy) + params_.offCY;
+
+    randomSpmvMatrices(params_.order, params_.uplo, n, true, &alpha, AP_ + params_.offA,
+                       X_ + params_.offBX, params_.incx, true, &beta, Y_ + params_.offCY, params_.incy);
+    // backY_ keeps the starting Y; clblasPerfSingle() writes it back before timing.
+    memcpy(backY_, Y_, yElems * sizeof(ElemType));
+    mobjAP_ = base_->createEnqueueBuffer(AP_, apElems * sizeof(*AP_), 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, xElems * sizeof(*X_), 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(Y_, yElems * sizeof(*Y_), 0, CL_MEM_READ_WRITE);
+    const bool allCreated = (mobjAP_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL);
+    return allCreated ? 0 : -1;
+}
+
+/*
+ * Times one call of the reference clMath::blas::spmv(); without
+ * PERF_TEST_WITH_ACML nothing is measured and 0 is returned.
+ */
+template <typename ElemType> nano_time_t
+SpmvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    nano_time_t elapsed = 0;
+    clblasOrder order = params_.order;
+    clblasUplo fUplo = params_.uplo;
+
+#ifdef PERF_TEST_WITH_ACML
+    // The reference is always called as column-major, with the triangle flipped for row-major input.
+    if (order != clblasColumnMajor) {
+        order = clblasColumnMajor;
+        fUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::spmv(order, fUplo, params_.N, alpha, AP_, params_.offA,
+                       X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Times the clBLAS spmv routine: restores Y on the device from backY_,
+ * enqueues the routine `iter` times, drains the queue and returns the average
+ * time per call (NANOTIME_ERR on failure). Returned events are released.
+ */
+template <typename ElemType> nano_time_t
+SpmvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    size_t lenY = 1 + (params_.N-1) * abs(params_.incy);
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  (lenY + params_.offCY )* sizeof(ElemType), backY_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector Y buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is ours to release, whether or not the wait succeeded.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+#define TIMING
+#ifdef TIMING
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::spmv(params_.order, params_.uplo, params_.N, alpha, mobjAP_, params_.offA,
+                    mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy,
+                    1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SPMV function failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    // Each iteration hands back a fresh event; drop it before the next call.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+
+TEST_P(SPMV, sspmv)
+{
+    // Runs the cl_float SPMV case for the current parameter set.
+    TestParams testArgs;
+    getParams(&testArgs);
+    SpmvPerformanceTest<cl_float>::runInstance(FN_SSPMV, &testArgs);
+}
+
+TEST_P(SPMV, dspmv)
+{
+    // Runs the cl_double SPMV case for the current parameter set.
+    TestParams testArgs;
+    getParams(&testArgs);
+    SpmvPerformanceTest<cl_double>::runInstance(FN_DSPMV, &testArgs);
+}
+
diff --git a/src/tests/performance/perf-spr.cpp b/src/tests/performance/perf-spr.cpp
new file mode 100644
index 0000000..2ce62eb
--- /dev/null
+++ b/src/tests/performance/perf-spr.cpp
@@ -0,0 +1,337 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <spr.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {  /* ret < 0: fatal assertion; any other non-zero ret: soft failure */  \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+// Performance test case for the packed symmetric rank-1 update routine (SPR).
+template <typename ElemType>
+class SprPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~SprPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    // Builds a test case for `fn` and runs it unless the device lacks the
+    // double precision support the routine needs or resources are short.
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        SprPerformanceTest<ElemType> testCase(fn, params);
+        BlasBase *blasBase = clMath::BlasBase::getInstance();
+        int opFactor = 1;
+        int ret = 0;
+
+        const bool wantsDouble = (fn == FN_DSPR);
+        if (wantsDouble && !blasBase->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (testCase.areResourcesSufficient(params)) {
+            ret = testCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // After a skip ret is still 0, so both checks below pass.
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    SprPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType *AP_;
+    ElemType *X_;
+    ElemType *backAP_;
+    cl_mem mobjAP_;
+    cl_mem mobjX_;
+    ::clMath::BlasBase *base_;
+};
+
+// Allocates host storage only; the OpenCL buffers are created in prepare().
+template <typename ElemType>
+SprPerformanceTest<ElemType>::SprPerformanceTest(BlasFunction fn, TestParams *params)
+    : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2) * sizeof(ElemType))),
+      params_(*params), mobjAP_(NULL), mobjX_(NULL)
+{
+    const size_t packedLen = ((params_.N * (params_.N + 1)) / 2) + params_.offa;
+    AP_ = new ElemType[packedLen];
+    X_ = new ElemType[1 + (params_.N - 1) * abs(params_.incx) + params_.offBX];
+    backAP_ = new ElemType[packedLen];
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Releases everything the test case owns: the three host arrays and any
+ * OpenCL buffer whose handle is non-NULL.
+ *
+ * The handles are NULL-initialized by the constructor and set in prepare().
+ */
+template <typename ElemType>
+SprPerformanceTest<ElemType>::~SprPerformanceTest()
+{
+    // delete[] accepts a null pointer, so the host arrays need no guards.
+    delete[] AP_;
+    delete[] backAP_;
+    delete[] X_;
+
+    // Only release buffer handles that were actually set.
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+
+    if (mobjAP_ != NULL) {
+        clReleaseMemObject(mobjAP_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+SprPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // Nothing can run without the host arrays.
+    if ((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const size_t globalLimit = (size_t)blasBase->availGlobalMemSize( 0 );
+    const size_t singleLimit = (size_t)blasBase->maxMemAllocSize();
+
+    const size_t n = params->N;
+    const size_t packedElems = (n * (n + 1)) / 2;
+    const size_t vecElems = 1 + (n - 1) * abs(params->incx);
+
+    // Per-allocation limit first, then the total (X counted twice) against global memory.
+    const bool eachFits = (sizeof(ElemType) * packedElems < singleLimit) &&
+                          (vecElems * sizeof(ElemType) < singleLimit);
+    return eachFits && ((packedElems + vecElems * 2) * sizeof(ElemType) < globalLimit);
+}
+
+/*
+ * Populates the host operands through randomSyrMatrices(), keeps a copy of
+ * the packed matrix in backAP_ and creates the two OpenCL buffers.
+ *
+ * alpha_ is converted from params_.alpha before the operands are generated.
+ * Returns 0 on success, -1 when either buffer handle comes back NULL.
+ */
+template <typename ElemType> int
+SprPerformanceTest<ElemType>::prepare(void)
+{
+    bool withAlpha = true;
+    const size_t apElems = ((params_.N * (params_.N + 1)) / 2) + params_.offa;
+    const size_t xElems = 1 + (params_.N - 1) * abs(params_.incx) + params_.offBX;
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+
+    // NOTE(review): the literal 0 sits where an lda-style argument would be
+    // expected; presumably unused for packed storage - confirm against the helper.
+    randomSyrMatrices(params_.order, params_.uplo, params_.N, withAlpha, &alpha_,
+                      AP_ + params_.offa, 0, X_ + params_.offBX, params_.incx);
+
+    // clblasPerfSingle() restores the device matrix from this copy.
+    memcpy(backAP_, AP_, apElems * sizeof(ElemType));
+
+    mobjAP_ = base_->createEnqueueBuffer(AP_, apElems * sizeof(*AP_), 0, CL_MEM_READ_WRITE);
+    mobjX_ = base_->createEnqueueBuffer(X_, xElems * sizeof(*X_), 0, CL_MEM_READ_ONLY);
+
+    // Report failure when either handle is NULL.
+    if ((mobjAP_ == NULL) || (mobjX_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Times one call of the reference clMath::blas::spr().
+ * Without PERF_TEST_WITH_ACML nothing is measured and 0 is returned;
+ * row-major input is rejected unless PERF_TEST_WITH_ROW_MAJOR is defined.
+ */
+template <typename ElemType> nano_time_t
+SprPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    nano_time_t elapsed = 0;
+    clblasOrder order = params_.order;
+    clblasUplo fUplo = params_.uplo;
+
+#ifdef PERF_TEST_WITH_ACML
+    // The reference is always called as column-major, with the triangle flipped for row-major input.
+    if (order != clblasColumnMajor) {
+        order = clblasColumnMajor;
+        fUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
+
+        if (params_.transA == clblasConjTrans) {
+            doConjugate(AP_ + params_.offa, (params_.N * (params_.N + 1)) / 2, 1, 1);
+        }
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::spr(order, fUplo, params_.N, alpha_, X_, params_.offBX, params_.incx, AP_, params_.offa);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Times the clBLAS spr routine: restores AP on the device from backAP_, enqueues the routine `iter` times,
+ * drains the queue and returns the average time per call (NANOTIME_ERR on failure). Returned events are released.
+ */
+template <typename ElemType> nano_time_t
+SprPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0,
+                                  (((( params_.N * (params_.N + 1)) / 2)) + params_.offa) *
+                                  sizeof(ElemType), backAP_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is ours to release, whether or not the wait succeeded.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+    int iter = 100;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::spr(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx,
+                mobjAP_, params_.offa, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SPR function failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+#ifdef TIMING
+    // Each iteration hands back a fresh event; drop it before the next call.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(SPR, sspr)
+{
+    // Runs the float SPR case for the current parameter set.
+    TestParams testArgs;
+    getParams(&testArgs);
+    SprPerformanceTest<float>::runInstance(FN_SSPR, &testArgs);
+}
+
+TEST_P(SPR, dspr)
+{
+    // Runs the double SPR case for the current parameter set.
+    TestParams testArgs;
+    getParams(&testArgs);
+    SprPerformanceTest<double>::runInstance(FN_DSPR, &testArgs);
+}
diff --git a/src/tests/performance/perf-spr2.cpp b/src/tests/performance/perf-spr2.cpp
new file mode 100644
index 0000000..1b57810
--- /dev/null
+++ b/src/tests/performance/perf-spr2.cpp
@@ -0,0 +1,336 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <spr2.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {  /* ret < 0: fatal assertion; any other non-zero ret: soft failure */  \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+// Performance test case for the packed symmetric rank-2 update routine (SPR2).
+template <typename ElemType>
+class Spr2PerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~Spr2PerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    // Builds a test case for `fn` and runs it unless the device lacks the
+    // double precision support the routine needs or resources are short.
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        Spr2PerformanceTest<ElemType> testCase(fn, params);
+        BlasBase *blasBase = clMath::BlasBase::getInstance();
+        int opFactor = 1;
+        int ret = 0;
+
+        const bool wantsDouble = (fn == FN_DSPR2);
+        if (wantsDouble && !blasBase->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (testCase.areResourcesSufficient(params)) {
+            ret = testCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // After a skip ret is still 0, so both checks below pass.
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    Spr2PerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType *AP_;
+    ElemType *X_;
+    ElemType *Y_;
+    ElemType *backAP_;
+    cl_mem mobjAP_;
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    ::clMath::BlasBase *base_;
+};
+
+// Allocates host storage only; the OpenCL buffers are created in prepare().
+template <typename ElemType>
+Spr2PerformanceTest<ElemType>::Spr2PerformanceTest(BlasFunction fn, TestParams *params)
+    : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 3) * sizeof(ElemType))),
+      params_(*params), mobjAP_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    const size_t packedLen = ((params_.N * (params_.N + 1)) / 2) + params_.offa;
+    AP_ = new ElemType[packedLen];
+    X_ = new ElemType[1 + (params_.N - 1) * abs(params_.incx) + params_.offBX];
+    Y_ = new ElemType[1 + (params_.N - 1) * abs(params_.incy) + params_.offCY];
+    backAP_ = new ElemType[packedLen];
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Releases everything the test case owns: the four host arrays and any
+ * OpenCL buffer whose handle is non-NULL.
+ *
+ * The buffer handles are NULL-initialized by the constructor and assigned
+ * in prepare(), so a case that was never prepared releases no buffers.
+ */
+template <typename ElemType>
+Spr2PerformanceTest<ElemType>::~Spr2PerformanceTest()
+{
+    // delete[] accepts a null pointer, so the host arrays need no guards.
+    delete[] AP_;
+    delete[] backAP_;
+    delete[] X_;
+    delete[] Y_;
+
+    // Only release buffer handles that were actually set.
+    if (mobjX_ != NULL)
+    {
+        clReleaseMemObject(mobjX_);
+    }
+
+    if (mobjY_ != NULL)
+    {
+        clReleaseMemObject(mobjY_);
+    }
+
+    if (mobjAP_ != NULL)
+    {
+        clReleaseMemObject(mobjAP_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+Spr2PerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    if ((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL) || (Y_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const size_t globalLimit = (size_t)blasBase->availGlobalMemSize(0);
+    const size_t singleLimit = (size_t)blasBase->maxMemAllocSize();
+
+    const size_t n = params->N;
+    const size_t packedBytes = (sizeof(ElemType) * (params_.N * (params_.N + 1))) / 2;
+    const size_t packedElems = (params_.N * (params_.N + 1)) / 2;
+    const size_t xElems = 1 + (n - 1) * abs(params->incx);
+    const size_t yElems = 1 + (n - 1) * abs(params->incy);
+
+    const bool eachFits = (packedBytes < singleLimit) &&
+                          (xElems * sizeof(ElemType) < singleLimit) &&
+                          (yElems * sizeof(ElemType) < singleLimit);   // per-allocation limit
+    return eachFits && ((packedElems + xElems + yElems) * sizeof(ElemType) < globalLimit);
+}
+
+// Populates the host operands, copies AP into backAP_ and creates the device buffers; returns 0 on success, -1 otherwise.
+template <typename ElemType> int
+Spr2PerformanceTest<ElemType>::prepare(void)
+{
+    bool useAlpha = true;
+    size_t lenAP = ((params_.N * (params_.N + 1)) / 2) + params_.offa;
+    size_t lenX = 1 + (params_.N-1) * abs(params_.incx) + params_.offBX;
+    size_t lenY = 1 + (params_.N-1) * abs(params_.incy) + params_.offCY;
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    // Generate at the offsets the routines read from, so the operands they use are fully initialized.
+    randomSyr2Matrices(params_.order, params_.uplo, params_.N, useAlpha, &alpha_, (AP_ + params_.offa), 0,
+                       (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy);
+    memcpy(backAP_, AP_, lenAP * sizeof(ElemType));
+    mobjAP_ = base_->createEnqueueBuffer(AP_, lenAP * sizeof(*AP_), 0, CL_MEM_READ_WRITE);
+    mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(*X_), 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(Y_, lenY * sizeof(*Y_), 0, CL_MEM_READ_ONLY);
+    return ((mobjAP_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1;
+}
+
+/*
+ * Times one call of the reference clMath::blas::spr2().
+ * Without PERF_TEST_WITH_ACML nothing is measured and 0 is returned;
+ * row-major input is rejected unless PERF_TEST_WITH_ROW_MAJOR is defined.
+ */
+template <typename ElemType> nano_time_t
+Spr2PerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    nano_time_t elapsed = 0;
+    clblasOrder order = params_.order;
+    clblasUplo fUplo = params_.uplo;
+
+#ifdef PERF_TEST_WITH_ACML
+    // The reference is always called as column-major; for row-major input
+    // the triangle is flipped and doConjugate() is applied to X first.
+    if (order != clblasColumnMajor) {
+        order = clblasColumnMajor;
+        fUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
+        doConjugate(X_ + params_.offBX, 1 + (params_.N - 1) * abs(params_.incx), 1, 1);
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::spr2(order, fUplo, params_.N, alpha_, X_, params_.offBX, params_.incx, Y_,
+                       params_.offCY, params_.incy, AP_, params_.offa);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Times the clBLAS spr2 routine: restores AP on the device from backAP_, enqueues the routine `iter` times,
+ * drains the queue and returns the average time per call (NANOTIME_ERR on failure). Returned events are released.
+ */
+template <typename ElemType> nano_time_t
+Spr2PerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0,
+                                  ((( params_.N*( params_.N + 1 ) )/2 ) + params_.offa) *
+                                  sizeof(ElemType), backAP_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is ours to release, whether or not the wait succeeded.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+    int iter = 100;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::spr2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx,
+                mobjY_, params_.offCY, params_.incy, mobjAP_, params_.offa, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SPR2 function failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+#ifdef TIMING
+    // Each iteration hands back a fresh event; drop it before the next call.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+TEST_P(SPR2, sspr2)
+{
+    // Runs the float SPR2 case for the current parameter set.
+    TestParams testArgs;
+    getParams(&testArgs);
+    Spr2PerformanceTest<float>::runInstance(FN_SSPR2, &testArgs);
+}
+
+TEST_P(SPR2, dspr2)
+{
+    // Runs the double SPR2 case for the current parameter set.
+    TestParams testArgs;
+    getParams(&testArgs);
+    Spr2PerformanceTest<double>::runInstance(FN_DSPR2, &testArgs);
+}
diff --git a/src/tests/performance/perf-swap.cpp b/src/tests/performance/perf-swap.cpp
new file mode 100644
index 0000000..78f68af
--- /dev/null
+++ b/src/tests/performance/perf-swap.cpp
@@ -0,0 +1,352 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * SWAP performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <swap.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class SwapPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~SwapPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        SwapPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        opFactor =1;
+
+        if (((fn == FN_DSWAP) || (fn == FN_ZSWAP)) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    SwapPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType *X_;
+    ElemType *Y_;
+    ElemType *blasX_;
+    ElemType *blasY_;
+    cl_mem mobjX_;
+    cl_mem mobjY_;
+    size_t  lengthX;
+    size_t  lengthY;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+SwapPerformanceTest<ElemType>::SwapPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (4 * params->N)  * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL)
+{
+    // Each host vector spans 1 + (N - 1) * |inc| elements and is allocated together with its leading offset.
+    X_ = blasX_ = NULL;
+    Y_ = blasY_ = NULL;
+    lengthX = 1 + (params->N - 1) * abs(params_.incx);
+    lengthY = 1 + (params->N - 1) * abs(params_.incy);
+
+    try
+    {
+        X_     = new ElemType[lengthX + params_.offBX];
+        blasX_ = new ElemType[lengthX + params_.offBX];
+        Y_     = new ElemType[lengthY + params_.offCY];
+        blasY_ = new ElemType[lengthY + params_.offCY];
+    }
+    catch(bad_alloc&) {
+        // Arrays allocated before the failure must be freed here; delete[] on a NULL pointer is a no-op.
+        delete[] X_; delete[] blasX_; delete[] Y_; delete[] blasY_;
+        X_ = Y_ = blasX_ = blasY_ = NULL;     // areResourcesSufficient() will handle the rest and return
+    }
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType>
+SwapPerformanceTest<ElemType>::~SwapPerformanceTest()
+{
+	if(X_ != NULL)
+    {
+        delete[] X_;
+	}
+	if(Y_ != NULL)
+    {
+        delete[] Y_;
+	}
+	if(blasX_ != NULL)
+    {
+        delete[] blasX_;
+	}
+	if(blasY_ != NULL)
+    {
+        delete[] blasY_;
+	}
+    if( mobjX_ != NULL )
+    {
+		clReleaseMemObject(mobjX_);
+    }
+    if( mobjY_ != NULL )
+    {
+		clReleaseMemObject(mobjY_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+SwapPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // Any missing host array means the test cannot be run at all.
+    if (!X_ || !blasX_ || !Y_ || !blasY_) {
+        return false;
+    }
+
+    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();
+    const size_t globalBytes = (size_t)blasBase->availGlobalMemSize( 0 );
+    const size_t maxAllocBytes = (size_t)blasBase->maxMemAllocSize();
+
+    // Byte sizes of the X and Y vectors, leading offsets included.
+    const size_t bytesX = (lengthX + params->offBX) * sizeof(ElemType);
+    const size_t bytesY = (lengthY + params->offCY) * sizeof(ElemType);
+
+    // Each vector has to fit into a single device allocation ...
+    if ((bytesX >= maxAllocBytes) || (bytesY >= maxAllocBytes)) {
+        return false;
+    }
+    // ... and both together into the available global memory.
+    return (bytesX + bytesY) < globalBytes;
+}
+
+template <typename ElemType> int
+SwapPerformanceTest<ElemType>::prepare(void)
+{
+    // Populate X_/Y_ through randomVectors(), mirror them into blasX_/blasY_ and create the device buffers.
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    randomVectors(params_.N, (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy);
+    memcpy(blasX_, X_, (lengthX + params_.offBX)* sizeof(ElemType));
+    memcpy(blasY_, Y_, (lengthY + params_.offCY)* sizeof(ElemType));
+    // blasX_/blasY_ are the host copies handed to clMath::blas::swap() in etalonPerfSingle().
+	mobjX_ = base_->createEnqueueBuffer(X_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+	mobjY_ = base_->createEnqueueBuffer(Y_, ((lengthY + params_.offCY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
+    // 0 when both buffer handles are non-NULL, -1 otherwise.
+    return ((mobjX_ != NULL) && (mobjY_ != NULL))? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+SwapPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    // The result stays 0 unless the build defines PERF_TEST_WITH_ACML.
+#ifdef PERF_TEST_WITH_ACML
+    // Time a single host-side clMath::blas::swap() call on the blasX_/blasY_ copies.
+		time = getCurrentTime();
+		clMath::blas::swap(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy);
+		time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+// Times clMath::clblas::swap() on the device buffers; returns the averaged time, or NANOTIME_ERR on failure.
+template <typename ElemType> nano_time_t
+SwapPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    // Select the DataType tag that matches ElemType.
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+										( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+    // Blocking uploads of the host vectors X_ and Y_ into their device buffers.
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  (lengthX + params_.offBX) * sizeof(ElemType), X_, 0, NULL, &event);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vactor X buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
+                                  (lengthY + params_.offCY) * sizeof(ElemType), Y_, 0, NULL, &event);
+    if (status != CL_SUCCESS)
+    {
+        cerr << "Vector Y buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    if (status != CL_SUCCESS)
+    {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    event = NULL;
+
+#define TIMING
+#ifdef TIMING
+    // Drain the queue before starting the clock so that only the swap calls below are measured.
+    clFinish( queue);
+    time = getCurrentTime();
+    int iter = 100;
+    for ( int i=1; i <= iter; i++)
+    {
+#endif
+        status = (cl_int)clMath::clblas::swap(type, params_.N, mobjX_, params_.offBX, params_.incx,
+                             mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event);
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS SWAP function failed, status = " <<
+                    status << endl;
+
+            return NANOTIME_ERR;
+        }
+#ifdef TIMING
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// swap performance test
+TEST_P(SWAPXY, sswap)
+{
+    TestParams params;
+
+    getParams(&params);
+    SwapPerformanceTest<float>::runInstance(FN_SSWAP, &params);
+}
+
+
+TEST_P(SWAPXY, dswap)
+{
+    TestParams params;
+
+    getParams(&params);
+    SwapPerformanceTest<double>::runInstance(FN_DSWAP, &params);
+}
+
+TEST_P(SWAPXY, cswap)
+{
+    TestParams params;
+
+    getParams(&params);
+    SwapPerformanceTest<FloatComplex>::runInstance(FN_CSWAP, &params);
+}
+
+
+TEST_P(SWAPXY, zswap)
+{
+    TestParams params;
+
+    getParams(&params);
+    SwapPerformanceTest<DoubleComplex>::runInstance(FN_ZSWAP, &params);
+}
diff --git a/src/tests/performance/perf-symm.cpp b/src/tests/performance/perf-symm.cpp
new file mode 100644
index 0000000..b64a8a3
--- /dev/null
+++ b/src/tests/performance/perf-symm.cpp
@@ -0,0 +1,404 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Symm performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <symm.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+//#define SHUNT_ACML_RUN
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class SymmPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~SymmPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        SymmPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        if (fn == FN_SSYMM || fn == FN_DSYMM) {
+            opFactor = 2;
+        }
+        else {
+            opFactor = 8;
+        }
+
+        if ((fn == FN_DSYMM || fn == FN_ZSYMM) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    SymmPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType beta_;
+    ElemType *A_;
+    ElemType *B_;
+    ElemType *C_;
+    ElemType *backC_;
+    cl_mem mobjA_;
+    cl_mem mobjB_;
+    cl_mem mobjC_;
+    size_t ka, kbc;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+SymmPerformanceTest<ElemType>::SymmPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,
+								(problem_size_t) ( params->M * params->N * ( (params->side == clblasLeft)? params->M : params->N ) ) ),
+                        params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL)
+{
+	if( params_.side == clblasLeft )
+                ka = params_.M;
+        else    ka = params_.N;
+
+	if( params_.order == clblasColumnMajor )
+				kbc = params_.N;
+		else	kbc = params_.M;
+
+	A_ = new ElemType[params_.lda * ka + params_.offa];
+    B_ = new ElemType[params_.ldb * kbc + params_.offb];
+    C_ = new ElemType[params_.ldc * kbc + params_.offc];
+    backC_ = new ElemType[params_.ldc * kbc + params_.offc];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType>
+SymmPerformanceTest<ElemType>::~SymmPerformanceTest()
+{
+    if(A_ != NULL)
+    {
+    delete[] A_;
+    }
+	if(B_ != NULL)
+	{
+    delete[] B_;
+	}
+	if(C_ != NULL)
+	{
+    delete[] C_;
+	}
+	if(backC_ != NULL)
+	{
+    delete[] backC_;
+	}
+
+	if( mobjC_ != NULL )
+	    clReleaseMemObject(mobjC_);
+    if( mobjB_ != NULL )
+		clReleaseMemObject(mobjB_);
+	if( mobjA_ != NULL )
+	    clReleaseMemObject(mobjA_);
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+SymmPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t sizeA, sizeB, sizeC;
+    size_t m = params->M, n = params->N;
+
+    if((A_ == NULL) || (backC_ == NULL) || (C_ == NULL) || (B_ == NULL))
+    {
+        return false;	// Not enough memory for host arrays
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    // Per-matrix bounds in bytes; their sum must also be in bytes before it is compared with gmemSize.
+    sizeA = std::max(m, n) * params_.lda * sizeof(ElemType);
+    sizeB = std::max(m, n) * params_.ldb * sizeof(ElemType);
+    sizeC = std::max(m, n) * params_.ldc * sizeof(ElemType);
+    return (sizeA < allocSize) && (sizeB < allocSize) && (sizeC < allocSize) &&
+           ((sizeA + sizeB + sizeC) < gmemSize);
+}
+
+template <typename ElemType> int
+SymmPerformanceTest<ElemType>::prepare(void)
+{
+    bool useAlpha = base_->useAlpha();
+    bool useBeta = base_->useBeta();
+    // alpha_/beta_ are only assigned when base_ reports that the corresponding multiplier is in use.
+    if (useAlpha) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+    if (useBeta) {
+        beta_ = convertMultiplier<ElemType>(params_.beta);
+    }
+
+    // Flags for populate(): RANDOM_INIT always, ROW_MAJOR_ORDER when requested, plus a half selector for A.
+	int creationFlags = 0;
+    int AcreationFlags = 0;
+    creationFlags =  creationFlags | RANDOM_INIT;
+
+    creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
+    AcreationFlags = ( (params_.uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY);
+	BlasRoutineID BlasFn = CLBLAS_SYMM;
+    // A is populated as ka x ka, B and C as M x N; backC_ keeps a copy of the freshly populated C_.
+	populate( A_ + params_.offa, ka, ka, params_.lda, BlasFn, (AcreationFlags ));
+	populate( B_ + params_.offb, params_.M, params_.N, params_.ldb, BlasFn, creationFlags );
+	populate( C_ + params_.offc, params_.M, params_.N, params_.ldc, BlasFn, creationFlags );
+	memcpy( backC_, C_, (kbc * params_.ldc + params_.offc) * sizeof(ElemType) );
+    // The buffers are created in a chain: B only if A succeeded, C only if B succeeded.
+		mobjA_ = base_->createEnqueueBuffer(A_, (params_.lda * ka  + params_.offa) * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    if (mobjA_) {
+        mobjB_ = base_->createEnqueueBuffer(B_, (params_.ldb * kbc + params_.offb) * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
+    }
+    if (mobjB_) {
+        mobjC_ = base_->createEnqueueBuffer(backC_, (params_.ldc * kbc + params_.offc) * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
+    }
+    // A non-NULL mobjC_ therefore implies that all three buffers exist: 0 on success, -1 otherwise.
+    return (mobjC_) ? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+SymmPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+    clblasUplo fUplo;
+	clblasSide fSide;
+	size_t lda, ldb, ldc, fN, fM;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    order = params_.order;
+	fUplo = params_.uplo;
+	fSide = params_.side;
+    lda = params_.lda;
+    ldb = params_.ldb;
+    ldc = params_.ldc;
+	fM = params_.M;
+	fN = params_.N;
+
+#ifdef PERF_TEST_WITH_ACML
+
+	if (order != clblasColumnMajor) {
+
+           order = clblasColumnMajor;
+		   fM = params_.N;
+           fN = params_.M;
+           fSide = (params_.side == clblasLeft)? clblasRight: clblasLeft;
+           fUplo = (params_.uplo == clblasUpper)? clblasLower: clblasUpper;
+       }
+
+
+    time = getCurrentTime();
+    #ifndef SHUNT_ACML_RUN
+    clMath::blas::symm(order, fSide, fUplo, fM, fN, alpha_,
+							A_, params_.offa, lda, B_, params_.offb, ldb, beta_, C_, params_.offc, ldc);
+    #endif
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+template <typename ElemType> nano_time_t
+SymmPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  (params_.ldc * kbc + params_.offc) * sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix C buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    event = NULL;
+	time = getCurrentTime();
+//#define TIMING
+#ifdef TIMING
+	clFinish( queue);
+
+	int iter = 20;
+	for ( int i = 1; i <= iter; i++)
+	{
+#endif
+    status = (cl_int)clMath::clblas::symm(params_.order,
+        params_.side, params_.uplo, params_.M, params_.N, alpha_,
+        mobjA_, params_.offa, params_.lda, mobjB_, params_.offb, params_.ldb, beta_, mobjC_, params_.offc, params_.ldc, 1,
+        &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SYMM function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+#ifdef TIMING
+	} // iter loop
+	clFinish( queue);
+    time = getCurrentTime() - time;
+	time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+// ssymm performance test
+TEST_P(SYMM, ssymm)
+{
+    TestParams params;
+
+    getParams(&params);
+    SymmPerformanceTest<float>::runInstance(FN_SSYMM, &params);
+}
+
+
+TEST_P(SYMM, dsymm)
+{
+    TestParams params;
+
+    getParams(&params);
+    SymmPerformanceTest<double>::runInstance(FN_DSYMM, &params);
+}
+
+
+TEST_P(SYMM, csymm)
+{
+    TestParams params;
+
+    getParams(&params);
+    SymmPerformanceTest<FloatComplex>::runInstance(FN_CSYMM, &params);
+}
+
+
+TEST_P(SYMM, zsymm)
+{
+    TestParams params;
+
+    getParams(&params);
+    SymmPerformanceTest<DoubleComplex>::runInstance(FN_ZSYMM, &params);
+}
diff --git a/src/tests/performance/perf-symv.cpp b/src/tests/performance/perf-symv.cpp
new file mode 100644
index 0000000..2ea9544
--- /dev/null
+++ b/src/tests/performance/perf-symv.cpp
@@ -0,0 +1,351 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Symv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <symv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class SymvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~SymvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        SymvPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        opFactor = (fn == FN_SSYMV) ? sizeof(cl_float) : sizeof(cl_double);
+
+        if ((fn == FN_DSYMV) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
+                        std::endl;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    SymvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType alpha_;
+    ElemType beta_;
+    ElemType *A_;
+    ElemType *B_;
+    ElemType *C_;
+    ElemType *backC_;
+    cl_mem mobjA_;
+    cl_mem mobjB_;
+    cl_mem mobjC_;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+SymvPerformanceTest<ElemType>::SymvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)params->N * params->N),
+                        params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL)
+{
+    A_ = new ElemType[params_.rowsA * params_.columnsA];
+    B_ = new ElemType[params_.rowsB * params_.columnsB];
+    C_ = new ElemType[params_.rowsC * params_.columnsC];
+    backC_ = new ElemType[params_.rowsC * params_.columnsC];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType>
+SymvPerformanceTest<ElemType>::~SymvPerformanceTest()
+{
+    if(A_ != NULL)
+    {
+        delete[] A_;
+    }
+    if(B_ != NULL)
+    {
+        delete[] B_;
+    }
+    if(C_ != NULL)
+    {
+        delete[] C_;
+    }
+    if(backC_ != NULL)
+    {
+        delete[] backC_;
+    }
+    // Release the device buffers; each handle is guarded by its own NULL check.
+    if(mobjC_ != NULL)
+    {
+        clReleaseMemObject(mobjC_);
+    }
+    if(mobjB_ != NULL)
+    {
+        clReleaseMemObject(mobjB_);
+    }
+    if(mobjA_ != NULL)      // guard the handle that is actually released
+    {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+SymvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize, maxMatrSize;
+    size_t n = params->N;
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    if((A_ == NULL) || (backC_ == NULL) || (C_ == NULL) || (B_ == NULL))
+    {
+        return 0;   // Not enough memory for host arrays
+    }
+
+    // Budget one third of the available global memory (presumably one share per device buffer A, B, C) ...
+    maxMatrSize = gmemSize / 3;
+    // ... capped by the largest single allocation reported by the device.
+    maxMatrSize = std::min(maxMatrSize, allocSize);
+    // The N x N matrix has to be strictly smaller than that budget.
+    return (n * n * sizeof(ElemType) < maxMatrSize);
+}
+
+template <typename ElemType> int
+SymvPerformanceTest<ElemType>::prepare(void)
+{
+    size_t lenX = params_.N, lenY = params_.N;
+    bool useAlpha = base_->useAlpha();
+    bool useBeta = base_->useBeta();
+
+    if (useAlpha) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+    if (useBeta) {
+        beta_ = convertMultiplier<ElemType>(params_.beta);
+    }
+
+    randomGemmxMatrices<ElemType>(params_.order, params_.transA, params_.transB,
+                           params_.transC, lenY, params_.N, lenX, useAlpha,
+                           &alpha_, A_, params_.lda, B_, params_.ldb, useBeta,
+                           &beta_, C_, params_.ldc);
+    // backC_ is uploaded to mobjC_ and copied back into C_ by etalonPerfSingle(), so it must hold the generated data.
+    memcpy(backC_, C_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA *
+                                     sizeof(*A_), params_.offA * sizeof(*A_),
+                                     CL_MEM_READ_ONLY);
+    mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB *
+                                     sizeof(*B_), 0, CL_MEM_READ_ONLY);
+    mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC *
+                                     sizeof(*backC_), 0, CL_MEM_READ_WRITE);
+
+    return ((mobjA_ != NULL) && (mobjB_ != NULL) && (mobjC_ != NULL)) ? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+SymvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+    size_t lda;
+    // Row-major runs are rejected unless the build defines PERF_TEST_WITH_ROW_MAJOR.
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+    // Restore C_ from the backC_ copy.
+    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+    order = params_.order;
+    lda = params_.lda;
+
+#ifdef PERF_TEST_WITH_ACML
+    // No reference SYMV call is timed here; NANOTIME_MAX is reported instead.
+// #warning "SYMV performance test not implemented"
+    time = NANOTIME_MAX;
+    order = order;      // self-assignment; presumably only marks the variable as used
+    lda = lda;          // self-assignment; presumably only marks the variable as used
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+template <typename ElemType> nano_time_t
+SymvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  params_.rowsC * params_.columnsC *
+                                  sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector Y buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    event = NULL;
+
+//#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+
+    time = getCurrentTime();
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::symv(params_.order,
+        params_.uplo, params_.N, alpha_, mobjA_, params_.offA, params_.lda,
+        mobjB_, params_.offBX, params_.incx,
+        beta_, mobjC_, params_.offCY, params_.incy,
+        1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SYMV function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+// ssymv performance test
+TEST_P(SYMV, ssymv)
+{
+    TestParams params;
+
+    getParams(&params);
+    SymvPerformanceTest<float>::runInstance(FN_SSYMV, &params);
+}
+
+// dsymv performance test case
+TEST_P(SYMV, dsymv)
+{
+    TestParams params;
+
+    getParams(&params);
+    SymvPerformanceTest<double>::runInstance(FN_DSYMV, &params);
+}
diff --git a/src/tests/performance/perf-syr.cpp b/src/tests/performance/perf-syr.cpp
new file mode 100644
index 0000000..b67decf
--- /dev/null
+++ b/src/tests/performance/perf-syr.cpp
@@ -0,0 +1,340 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Syr performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <syr.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome `ret` of a performance run through gtest:
+ *   ret <  0  -> fatal assertion (resource allocation / OpenCL request failed)
+ *   ret != 0  -> non-fatal failure (the OpenCL version was slower)
+ * Because ASSERT_GE may return from the enclosing function, use this only
+ * inside functions returning void.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for SYR, parameterized by the element type.
+ * Instances are created only through runInstance(); the constructor is
+ * private.
+ */
+template <typename ElemType> class SyrPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~SyrPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for `fn` with `params`, run it and report the
+     * result through gtest. The case is skipped (with a message on stderr)
+     * when a double precision function is requested on a device without
+     * double support, or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        SyrPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        const int opFactor = 1;
+        BlasBase *base = clMath::BlasBase::getInstance();
+
+        const bool needsDouble = (fn == FN_DSYR);
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // Same assertions the macro at the top of this file spells out.
+        CHECK_RESULT(ret);
+    }
+
+private:
+    SyrPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;            // scalar multiplier, set in prepare()
+    ElemType *A_;               // host matrix A
+    ElemType *X_;               // host vector X
+    ElemType *backA_;           // copy of A_ taken in prepare()
+    cl_mem mobjA_;              // device buffer for A (NULL until prepare())
+    cl_mem mobjX_;              // device buffer for X (NULL until prepare())
+    ::clMath::BlasBase *base_;  // shared BlasBase instance
+};
+
+/*
+ * Copy the parameters and allocate the host arrays. A_ and backA_ hold
+ * N * lda + offa elements; X_ holds 1 + (N - 1) * |incx| + offBX elements.
+ * The device buffers stay NULL until prepare().
+ */
+template <typename ElemType>
+SyrPerformanceTest<ElemType>::SyrPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2 ) * sizeof(ElemType))),
+                          params_(*params), mobjA_(NULL), mobjX_(NULL)
+{
+    const size_t elemsA = params_.N * params_.lda + params_.offa;
+    const size_t elemsX = 1 + (params_.N-1) * abs(params_.incx) + params_.offBX;
+
+    A_ = new ElemType[elemsA];
+    X_ = new ElemType[elemsX];
+    backA_ = new ElemType[elemsA];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host arrays and release whichever device buffers were created.
+ */
+template <typename ElemType>
+SyrPerformanceTest<ElemType>::~SyrPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so no guards are needed here.
+    delete[] A_;
+    delete[] backA_;
+    delete[] X_;
+
+    // The buffers are still NULL when prepare() never ran or failed.
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * The sizes checked here are the sizes of the buffers prepare() creates,
+ * i.e. they include the offa / offBX offsets. Returns false when a host
+ * array is missing, when a single buffer would reach the maximum
+ * allocation size, or when the total would reach the available global
+ * memory.
+ */
+template <typename ElemType> bool
+SyrPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t n = params->N;
+
+    if ((A_ == NULL) || (backA_ == NULL) || (X_ == NULL)) {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    // Element counts of the device buffers, offsets included.
+    const size_t lenA = n * params->lda + params->offa;
+    const size_t lenX = 1 + (n - 1) * abs(params->incx) + params->offBX;
+
+    // Individual allocations.
+    bool suff = (lenA * sizeof(ElemType) < allocSize) &&
+                (lenX * sizeof(ElemType) < allocSize);
+
+    // Total global allocations; the vector is counted twice, as before.
+    suff = suff && (((lenA + lenX * 2) * sizeof(ElemType)) < gmemSize);
+
+    return suff;
+}
+
+/*
+ * Generate random input data, keep a copy of A in backA_ and create the
+ * device buffers for A and X. Returns 0 on success and -1 when either
+ * buffer could not be created.
+ *
+ * NOTE(review): the generator receives A_ and X_ without the offa / offBX
+ * offsets although the buffers are sized for them - confirm that the
+ * trailing elements are meant to stay unset.
+ */
+template <typename ElemType> int
+SyrPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t elemsA = params_.N * params_.lda + params_.offa;
+    const size_t elemsX = 1 + (params_.N-1) * abs(params_.incx) + params_.offBX;
+    bool useAlpha = true;
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+
+    randomSyrMatrices( params_.order, params_.uplo, params_.N, useAlpha, &alpha_, A_, params_.lda, X_, params_.incx);
+
+    memcpy(backA_, A_, elemsA * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, elemsA * sizeof(*A_), 0, CL_MEM_READ_WRITE);
+    mobjX_ = base_->createEnqueueBuffer(X_, elemsX * sizeof(*X_), 0, CL_MEM_READ_ONLY);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time one run of the reference (host) SYR implementation.
+ *
+ * Returns NANOTIME_ERR for row-major input unless PERF_TEST_WITH_ROW_MAJOR
+ * is defined. Without PERF_TEST_WITH_ACML no reference call is made and 0
+ * is returned.
+ */
+template <typename ElemType> nano_time_t
+SyrPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+#ifdef PERF_TEST_WITH_ACML
+    clblasOrder refOrder = params_.order;
+    clblasUplo refUplo = params_.uplo;
+    size_t lda = params_.lda;
+
+    // The reference call is always made column-major; for any other order
+    // the triangle selector is flipped.
+    if (refOrder != clblasColumnMajor) {
+        refOrder = clblasColumnMajor;
+        if (params_.uplo == clblasUpper) {
+            refUplo = clblasLower;
+        }
+        else {
+            refUplo = clblasUpper;
+        }
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::syr(refOrder, refUplo, params_.N, alpha_, X_, params_.offBX, params_.incx, A_, params_.offa, lda);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS SYR call on the first command queue.
+ *
+ * The device copy of A is first rewritten from backA_. With TIMING defined
+ * (it is defined just below) the call is enqueued `iter` times between two
+ * clFinish() calls and the average wall-clock time per call is returned;
+ * otherwise a single call is enqueued, flushed and waited on.
+ *
+ * Every event returned by OpenCL/clBLAS in the timed path is released
+ * here, so repeated calls do not leak event objects.
+ *
+ * Returns NANOTIME_ERR on any OpenCL or clBLAS failure.
+ */
+template <typename ElemType> nano_time_t
+SyrPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
+                                  ((params_.N * params_.lda) + params_.offa) *
+                                  sizeof(ElemType), backA_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event belongs to us; release it on both the success and
+    // the failure path instead of just dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+
+    int iter = 100;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::syr(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx,
+				mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SYR function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    // Completion is awaited with clFinish() below, so the per-iteration
+    // event is not needed; release it before the next call overwrites it.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+// ssyr performance test case: fetches the parameters of the current
+// value-parameterized instance and runs the float instantiation of the
+// SYR performance test with FN_SSYR.
+TEST_P(SYR, ssyr)
+{
+    TestParams params;
+
+    getParams(&params);
+    SyrPerformanceTest<float>::runInstance(FN_SSYR, &params);
+}
+
+// dsyr performance test case: fetches the parameters of the current
+// value-parameterized instance and runs the double instantiation of the
+// SYR performance test with FN_DSYR.
+TEST_P(SYR, dsyr)
+{
+    TestParams params;
+
+    getParams(&params);
+    SyrPerformanceTest<double>::runInstance(FN_DSYR, &params);
+}
diff --git a/src/tests/performance/perf-syr2.cpp b/src/tests/performance/perf-syr2.cpp
new file mode 100644
index 0000000..d8ff199
--- /dev/null
+++ b/src/tests/performance/perf-syr2.cpp
@@ -0,0 +1,342 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Syr2 performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <syr2.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome `ret` of a performance run through gtest:
+ *   ret <  0  -> fatal assertion (resource allocation / OpenCL request failed)
+ *   ret != 0  -> non-fatal failure (the OpenCL version was slower)
+ * Because ASSERT_GE may return from the enclosing function, use this only
+ * inside functions returning void.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for SYR2, parameterized by the element type.
+ * Instances are created only through runInstance(); the constructor is
+ * private.
+ */
+template <typename ElemType> class Syr2PerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~Syr2PerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for `fn` with `params`, run it and report the
+     * result through gtest. The case is skipped (with a message on stderr)
+     * when a double precision function is requested on a device without
+     * double support, or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        Syr2PerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        const int opFactor = 1;
+        BlasBase *base = clMath::BlasBase::getInstance();
+
+        const bool needsDouble = (fn == FN_DSYR2);
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // Same assertions the macro at the top of this file spells out.
+        CHECK_RESULT(ret);
+    }
+
+private:
+    Syr2PerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;            // scalar multiplier, set in prepare()
+    ElemType *A_;               // host matrix A
+    ElemType *X_;               // host vector X
+    ElemType *Y_;               // host vector Y
+    ElemType *backA_;           // copy of A_ taken in prepare()
+    cl_mem mobjA_;              // device buffer for A (NULL until prepare())
+    cl_mem mobjX_;              // device buffer for X (NULL until prepare())
+    cl_mem mobjY_;              // device buffer for Y (NULL until prepare())
+    ::clMath::BlasBase *base_;  // shared BlasBase instance
+};
+
+/*
+ * Copy the parameters and allocate the host arrays. A_ and backA_ hold
+ * N * lda + offa elements; X_ and Y_ hold 1 + (N - 1) * |inc| elements
+ * plus their offBX / offCY offset. The device buffers stay NULL until
+ * prepare().
+ */
+template <typename ElemType>
+Syr2PerformanceTest<ElemType>::Syr2PerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + (params->N)) * 3 ) * sizeof(ElemType))),
+	params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL)
+{
+    const size_t elemsA = params_.N * params_.lda + params_.offa;
+    const size_t elemsX = 1 + (params_.N-1) * abs(params_.incx) + params_.offBX;
+    const size_t elemsY = 1 + (params_.N-1) * abs(params_.incy) + params_.offCY;
+
+    A_ = new ElemType[elemsA];
+    X_ = new ElemType[elemsX];
+    Y_ = new ElemType[elemsY];
+    backA_ = new ElemType[elemsA];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host arrays and release whichever device buffers were created.
+ */
+template <typename ElemType>
+Syr2PerformanceTest<ElemType>::~Syr2PerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so no guards are needed here.
+    delete[] A_;
+    delete[] backA_;
+    delete[] X_;
+    delete[] Y_;
+
+    // The buffers are still NULL when prepare() never ran or failed.
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjY_ != NULL) {
+        clReleaseMemObject(mobjY_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * The sizes checked here are the sizes of the buffers prepare() creates,
+ * i.e. they include the offa / offBX / offCY offsets. Returns false when a
+ * host array is missing, when a single buffer would reach the maximum
+ * allocation size, or when the total would reach the available global
+ * memory.
+ */
+template <typename ElemType> bool
+Syr2PerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t n = params->N;
+
+    if ((A_ == NULL) || (backA_ == NULL) || (X_ == NULL) || (Y_ == NULL)) {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    // Element counts of the device buffers, offsets included.
+    const size_t lenA = n * params->lda + params->offa;
+    const size_t lenX = 1 + (n - 1) * abs(params->incx) + params->offBX;
+    const size_t lenY = 1 + (n - 1) * abs(params->incy) + params->offCY;
+
+    // Individual allocations.
+    bool suff = (lenA * sizeof(ElemType) < allocSize) &&
+                (lenX * sizeof(ElemType) < allocSize) &&
+                (lenY * sizeof(ElemType) < allocSize);
+
+    // Total global allocations.
+    suff = suff && (((lenA + lenX + lenY) * sizeof(ElemType)) < gmemSize);
+
+    return suff;
+}
+
+/*
+ * Generate random input data, keep a copy of A in backA_ and create the
+ * device buffers for A, X and Y. Returns 0 on success and -1 when any
+ * buffer could not be created.
+ *
+ * NOTE(review): the generator receives A_, X_ and Y_ without their offsets
+ * although the buffers are sized for them - confirm that the trailing
+ * elements are meant to stay unset.
+ */
+template <typename ElemType> int
+Syr2PerformanceTest<ElemType>::prepare(void)
+{
+    const size_t elemsA = params_.N * params_.lda + params_.offa;
+    const size_t elemsX = 1 + (params_.N-1) * abs(params_.incx) + params_.offBX;
+    const size_t elemsY = 1 + (params_.N-1) * abs(params_.incy) + params_.offCY;
+    bool useAlpha = true;
+
+    alpha_ = convertMultiplier<ElemType>(params_.alpha);
+
+    randomSyr2Matrices( params_.order, params_.uplo, params_.N, useAlpha, &alpha_, A_, params_.lda, X_, params_.incx, Y_, params_.incy);
+
+    memcpy(backA_, A_, elemsA * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, elemsA * sizeof(*A_), 0, CL_MEM_READ_WRITE);
+    mobjX_ = base_->createEnqueueBuffer(X_, elemsX * sizeof(*X_), 0, CL_MEM_READ_ONLY);
+    mobjY_ = base_->createEnqueueBuffer(Y_, elemsY * sizeof(*Y_), 0, CL_MEM_READ_ONLY);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL) || (mobjY_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time one run of the reference (host) SYR2 implementation.
+ *
+ * Returns NANOTIME_ERR for row-major input unless PERF_TEST_WITH_ROW_MAJOR
+ * is defined. Without PERF_TEST_WITH_ACML no reference call is made and 0
+ * is returned.
+ */
+template <typename ElemType> nano_time_t
+Syr2PerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+#ifdef PERF_TEST_WITH_ACML
+    clblasOrder refOrder = params_.order;
+    clblasUplo refUplo = params_.uplo;
+    size_t lda = params_.lda;
+
+    // The reference call is always made column-major; for any other order
+    // the triangle selector is flipped.
+    if (refOrder != clblasColumnMajor) {
+        refOrder = clblasColumnMajor;
+        if (params_.uplo == clblasUpper) {
+            refUplo = clblasLower;
+        }
+        else {
+            refUplo = clblasUpper;
+        }
+    }
+
+    const nano_time_t start = getCurrentTime();
+    clMath::blas::syr2(refOrder, refUplo, params_.N, alpha_, X_, params_.offBX, params_.incx, Y_,
+                       params_.offCY, params_.incy, A_, params_.offa, lda);
+    elapsed = getCurrentTime() - start;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS SYR2 call on the first command queue.
+ *
+ * The device copy of A is first rewritten from backA_. With TIMING defined
+ * (it is defined just below) the call is enqueued `iter` times between two
+ * clFinish() calls and the average wall-clock time per call is returned;
+ * otherwise a single call is enqueued, flushed and waited on.
+ *
+ * Every event returned by OpenCL/clBLAS in the timed path is released
+ * here, so repeated calls do not leak event objects.
+ *
+ * Returns NANOTIME_ERR on any OpenCL or clBLAS failure.
+ */
+template <typename ElemType> nano_time_t
+Syr2PerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
+                                  ((params_.N * params_.lda) + params_.offa) *
+                                  sizeof(ElemType), backA_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix A buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event belongs to us; release it on both the success and
+    // the failure path instead of just dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+    time = getCurrentTime();
+
+    int iter = 100;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::syr2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx,
+				mobjY_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SYR2 function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+    // Completion is awaited with clFinish() below, so the per-iteration
+    // event is not needed; release it before the next call overwrites it.
+    if (event != NULL) {
+        clReleaseEvent(event);
+        event = NULL;
+    }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+// ssyr2 performance test case: fetches the parameters of the current
+// value-parameterized instance and runs the float instantiation of the
+// SYR2 performance test with FN_SSYR2.
+TEST_P(SYR2, ssyr2)
+{
+    TestParams params;
+
+    getParams(&params);
+    Syr2PerformanceTest<float>::runInstance(FN_SSYR2, &params);
+}
+
+// dsyr2 performance test case: fetches the parameters of the current
+// value-parameterized instance and runs the double instantiation of the
+// SYR2 performance test with FN_DSYR2.
+TEST_P(SYR2, dsyr2)
+{
+    TestParams params;
+
+    getParams(&params);
+    Syr2PerformanceTest<double>::runInstance(FN_DSYR2, &params);
+}
diff --git a/src/tests/performance/perf-syr2k.cpp b/src/tests/performance/perf-syr2k.cpp
new file mode 100644
index 0000000..41d4e17
--- /dev/null
+++ b/src/tests/performance/perf-syr2k.cpp
@@ -0,0 +1,343 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Syr2k performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <syr2k.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome `ret` of a performance run through gtest:
+ *   ret <  0  -> fatal assertion (resource allocation / OpenCL request failed)
+ *   ret != 0  -> non-fatal failure (the OpenCL version was slower)
+ * Because ASSERT_GE may return from the enclosing function, use this only
+ * inside functions returning void.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for SYR2K, parameterized by the element type.
+ * Instances are created only through runInstance(); the constructor is
+ * private.
+ */
+template <typename ElemType> class Syr2kPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~Syr2kPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for `fn` with `params`, run it and report the
+     * result through gtest. The operation factor is 2 for the real
+     * functions (FN_SSYR2K, FN_DSYR2K) and 8 for every other function.
+     * The case is skipped (with a message on stderr) when a double
+     * precision function is requested on a device without double support,
+     * or when the resource check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        Syr2kPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        BlasBase *base = clMath::BlasBase::getInstance();
+
+        const int opFactor = (fn == FN_SSYR2K || fn == FN_DSYR2K) ? 2 : 8;
+        const bool needsDouble = (fn == FN_DSYR2K || fn == FN_ZSYR2K);
+
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            ret = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // Same assertions the macro at the top of this file spells out.
+        CHECK_RESULT(ret);
+    }
+
+private:
+    Syr2kPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;            // scalar multiplier, set in prepare()
+    ElemType beta_;             // scalar multiplier, set in prepare()
+    ElemType *A_;               // host matrix A
+    ElemType *B_;               // host matrix B
+    ElemType *C_;               // host matrix C
+    ElemType *backC_;           // host array C is restored from
+    cl_mem mobjA_;              // device buffer for A (NULL until prepare())
+    cl_mem mobjB_;              // device buffer for B (NULL until prepare())
+    cl_mem mobjC_;              // device buffer for C (NULL until prepare())
+    ::clMath::BlasBase *base_;  // shared BlasBase instance
+};
+
+/*
+ * Copy the parameters and allocate the host matrices with the
+ * rows x columns sizes given in the parameters; C_ and backC_ have the
+ * same size. The device buffers stay NULL until prepare().
+ */
+template <typename ElemType>
+Syr2kPerformanceTest<ElemType>::Syr2kPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn, (problem_size_t)params->N * params->N
+                                            * params->K),
+                        params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL)
+{
+    const size_t elemsA = params_.rowsA * params_.columnsA;
+    const size_t elemsB = params_.rowsB * params_.columnsB;
+    const size_t elemsC = params_.rowsC * params_.columnsC;
+
+    A_ = new ElemType[elemsA];
+    B_ = new ElemType[elemsB];
+    C_ = new ElemType[elemsC];
+    backC_ = new ElemType[elemsC];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host matrices and release whichever device buffers were
+ * created.
+ *
+ * The buffers are still NULL when the test was skipped or prepare()
+ * failed part-way, so each handle is checked before it is released
+ * instead of passing an invalid handle to clReleaseMemObject().
+ */
+template <typename ElemType>
+Syr2kPerformanceTest<ElemType>::~Syr2kPerformanceTest()
+{
+    delete[] A_;
+    delete[] B_;
+    delete[] C_;
+    delete[] backC_;
+
+    if (mobjC_ != NULL) {
+        clReleaseMemObject(mobjC_);
+    }
+    if (mobjB_ != NULL) {
+        clReleaseMemObject(mobjB_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Each of the three matrices must stay below both a third of the
+ * available global memory and the maximum single allocation size. The
+ * n x k size covers A and B; the n x n size covers C, which exceeds
+ * n x k whenever n > k.
+ */
+template <typename ElemType> bool
+Syr2kPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize, maxMatrSize;
+    size_t n = params->N, k = params->K;
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    maxMatrSize = gmemSize / 3;
+
+    maxMatrSize = std::min(maxMatrSize, allocSize);
+
+    return (n * k * sizeof(ElemType) < maxMatrSize) &&
+           (n * n * sizeof(ElemType) < maxMatrSize);
+}
+
+/*
+ * Generate random input data and create the device buffers for A, B and
+ * C. Returns 0 on success and -1 when a buffer could not be created
+ * (creation stops at the first failure).
+ *
+ * backC_ is the array that etalonPerfSingle() and clblasPerfSingle()
+ * restore C from, and it is also the initial content of the device
+ * buffer, so it is filled with a snapshot of the generated C_ before it
+ * is used.
+ */
+template <typename ElemType> int
+Syr2kPerformanceTest<ElemType>::prepare(void)
+{
+    bool useAlpha = base_->useAlpha();
+    bool useBeta = base_->useBeta();
+    clblasTranspose transB;
+
+    if (useAlpha) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+    if (useBeta) {
+        beta_ = convertMultiplier<ElemType>(params_.beta);
+    }
+
+    // B is generated with the transposition opposite to A's.
+    transB = (params_.transA == clblasNoTrans) ? clblasTrans :
+             clblasNoTrans;
+    randomGemmMatrices<ElemType>(params_.order, params_.transA, transB,
+        params_.N, params_.N, params_.K, useAlpha, &alpha_, A_, params_.lda,
+        B_, params_.ldb, useBeta, &beta_, C_, params_.ldc);
+
+    // Without this copy backC_ would be read while still unset.
+    memcpy(backC_, C_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA *
+                                        sizeof(ElemType),
+                                        params_.offA * sizeof(ElemType),
+                                        CL_MEM_READ_ONLY);
+    if (mobjA_) {
+        mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB *
+                                            sizeof(ElemType),
+                                            params_.offBX * sizeof(ElemType),
+                                            CL_MEM_READ_ONLY);
+    }
+    if (mobjB_) {
+        mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC *
+                                            sizeof(ElemType),
+                                            params_.offCY * sizeof(ElemType),
+                                            CL_MEM_READ_WRITE);
+    }
+
+    return (mobjC_) ? 0 : -1;
+}
+
+/*
+ * Reference timing for SYR2K. No reference implementation is called:
+ * C_ is restored from backC_ and the result is NANOTIME_MAX when
+ * PERF_TEST_WITH_ACML is defined, 0 otherwise.
+ *
+ * Returns NANOTIME_ERR for row-major input unless PERF_TEST_WITH_ROW_MAJOR
+ * is defined.
+ */
+template <typename ElemType> nano_time_t
+Syr2kPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+#ifdef PERF_TEST_WITH_ACML
+    // SYR2K reference performance test is not implemented.
+    elapsed = NANOTIME_MAX;
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time one clBLAS SYR2K call on the first command queue.
+ *
+ * The device copy of C is first rewritten from backC_; the call is then
+ * enqueued and flushed, and the time spent waiting for it to finish is
+ * returned. The event of the initial buffer write is released here rather
+ * than dropped. Returns NANOTIME_ERR on any OpenCL or clBLAS failure.
+ */
+template <typename ElemType> nano_time_t
+Syr2kPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  params_.rowsC * params_.columnsC *
+                                  sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix C buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event belongs to us; release it on both the success and
+    // the failure path instead of just dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = (cl_int)clMath::clblas::syr2k(params_.order,
+        params_.uplo, params_.transA, params_.N, params_.K, alpha_,
+        mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb,
+        beta_, mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SYR2K function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    // Only the wait for completion is timed.
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+
+    return time;
+}
+
+} // namespace clMath
+
+// ssyr2k performance test case: fetches the parameters of the current
+// value-parameterized instance and runs the float instantiation of the
+// SYR2K performance test with FN_SSYR2K.
+TEST_P(SYR2K, ssyr2k)
+{
+    TestParams params;
+
+    getParams(&params);
+    Syr2kPerformanceTest<float>::runInstance(FN_SSYR2K, &params);
+}
+
+// dsyr2k performance test case: fetches the parameters of the current
+// value-parameterized instance and runs the double instantiation of the
+// SYR2K performance test with FN_DSYR2K.
+TEST_P(SYR2K, dsyr2k)
+{
+    TestParams params;
+
+    getParams(&params);
+    Syr2kPerformanceTest<double>::runInstance(FN_DSYR2K, &params);
+}
+
+// csyr2k performance test case: fetches the parameters of the current
+// value-parameterized instance and runs the FloatComplex instantiation of
+// the SYR2K performance test with FN_CSYR2K.
+TEST_P(SYR2K, csyr2k)
+{
+    TestParams params;
+
+    getParams(&params);
+    Syr2kPerformanceTest<FloatComplex>::runInstance(FN_CSYR2K, &params);
+}
+
+// zsyr2k performance test case: fetches the parameters of the current
+// value-parameterized instance and runs the DoubleComplex instantiation of
+// the SYR2K performance test with FN_ZSYR2K.
+TEST_P(SYR2K, zsyr2k)
+{
+    TestParams params;
+
+    getParams(&params);
+    Syr2kPerformanceTest<DoubleComplex>::runInstance(FN_ZSYR2K, &params);
+}
diff --git a/src/tests/performance/perf-syrk.cpp b/src/tests/performance/perf-syrk.cpp
new file mode 100644
index 0000000..a24dfca
--- /dev/null
+++ b/src/tests/performance/perf-syrk.cpp
@@ -0,0 +1,327 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Syrk performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <syrk.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome of a performance run through gtest.
+ *
+ * A negative 'ret' triggers the fatal ASSERT_GE failure; any other non-zero
+ * value triggers the non-fatal EXPECT_EQ failure. The argument is expanded
+ * twice, so pass a plain variable rather than an expression with side effects.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the SYRK routines, parameterized by the
+ * matrix element type. Instances are created only through runInstance().
+ */
+template <typename ElemType> class SyrkPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~SyrkPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for 'fn' with the given parameters, run it and
+     * report the outcome through gtest.
+     *
+     * The run is skipped (without failing) when 'fn' is a double precision
+     * variant and the device lacks double support, or when the resource
+     * check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        SyrkPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        // FN_SSYRK and FN_DSYRK use factor 1, every other function code 4
+        const int opFactor = (fn == FN_SSYRK || fn == FN_DSYRK) ? 1 : 4;
+        const bool wantsDouble = (fn == FN_DSYRK || fn == FN_ZSYRK);
+        int result = 0;
+
+        if (wantsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            result = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // 'result' stays 0 when the run was skipped, so both checks pass
+        CHECK_RESULT(result);
+    }
+
+private:
+    SyrkPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType alpha_;
+    ElemType beta_;
+    ElemType *A_;               // host copy of matrix A
+    ElemType *C_;               // host working copy of matrix C
+    ElemType *backC_;           // host copy of C written to the device before each run
+    cl_mem mobjA_;              // device buffer for A
+    cl_mem mobjC_;              // device buffer for C
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Copy the test parameters, allocate the host matrices and look up the
+ * BlasBase instance. The problem size handed to the base class is
+ * N * N * K. No OpenCL buffers are created here; see prepare().
+ */
+template <typename ElemType>
+SyrkPerformanceTest<ElemType>::SyrkPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) :
+        PerformanceTest(fn,
+                        (problem_size_t)params->N * params->N * params->K),
+        params_(*params),
+        A_(new ElemType[params_.rowsA * params_.columnsA]),
+        C_(new ElemType[params_.rowsC * params_.columnsC]),
+        backC_(new ElemType[params_.rowsC * params_.columnsC]),
+        mobjA_(NULL),
+        mobjC_(NULL),
+        base_(::clMath::BlasBase::getInstance())
+{
+}
+
+/*
+ * Free the host matrices and release the OpenCL buffers.
+ *
+ * The buffer handles are still NULL when prepare() was never called or
+ * failed part way (e.g. the test was skipped), so each one is checked
+ * before it is handed to clReleaseMemObject().
+ */
+template <typename ElemType>
+SyrkPerformanceTest<ElemType>::~SyrkPerformanceTest()
+{
+    delete[] A_;
+    delete[] C_;
+    delete[] backC_;
+
+    if (mobjC_ != NULL) {
+        clReleaseMemObject(mobjC_);
+    }
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * The per-matrix budget is one third of the available global memory
+ * reported for device 0, capped by the maximum single allocation size.
+ * Both matrices taking part in the operation must fit into it:
+ * A (N x K elements) and C (N x N elements).
+ */
+template <typename ElemType> bool
+SyrkPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize, maxMatrSize;
+    size_t n = params->N, k = params->K;
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    maxMatrSize = gmemSize / 3;
+
+    maxMatrSize = std::min(maxMatrSize, allocSize);
+
+    return (n * k * sizeof(ElemType) < maxMatrSize) &&
+           (n * n * sizeof(ElemType) < maxMatrSize);
+}
+
+/*
+ * Generate the input data and create the OpenCL buffers.
+ *
+ * Returns 0 when both buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+SyrkPerformanceTest<ElemType>::prepare(void)
+{
+    bool useAlpha = base_->useAlpha();
+    bool useBeta = base_->useBeta();
+    const size_t sizeC = params_.rowsC * params_.columnsC * sizeof(ElemType);
+
+    if (useAlpha) {
+        alpha_ = convertMultiplier<ElemType>(params_.alpha);
+    }
+    if (useBeta) {
+        beta_ = convertMultiplier<ElemType>(params_.beta);
+    }
+
+    randomGemmMatrices<ElemType>(params_.order, params_.transA, clblasNoTrans,
+        params_.N, params_.N, params_.K, useAlpha, &alpha_, A_, params_.lda,
+        NULL, 0, useBeta, &beta_, C_, params_.ldc);
+
+    // backC_ is what gets uploaded below and what both timing routines
+    // restore C from, so it must hold the generated data rather than
+    // whatever the allocator left in it.
+    memcpy(backC_, C_, sizeC);
+
+    mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA *
+                                        sizeof(ElemType),
+                                        params_.offA * sizeof(ElemType),
+                                        CL_MEM_READ_ONLY);
+    if (mobjA_) {
+        mobjC_ = base_->createEnqueueBuffer(backC_, sizeC,
+                                            params_.offCY * sizeof(ElemType),
+                                            CL_MEM_READ_WRITE);
+    }
+
+    return (mobjC_) ? 0 : -1;
+}
+
+/*
+ * Reference-side measurement.
+ *
+ * No reference SYRK routine is called here: the function restores C from
+ * its backup and returns 0, or NANOTIME_MAX when built with
+ * PERF_TEST_WITH_ACML. Row major order is rejected with NANOTIME_ERR
+ * unless PERF_TEST_WITH_ROW_MAJOR is defined.
+ */
+template <typename ElemType> nano_time_t
+SyrkPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t elapsed = 0;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
+
+    // Kept for a future reference implementation; currently unused
+    const clblasOrder order = params_.order;
+    const size_t lda = params_.lda;
+    const size_t ldc = params_.ldc;
+    (void)order;
+    (void)lda;
+    (void)ldc;
+
+#ifdef PERF_TEST_WITH_ACML
+
+// #warning "SYRK performance test not implemented"
+    elapsed = NANOTIME_MAX;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time a single clBLAS SYRK call.
+ *
+ * C is first restored on the device from backC_. The SYRK call is then
+ * enqueued and flushed, and the time spent waiting for its completion is
+ * returned. Any OpenCL or clBLAS failure yields NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+SyrkPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+
+    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
+                                  params_.rowsC * params_.columnsC *
+                                  sizeof(ElemType), backC_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Matrix C buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is owned by this function; release it on every path
+    // instead of dropping the handle.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = (cl_int)clMath::clblas::syrk(params_.order,
+        params_.uplo, params_.transA, params_.N, params_.K, alpha_,
+        mobjA_, params_.offA, params_.lda, beta_, mobjC_, params_.offCY,
+        params_.ldc, 1, &queue, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS SYRK function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    // NOTE(review): ownership of 'event' after this call is decided by
+    // waitForSuccessfulFinish(), which is defined elsewhere - confirm.
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+
+    return time;
+}
+
+} // namespace clMath
+
+// Performance test case: SYRK instantiated for float (FN_SSYRK)
+TEST_P(SYRK, ssyrk)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    SyrkPerformanceTest<float>::runInstance(FN_SSYRK, &testParams);
+}
+
+// Performance test case: SYRK instantiated for double (FN_DSYRK)
+TEST_P(SYRK, dsyrk)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    SyrkPerformanceTest<double>::runInstance(FN_DSYRK, &testParams);
+}
+
+// Performance test case: SYRK instantiated for FloatComplex (FN_CSYRK)
+TEST_P(SYRK, csyrk)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    SyrkPerformanceTest<FloatComplex>::runInstance(FN_CSYRK, &testParams);
+}
+
+// Performance test case: SYRK instantiated for DoubleComplex (FN_ZSYRK)
+TEST_P(SYRK, zsyrk)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    SyrkPerformanceTest<DoubleComplex>::runInstance(FN_ZSYRK, &testParams);
+}
diff --git a/src/tests/performance/perf-tbmv.cpp b/src/tests/performance/perf-tbmv.cpp
new file mode 100644
index 0000000..81d6471
--- /dev/null
+++ b/src/tests/performance/perf-tbmv.cpp
@@ -0,0 +1,329 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Tbmv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <tbmv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome of a performance run through gtest.
+ *
+ * A negative 'ret' triggers the fatal ASSERT_GE failure; any other non-zero
+ * value triggers the non-fatal EXPECT_EQ failure. The argument is expanded
+ * twice, so pass a plain variable rather than an expression with side effects.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the TBMV routines, parameterized by the
+ * element type. Instances are created only through runInstance().
+ */
+template <typename ElemType> class TbmvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~TbmvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for 'fn' with the given parameters, run it and
+     * report the outcome through gtest.
+     *
+     * The run is skipped (without failing) when 'fn' is a double precision
+     * variant and the device lacks double support, or when the resource
+     * check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        TbmvPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+        const bool wantsDouble = (fn == FN_DTBMV || fn == FN_ZTBMV);
+        int result = 0;
+
+        if (wantsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            result = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // 'result' stays 0 when the run was skipped, so both checks pass
+        CHECK_RESULT(result);
+    }
+
+private:
+    TbmvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType *A_;               // host copy of matrix A
+    ElemType *X_;               // host working copy of vector X
+    ElemType *backX_;           // host backup of X, restored before each run
+    cl_mem mobjA_;              // device buffer for A
+    cl_mem mobjX_;              // device buffer for X
+    cl_mem mobjScratch_;        // device scratch buffer handed to tbmv
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Copy the test parameters, allocate the host buffers and look up the
+ * BlasBase instance. The OpenCL buffers are created later, in prepare().
+ *
+ * The vector length uses abs(incx), matching prepare(): with a negative
+ * increment the signed product would otherwise wrap when converted to
+ * size_t and the allocation would not match the amount copied later.
+ */
+template <typename ElemType>
+TbmvPerformanceTest<ElemType>::TbmvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,
+    (problem_size_t)(   params->N * (params->K+1) * 2           // A & X access
+                     - (params->K * (params->K+1) )             // Subtract hole-part for A & X
+                     + (2*params->N)   /* Y access */  ) * sizeof(ElemType)  ),
+                            params_(*params), mobjA_(NULL), mobjX_(NULL), mobjScratch_(NULL)
+{
+    size_t lenA, lenX;
+    lenA = params_.N * params_.lda + params_.offA;
+    lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+    A_ = new ElemType[ lenA ];
+    X_ = new ElemType[ lenX ];
+    backX_ = new ElemType[ lenX ];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host buffers and release whichever OpenCL buffers were created.
+ */
+template <typename ElemType>
+TbmvPerformanceTest<ElemType>::~TbmvPerformanceTest()
+{
+    // delete[] accepts a null pointer, so the host arrays need no guard
+    delete[] A_;
+    delete[] X_;
+    delete[] backX_;
+
+    // The handles stay NULL until prepare() has created the buffers
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjScratch_ != NULL) {
+        clReleaseMemObject(mobjScratch_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Matrix A must be smaller than the maximum single allocation, and A plus
+ * two X-sized vectors must be smaller than the available global memory
+ * reported for device 0. The vector size uses abs(incx) so that a negative
+ * increment yields the same length as in prepare().
+ */
+template <typename ElemType> bool
+TbmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t n = params->N, lda = params->lda;
+    size_t lenA = (n * lda + params->offA) * sizeof(ElemType);
+    size_t lenX = ((params->N - 1) * abs(params->incx) + 1 + params->offBX) * sizeof(ElemType);
+
+    if ((A_ == NULL) || (X_ == NULL) || (backX_ == NULL)) {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    bool suff = (lenA < allocSize) && ( (lenA + 2 * lenX) < gmemSize );
+
+    return suff;
+}
+
+/*
+ * Generate the input data, back up X and create the three OpenCL buffers.
+ *
+ * Returns 0 when every buffer was created, -1 otherwise.
+ */
+template <typename ElemType> int
+TbmvPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t lenA = params_.N * params_.lda + params_.offA;
+    const size_t lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX;
+    const size_t bytesA = lenA * sizeof(ElemType);
+    const size_t bytesX = lenX * sizeof(ElemType);
+
+    randomTbmvMatrices( params_.N, (A_+params_.offA), params_.lda,
+                        (X_+params_.offBX), params_.incx );
+
+    // Keep a pristine copy of X for the timing routines
+    memcpy(backX_, X_, bytesX);
+
+    mobjA_ = base_->createEnqueueBuffer(A_, bytesA, 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, bytesX, 0, CL_MEM_READ_WRITE);
+    mobjScratch_ = base_->createEnqueueBuffer(backX_, bytesX, 0, CL_MEM_READ_WRITE);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL) || (mobjScratch_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time the reference TBMV implementation.
+ *
+ * X is restored from its backup first. A non column-major request is
+ * mapped onto column-major by swapping the transpose and uplo flags.
+ * The reference routine is only called when built with
+ * PERF_TEST_WITH_ACML; otherwise 0 is returned.
+ */
+template <typename ElemType> nano_time_t
+TbmvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder fOrder;
+    clblasTranspose fTrans;
+    clblasUplo fUplo;
+    size_t lda, lenA, lenX;
+
+    lenA = params_.N * params_.lda;
+    // abs(incx): same vector length as computed in prepare(), so the copy
+    // below stays inside the buffers for a negative increment as well
+    lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+
+    memcpy(X_, backX_, lenX * sizeof(ElemType));
+    fOrder = params_.order;
+    fTrans = params_.transA;
+    fUplo = params_.uplo;
+    lda = params_.lda;
+
+    if (fOrder != clblasColumnMajor)
+    {
+        fOrder = clblasColumnMajor;
+        fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+        fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower;
+
+        if( params_.transA == clblasConjTrans )
+            doConjugate( (A_+params_.offA), 1, lenA, lda );
+    }
+
+#ifdef PERF_TEST_WITH_ACML
+
+    time = getCurrentTime();
+    clMath::blas::tbmv(fOrder, fUplo, fTrans, params_.diag, params_.N, params_.K, A_, params_.offA, lda, X_, params_.offBX, params_.incx);
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Time the clBLAS TBMV implementation.
+ *
+ * X is restored on the device from backX_, then TBMV is enqueued 'iter'
+ * times and the queue is drained; the returned value is the elapsed time
+ * divided by 'iter'. Any OpenCL or clBLAS failure yields NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+TbmvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    size_t lenX;
+    cl_command_queue queue = base_->commandQueues()[0];
+    const int iter = 20;
+
+    // abs(incx): same vector length as computed in prepare()
+    lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  lenX * sizeof(ElemType), backX_, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "Vector X buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is owned by this function; release it on every path
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+                                        ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    time = getCurrentTime();
+    for (int i = 0; i < iter; i++)
+    {
+        status = clMath::clblas::tbmv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, params_.K,
+                                        mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
+                                        mobjScratch_, 1, &queue, 0, NULL, &event);
+
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS TBMV function failed, status = " <<
+                    status << endl;
+            return NANOTIME_ERR;
+        }
+
+        // Completion is awaited through clFinish() below, so the per-call
+        // event is not needed; release it rather than overwrite the handle.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    }
+
+    status = clFinish( queue );
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+
+    return time;
+}
+
+} // namespace clMath
+
+// Performance test case: TBMV instantiated for float (FN_STBMV)
+TEST_P(TBMV, stbmv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    TbmvPerformanceTest<float>::runInstance(FN_STBMV, &testParams);
+}
+
+// Performance test case: TBMV instantiated for double (FN_DTBMV)
+TEST_P(TBMV, dtbmv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    TbmvPerformanceTest<double>::runInstance(FN_DTBMV, &testParams);
+}
+
+// Performance test case: TBMV instantiated for FloatComplex (FN_CTBMV)
+TEST_P(TBMV, ctbmv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    TbmvPerformanceTest<FloatComplex>::runInstance(FN_CTBMV, &testParams);
+}
+
+// Performance test case: TBMV instantiated for DoubleComplex (FN_ZTBMV)
+TEST_P(TBMV, ztbmv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    TbmvPerformanceTest<DoubleComplex>::runInstance(FN_ZTBMV, &testParams);
+}
diff --git a/src/tests/performance/perf-tbsv.cpp b/src/tests/performance/perf-tbsv.cpp
new file mode 100644
index 0000000..f17ead3
--- /dev/null
+++ b/src/tests/performance/perf-tbsv.cpp
@@ -0,0 +1,327 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Tbsv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <tbmv.h>
+#include <tbsv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Report the outcome of a performance run through gtest.
+ *
+ * A negative 'ret' triggers the fatal ASSERT_GE failure; any other non-zero
+ * value triggers the non-fatal EXPECT_EQ failure. The argument is expanded
+ * twice, so pass a plain variable rather than an expression with side effects.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the TBSV routines, parameterized by the
+ * element type. Instances are created only through runInstance().
+ */
+template <typename ElemType> class TbsvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~TbsvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for 'fn' with the given parameters, run it and
+     * report the outcome through gtest.
+     *
+     * The run is skipped (without failing) when 'fn' is a double precision
+     * variant and the device lacks double support, or when the resource
+     * check fails.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        TbsvPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        const int opFactor = 1;
+        const bool wantsDouble = (fn == FN_DTBSV || fn == FN_ZTBSV);
+        int result = 0;
+
+        if (wantsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (perfCase.areResourcesSufficient(params)) {
+            result = perfCase.run(opFactor);
+        }
+        else {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+        }
+
+        // 'result' stays 0 when the run was skipped, so both checks pass
+        CHECK_RESULT(result);
+    }
+
+private:
+    TbsvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // private copy of the test parameters
+    ElemType *A_;               // host copy of matrix A
+    ElemType *X_;               // host working copy of vector X
+    ElemType *backX_;           // host backup of X, restored before each run
+    cl_mem mobjA_;              // device buffer for A
+    cl_mem mobjX_;              // device buffer for X
+    cl_mem mobjScratch_;        // extra device buffer created in prepare()
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Copy the test parameters, allocate the host buffers and look up the
+ * BlasBase instance. The OpenCL buffers are created later, in prepare().
+ *
+ * The vector length uses abs(incx), matching prepare(): with a negative
+ * increment the signed product would otherwise wrap when converted to
+ * size_t and the allocation would not match the amount copied later.
+ */
+template <typename ElemType>
+TbsvPerformanceTest<ElemType>::TbsvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest(fn,
+    (problem_size_t)(   params->N * (params->K+1) * 2           // A & X access
+                     - (params->K * (params->K+1) )             // Subtract hole-part for A & X
+                     + (2*params->N)   /* Y access */  ) * sizeof(ElemType)  ),
+                            params_(*params), mobjA_(NULL), mobjX_(NULL), mobjScratch_(NULL)
+{
+    size_t lenA, lenX;
+    lenA = params_.N * params_.lda + params_.offA;
+    lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+    A_ = new ElemType[ lenA ];
+    X_ = new ElemType[ lenX ];
+    backX_ = new ElemType[ lenX ];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host buffers and release whichever OpenCL buffers were created.
+ */
+template <typename ElemType>
+TbsvPerformanceTest<ElemType>::~TbsvPerformanceTest()
+{
+    // delete[] accepts a null pointer, so the host arrays need no guard
+    delete[] A_;
+    delete[] X_;
+    delete[] backX_;
+
+    // The handles stay NULL until prepare() has created the buffers
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+    if (mobjScratch_ != NULL) {
+        clReleaseMemObject(mobjScratch_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * Matrix A must be smaller than the maximum single allocation, and A plus
+ * two X-sized vectors must be smaller than the available global memory
+ * reported for device 0. The vector size uses abs(incx) so that a negative
+ * increment yields the same length as in prepare().
+ */
+template <typename ElemType> bool
+TbsvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t n = params->N, lda = params->lda;
+    size_t lenA = (n * lda + params->offA) * sizeof(ElemType);
+    size_t lenX = ((params->N - 1) * abs(params->incx) + 1 + params->offBX) * sizeof(ElemType);
+
+    if ((A_ == NULL) || (X_ == NULL) || (backX_ == NULL)) {
+        return false;
+    }
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize(0);
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    bool suff = (lenA < allocSize) && ( (lenA + 2 * lenX) < gmemSize );
+
+    return suff;
+}
+
+/*
+ * Generate the input data, back up X and create the three OpenCL buffers.
+ *
+ * Returns 0 when every buffer was created, -1 otherwise.
+ */
+template <typename ElemType> int
+TbsvPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t lenA = params_.N * params_.lda + params_.offA;
+    const size_t lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX;
+    const size_t bytesA = lenA * sizeof(ElemType);
+    const size_t bytesX = lenX * sizeof(ElemType);
+
+    randomTbsvMatrices( params_.order, params_.uplo, params_.diag, params_.N, params_.K,
+                            (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx );
+
+    // Keep a pristine copy of X for the timing routines
+    memcpy(backX_, X_, bytesX);
+
+    mobjA_ = base_->createEnqueueBuffer(A_, bytesA, 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, bytesX, 0, CL_MEM_READ_WRITE);
+    mobjScratch_ = base_->createEnqueueBuffer(backX_, bytesX, 0, CL_MEM_READ_WRITE);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL) || (mobjScratch_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time the reference TBSV implementation.
+ *
+ * X is restored from its backup first. A non column-major request is
+ * mapped onto column-major by swapping the transpose and uplo flags.
+ * The reference routine is only called when built with
+ * PERF_TEST_WITH_ACML; otherwise 0 is returned.
+ */
+template <typename ElemType> nano_time_t
+TbsvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder fOrder;
+    clblasTranspose fTrans;
+    clblasUplo fUplo;
+    size_t lda, lenA, lenX;
+
+    lenA = params_.N * params_.lda;
+    // abs(incx): same vector length as computed in prepare(), so the copy
+    // below stays inside the buffers for a negative increment as well
+    lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+
+    memcpy(X_, backX_, lenX * sizeof(ElemType));
+    fOrder = params_.order;
+    fTrans = params_.transA;
+    fUplo = params_.uplo;
+    lda = params_.lda;
+
+    if (fOrder != clblasColumnMajor)
+    {
+        fOrder = clblasColumnMajor;
+        fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+        fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower;
+
+        if( params_.transA == clblasConjTrans )
+            doConjugate( (A_+params_.offA), 1, lenA, lda );
+    }
+
+#ifdef PERF_TEST_WITH_ACML
+
+    time = getCurrentTime();
+    clMath::blas::tbsv(fOrder, fUplo, fTrans, params_.diag, params_.N, params_.K, A_, params_.offA, lda, X_, params_.offBX, params_.incx);
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+/*
+ * Time the clBLAS TBSV implementation.
+ *
+ * X is restored on the device from backX_, then TBSV is enqueued 'iter'
+ * times and the queue is drained; the returned value is the elapsed time
+ * divided by 'iter'. Any OpenCL or clBLAS failure yields NANOTIME_ERR.
+ */
+template <typename ElemType> nano_time_t
+TbsvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    size_t lenX;
+    cl_command_queue queue = base_->commandQueues()[0];
+    const int iter = 20;
+
+    // abs(incx): same vector length as computed in prepare()
+    lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  lenX * sizeof(ElemType), backX_, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "Vector X buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The write event is owned by this function; release it on every path
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cerr << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+                                        ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    time = getCurrentTime();
+    for (int i = 0; i < iter; i++)
+    {
+        status = clMath::clblas::tbsv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, params_.K,
+                                        mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
+                                         1, &queue, 0, NULL, &event);
+
+        if (status != CL_SUCCESS) {
+            cerr << "The CLBLAS TBSV function failed, status = " <<
+                    status << endl;
+            return NANOTIME_ERR;
+        }
+
+        // Completion is awaited through clFinish() below, so the per-call
+        // event is not needed; release it rather than overwrite the handle.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    }
+
+    status = clFinish( queue );
+    if (status != CL_SUCCESS) {
+        cerr << "clFinish() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+    time = getCurrentTime() - time;
+    time /= iter;
+
+    return time;
+}
+
+} // namespace clMath
+
+// Performance test case: TBSV instantiated for float (FN_STBSV)
+TEST_P(TBSV, stbsv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    TbsvPerformanceTest<float>::runInstance(FN_STBSV, &testParams);
+}
+
+// Performance test case: TBSV instantiated for double (FN_DTBSV)
+TEST_P(TBSV, dtbsv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    TbsvPerformanceTest<double>::runInstance(FN_DTBSV, &testParams);
+}
+
+// Performance test case: TBSV instantiated for FloatComplex (FN_CTBSV)
+TEST_P(TBSV, ctbsv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    TbsvPerformanceTest<FloatComplex>::runInstance(FN_CTBSV, &testParams);
+}
+
+// Performance test case: TBSV instantiated for DoubleComplex (FN_ZTBSV)
+TEST_P(TBSV, ztbsv)
+{
+    TestParams testParams;
+    getParams(&testParams);
+
+    TbsvPerformanceTest<DoubleComplex>::runInstance(FN_ZTBSV, &testParams);
+}
diff --git a/src/tests/performance/perf-tpmv.cpp b/src/tests/performance/perf-tpmv.cpp
new file mode 100644
index 0000000..0246c84
--- /dev/null
+++ b/src/tests/performance/perf-tpmv.cpp
@@ -0,0 +1,380 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <tpmv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class TpmvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~TpmvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        TpmvPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        /*
+         * IMPORTANT: an operation factor other than 1 (2 for the real and
+         * 8 for the complex variants) is meaningful only for BLAS-3
+         * routines.
+         *
+         * For BLAS-2 (bandwidth-intensive) routines such as TPMV, keep
+         * opFactor at 1 and pass the amount of data that is read and
+         * written as the problem size in the constructor below.
+         */
+		opFactor = 1; //FIX-ME
+
+        if ((fn == FN_DTPMV || fn == FN_ZTPMV) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    TpmvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType *AP_;
+	ElemType *X_;
+    ElemType *backX_;
+    cl_mem mobjAP_;
+    cl_mem mobjX_;
+	cl_mem scratchBuff;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+TpmvPerformanceTest<ElemType>::TpmvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( params->N * (params->N+1) * sizeof(ElemType) ) ) ),
+    params_(*params), mobjAP_(NULL), mobjX_(NULL)
+{
+
+    AP_ = new ElemType[( ( params_.N *( params_.N + 1 ) )/2 ) + params_.offa];
+    X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx)  + params_.offBX];
+    backX_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx)  + params_.offBX];
+
+    base_ = ::clMath::BlasBase::getInstance();
+	mobjAP_ = NULL;
+	mobjX_ = NULL;
+	scratchBuff = NULL;
+}
+
+template <typename ElemType>
+TpmvPerformanceTest<ElemType>::~TpmvPerformanceTest()
+    // Frees the host arrays, then releases the OpenCL buffers that were created
+{
+    if(AP_ != NULL)
+    {
+    delete[] AP_;
+    }
+	if(X_ != NULL)
+	{
+    delete[] X_;
+	}
+	if(backX_ != NULL)
+	{
+    delete[] backX_;
+	}
+
+    if ( mobjAP_ != NULL )
+		clReleaseMemObject(mobjAP_);
+	if ( mobjX_ != NULL )
+	    clReleaseMemObject(mobjX_);
+	if ( scratchBuff != NULL )
+		clReleaseMemObject(scratchBuff);
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+TpmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t n = params->N;
+
+	if((AP_ == NULL) || (X_ == NULL) || (backX_ == NULL))
+	{
+		return 0;
+	}
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    bool suff = ( sizeof(ElemType)*( ( n *( n + 1 ) )/2 )< allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations
+	suff = suff && ((( ( ( n *( n + 1 ) )/2 ) + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations
+
+    return suff ;
+}
+
+template <typename ElemType> int
+TpmvPerformanceTest<ElemType>::prepare(void)
+{
+    // Fills the packed matrix and vector X with random data, keeps a copy of
+    // X in backX_ and creates the device buffers. Returns 0 on success, -1 otherwise.
+    size_t n = params_.N;
+    size_t lenX = 1 + (n-1) * abs(params_.incx);
+
+    int creationFlags = RANDOM_INIT | PACKED_MATRIX;
+    // Default is Column-Major
+    creationFlags = ( (this-> params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
+    creationFlags = ( (this-> params_.uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY);
+    BlasRoutineID BlasFn = CLBLAS_TRMV;
+
+    // Packed matrix AP, then vector X and its backup
+    populate( (AP_ + params_.offa), n, n, 0, BlasFn, creationFlags);
+    populate( X_ , lenX + params_.offBX, 1, lenX + params_.offBX, BlasFn);
+    memcpy(backX_, X_, (lenX + params_.offBX) * sizeof(ElemType));
+
+    // TPMV updates X in place through the scratch buffer, so kernels read and
+    // write both of them: CL_MEM_WRITE_ONLY / CL_MEM_READ_ONLY would be undefined.
+    mobjAP_ = base_->createEnqueueBuffer(AP_,( (( n *( n + 1 ) )/2 ) + params_.offa)* sizeof(*AP_), 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_WRITE);
+    scratchBuff = base_->createEnqueueBuffer(NULL , lenX * sizeof(*X_), 0, CL_MEM_READ_WRITE);
+
+    return ( (mobjAP_ != NULL) &&  (mobjX_ != NULL) && (scratchBuff != NULL) ) ? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+TpmvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+	clblasUplo fUplo;
+	clblasTranspose fTrans;
+    //size_t lda;
+
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+    order = params_.order;
+	fUplo = params_.uplo;
+	fTrans = params_.transA;
+    //lda = params_.lda;
+
+#ifdef PERF_TEST_WITH_ACML
+
+	if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+		fUplo =  (params_.uplo == clblasUpper)? clblasLower : clblasUpper;
+        fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+
+        if( params_.transA == clblasConjTrans )
+            doConjugate( (AP_+params_.offa), (( params_.N * (params_.N + 1)) / 2) , 1, 1 );
+    }
+
+
+   time = getCurrentTime();
+   clMath::blas::tpmv(order, fUplo,fTrans, params_.diag,
+                    params_.N, AP_, params_.offa, X_, params_.offBX, params_.incx);
+    time = getCurrentTime() - time;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+template <typename ElemType> nano_time_t
+TpmvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    // Restores X on the device from backX_, runs clBLAS TPMV and returns the
+    // elapsed time per call (averaged over the loop when TIMING is defined),
+    // or NANOTIME_ERR when an OpenCL or clBLAS call fails.
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    size_t lenX = 1 + (params_.N-1) * abs(params_.incx);
+
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  (lenX + params_.offBX )* sizeof(ElemType), backX_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector X buffer object enqueuing error, status = " <<
+                 status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);      // the write event is not needed any more
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+                                        ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+    time = getCurrentTime();
+#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::tpmv(type, params_.order, params_.uplo,
+        params_.transA, params_.diag, params_.N, mobjAP_, params_.offa,
+        mobjX_, params_.offBX, params_.incx, scratchBuff,
+        1, &queue, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS TPMV function failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+        // The queue is drained with clFinish() below, so the per-call event
+        // is only a reference to drop; keeping it would leak one per iteration.
+        clReleaseEvent(event);
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+// stpmv performance test case
+TEST_P(TPMV, stpmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TpmvPerformanceTest<float>::runInstance(FN_STPMV, &params);
+}
+
+// dtpmv performance test case
+TEST_P(TPMV, dtpmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TpmvPerformanceTest<double>::runInstance(FN_DTPMV, &params);
+}
+// ctpmv performance test case
+TEST_P(TPMV, ctpmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TpmvPerformanceTest<FloatComplex>::runInstance(FN_CTPMV, &params);
+}
+// ztpmv performance test case
+TEST_P(TPMV, ztpmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TpmvPerformanceTest<DoubleComplex>::runInstance(FN_ZTPMV, &params);
+}
+
diff --git a/src/tests/performance/perf-tpsv.cpp b/src/tests/performance/perf-tpsv.cpp
new file mode 100644
index 0000000..556c043
--- /dev/null
+++ b/src/tests/performance/perf-tpsv.cpp
@@ -0,0 +1,365 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Tpsv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <tpsv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class TpsvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~TpsvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        TpsvPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+		opFactor = 1;
+
+        if ((fn == FN_DTPSV || fn == FN_ZTPSV) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else {
+		ret = perfCase.run(opFactor);
+	}
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    TpsvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+   // ElemType alpha_;
+    ElemType *A_;
+    ElemType *X_;
+    ElemType *backX_;
+    cl_mem mobjA_;
+    cl_mem mobjX_;
+    size_t lengthA;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+TpsvPerformanceTest<ElemType>::TpsvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( params->N * (params->N+1) * sizeof(ElemType) ) ) ),
+    params_(*params), mobjA_(NULL), mobjX_(NULL)
+{
+	lengthA = (params_.N * (params_.N + 1))/2;
+    A_ = 		new ElemType[(lengthA) + params_.offa];
+    X_ = 		new ElemType[ 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX ];
+    backX_ = 	new ElemType[ 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX ];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+template <typename ElemType>
+TpsvPerformanceTest<ElemType>::~TpsvPerformanceTest()
+{
+     if(A_ != NULL)
+    {
+    delete[] A_;
+    }
+	if(X_ != NULL)
+	{
+    delete[] X_;
+	}
+	if(backX_ != NULL)
+	{
+    delete[] backX_;
+	}
+
+    if( mobjA_ != NULL )
+		clReleaseMemObject(mobjA_);
+    if( mobjX_ != NULL )
+		clReleaseMemObject(mobjX_);
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+TpsvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t n = params->N;
+
+	if ((A_ == NULL) || (X_ == NULL) || (backX_ == NULL))
+	{
+        return 0;
+	}
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+
+	bool suff = ( sizeof(ElemType)*((n*(n+1))/2) < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations
+    suff = suff && ((( ((n*(n+1))/2) + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations
+    return suff ;
+}
+
+template <typename ElemType> int
+TpsvPerformanceTest<ElemType>::prepare(void)
+{
+    size_t lenX, N;
+
+    N = params_.N;
+    lenX = 1 + ((N-1) *abs(params_.incx)) + params_.offBX;
+
+
+	randomTrsvMatrices( params_.order, params_.uplo, params_.diag, params_.N, (A_ + params_.offa), 0,
+										(X_ + params_.offBX), params_.incx);
+    memcpy(backX_, X_, lenX * sizeof(ElemType));
+    mobjA_ = base_->createEnqueueBuffer(A_, ((lengthA) + params_.offa) *
+                                     sizeof(*A_), 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, lenX *
+                                     sizeof(*X_), 0, CL_MEM_READ_WRITE);
+    return ((mobjA_ != NULL) &&  (mobjX_ != NULL) ) ? 0 : -1;
+}
+
+template <typename ElemType> nano_time_t
+TpsvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+    nano_time_t time = 0;
+    clblasOrder order;
+	clblasUplo fUplo;
+    clblasTranspose fTrans;
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+    memcpy(X_, backX_, ((1 + ((params_.N-1) * abs(params_.incx)))+params_.offBX) * sizeof(ElemType));
+    order = params_.order;
+	fUplo = params_.uplo;
+    fTrans = params_.transA;
+
+#ifdef PERF_TEST_WITH_ACML
+
+	if (order != clblasColumnMajor)
+    {
+        order = clblasColumnMajor;
+        fUplo =  (params_.uplo == clblasUpper)? clblasLower : clblasUpper;
+        fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
+
+        if( params_.transA == clblasConjTrans )
+            doConjugate( A_ + params_.offa, 1, lengthA, 1 );
+    }
+    //printf("Calling ACML TPSV\n");
+    //printf("X Before calling %f %f %f %f\n", X_[0], X_[1], X_[2], X_[3]);
+    time = getCurrentTime();
+    clMath::blas::tpsv(order, fUplo, fTrans, params_.diag,
+                    params_.N, A_, params_.offa, X_, params_.offBX, params_.incx);
+    time = getCurrentTime() - time;
+    //printf("X After Calling %f %f %f %f\n", X_[0], X_[1], X_[2], X_[3]);
+    //printf("time %lu\n", (unsigned long)time );
+#endif  // PERF_TEST_WITH_ACML
+
+    return time;
+}
+
+
+template <typename ElemType> nano_time_t
+TpsvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    // Restores X on the device from backX_, runs clBLAS TPSV and returns the
+    // elapsed time, or NANOTIME_ERR when an OpenCL or clBLAS call fails.
+    nano_time_t time;
+    cl_event event;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    size_t lenX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX;
+
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  lenX * sizeof(ElemType), backX_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector X buffer object enqueuing error, status = " <<
+                 status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    clReleaseEvent(event);      // the write event is not needed any more
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    time = getCurrentTime();
+//#define TIMING
+#ifdef TIMING
+    clFinish( queue);
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::tpsv(type, params_.order, params_.uplo,
+        params_.transA, params_.diag, params_.N, mobjA_, params_.offa,
+        mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS TPSV function failed, status = " <<
+                status << endl;
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+        // Drop the per-call event; the queue is drained with clFinish() below.
+        clReleaseEvent(event);
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+
+        // Read the solution back into X_, never into backX_: backX_ holds the
+        // pristine right-hand side that the next run uploads to the device
+        // and that etalonPerfSingle() copies into X_.
+        status = clEnqueueReadBuffer(queue, mobjX_, CL_TRUE, 0,
+                                     lenX * sizeof(ElemType), X_, 0, NULL, NULL);
+        if (status != CL_SUCCESS) {
+            cerr << "Vector X buffer object reading error, status = " <<
+                    status << endl;
+            time = NANOTIME_ERR;
+        }
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+
+TEST_P(TPSV, stpsv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TpsvPerformanceTest<float>::runInstance(FN_STPSV, &params);
+}
+
+
+TEST_P(TPSV, dtpsv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TpsvPerformanceTest<double>::runInstance(FN_DTPSV, &params);
+}
+
+TEST_P(TPSV, ctpsv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TpsvPerformanceTest<FloatComplex>::runInstance(FN_CTPSV, &params);
+}
+
+TEST_P(TPSV, ztpsv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TpsvPerformanceTest<DoubleComplex>::runInstance(FN_ZTPSV, &params);
+}
+
diff --git a/src/tests/performance/perf-trmm.cpp b/src/tests/performance/perf-trmm.cpp
new file mode 100644
index 0000000..c7eb92e
--- /dev/null
+++ b/src/tests/performance/perf-trmm.cpp
@@ -0,0 +1,74 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <BlasBase.h>
+#include <trmm.h>
+
+#include "TrxmPerformanceTest.cpp"
+
+/*
+ * NOTE: operation factor takes into account the same as for
+ *       gemm but also the fact that only a half of data is actually
+ *       useful
+ */
+
+using namespace std;
+using namespace clMath;
+
+
+// strmm performance test case
+TEST_P(TRMM, strmm)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrxmPerformanceTest<float>::runInstance(FN_STRMM, &params);
+}
+
+// dtrmm performance test case
+TEST_P(TRMM, dtrmm)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrxmPerformanceTest<double>::runInstance(FN_DTRMM, &params);
+}
+
+// ctrmm performance test case
+TEST_P(TRMM, ctrmm)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrxmPerformanceTest<FloatComplex>::runInstance(FN_CTRMM, &params);
+}
+
+// ztrmm performance test case
+TEST_P(TRMM, ztrmm)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrxmPerformanceTest<DoubleComplex>::runInstance(FN_ZTRMM, &params);
+}
diff --git a/src/tests/performance/perf-trmv.cpp b/src/tests/performance/perf-trmv.cpp
new file mode 100644
index 0000000..7bb94fe
--- /dev/null
+++ b/src/tests/performance/perf-trmv.cpp
@@ -0,0 +1,384 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Trmv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <trmv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+template <typename ElemType> class TrmvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~TrmvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        TrmvPerformanceTest<ElemType> perfCase(fn, params);
+        int ret = 0;
+        int opFactor;
+        BlasBase *base;
+
+        base = clMath::BlasBase::getInstance();
+
+        /*
+         * IMPORTANT: an operation factor other than 1 (2 for the real and
+         * 8 for the complex variants) is meaningful only for BLAS-3
+         * routines.
+         *
+         * For BLAS-2 (bandwidth-intensive) routines such as TRMV, keep
+         * opFactor at 1 and pass the amount of data that is read and
+         * written as the problem size in the constructor below.
+         */
+		opFactor = 1; //FIX-ME
+
+        if ((fn == FN_DTRMV || fn == FN_ZTRMV) &&
+            !base->isDevSupportDoublePrecision()) {
+
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+			return;
+        }
+        else {
+            ret = perfCase.run(opFactor);
+        }
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    TrmvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;
+    ElemType *A_;
+	ElemType *X_;
+    ElemType *backX_;
+    cl_mem mobjA_;
+    cl_mem mobjX_;
+	cl_mem scratchBuff;
+    ::clMath::BlasBase *base_;
+};
+
+template <typename ElemType>
+TrmvPerformanceTest<ElemType>::TrmvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( params->N * (params->N+1) * sizeof(ElemType) ) ) ),  //**************Gbps formula here***********
+    params_(*params), mobjA_(NULL), mobjX_(NULL)
+{
+
+    A_ = new ElemType[params_.N * params_.lda + params_.offa];
+    X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx)  + params_.offBX];
+    backX_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx)  + params_.offBX];
+
+    base_ = ::clMath::BlasBase::getInstance();
+	mobjA_ = NULL;
+	mobjX_ = NULL;
+	scratchBuff = NULL;
+}
+
+template <typename ElemType>
+TrmvPerformanceTest<ElemType>::~TrmvPerformanceTest()
+{
+    if(A_ != NULL)
+    {
+    delete[] A_;
+    }
+	if(X_ != NULL)
+	{
+    delete[] X_;
+	}
+	if(backX_ != NULL)
+	{
+    delete[] backX_;
+	}
+
+    if ( mobjA_ != NULL )
+		clReleaseMemObject(mobjA_);
+	if ( mobjX_ != NULL )
+	    clReleaseMemObject(mobjX_);
+	if ( scratchBuff != NULL )
+		clReleaseMemObject(scratchBuff);
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case
+ */
+template <typename ElemType> bool
+TrmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    clMath::BlasBase *base;
+    size_t gmemSize, allocSize;
+    size_t n = params->N;
+
+	if((A_ == NULL) || (X_ == NULL) || (backX_ == NULL))
+	{
+		return 0;
+	}
+
+    base = clMath::BlasBase::getInstance();
+    gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    allocSize = (size_t)base->maxMemAllocSize();
+
+    bool suff = ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations
+	suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations
+
+    return suff ;
+}
+
+template <typename ElemType> int
+TrmvPerformanceTest<ElemType>::prepare(void)
+{
+    // Fills matrix A and vector X with random data, keeps a copy of X in
+    // backX_ and creates the device buffers. Returns 0 on success, -1 otherwise.
+    size_t N = params_.N;
+    size_t lenX = 1 + (N-1) * abs(params_.incx);
+
+    int creationFlags = RANDOM_INIT;
+
+    // Default is Column-Major
+    creationFlags = ( (this-> params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
+    creationFlags = ( (this-> params_.uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY);
+    BlasRoutineID BlasFn = CLBLAS_TRMV;
+
+    // Matrix A, then vector X and its backup
+    populate( (A_ + params_.offa), N, N, params_.lda, BlasFn, creationFlags);
+    populate( X_ , lenX + params_.offBX, 1, lenX + params_.offBX, BlasFn);
+    memcpy(backX_, X_, (lenX + params_.offBX) * sizeof(ElemType));
+
+    // TRMV updates X in place through the scratch buffer, so kernels read and
+    // write both of them: CL_MEM_WRITE_ONLY / CL_MEM_READ_ONLY would be undefined.
+    mobjA_ = base_->createEnqueueBuffer(A_, (params_.N * params_.lda + params_.offa)* sizeof(*A_), 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_WRITE);
+    scratchBuff  = base_->createEnqueueBuffer(NULL , lenX * sizeof(*X_), 0, CL_MEM_READ_WRITE);
+
+    return ( (mobjA_ != NULL) &&  (mobjX_ != NULL) && (scratchBuff != NULL) ) ? 0 : -1;
+}
+
+/*
+ * Time a single run of the reference TRMV implementation.
+ *
+ * Unless PERF_TEST_WITH_ROW_MAJOR is defined, row-major parameters are
+ * rejected with NANOTIME_ERR. The reference call itself is only compiled
+ * in with PERF_TEST_WITH_ACML; without it the function returns 0.
+ *
+ * A row-major problem is passed to the reference routine as a column-major
+ * one with uplo and trans flipped; for clblasConjTrans the matrix is
+ * conjugated in place first (on every call).
+ */
+template <typename ElemType> nano_time_t
+TrmvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    nano_time_t elapsed = 0;
+    clblasOrder refOrder = params_.order;
+    clblasUplo refUplo = params_.uplo;
+    clblasTranspose refTrans = params_.transA;
+    size_t lda = params_.lda;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    if (refOrder != clblasColumnMajor) {
+        refOrder = clblasColumnMajor;
+        refUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
+        refTrans = (params_.transA == clblasNoTrans) ? clblasTrans : clblasNoTrans;
+
+        if (params_.transA == clblasConjTrans) {
+            doConjugate( (A_ + params_.offa), params_.N, params_.N, lda );
+        }
+    }
+
+    const nano_time_t started = getCurrentTime();
+    clMath::blas::trmv(refOrder, refUplo, refTrans, params_.diag,
+                       params_.N, A_, params_.offa, lda, X_, params_.offBX, params_.incx);
+    elapsed = getCurrentTime() - started;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS TRMV routine on the first command queue.
+ *
+ * The device vector is first restored from backX_. TIMING is always
+ * defined below, so the routine is then enqueued 20 times back to back
+ * and the wall time up to the final clFinish() is divided by the number
+ * of iterations. The #else branch (single enqueue) is kept for reference.
+ *
+ * Every cl_event handed back to this function is released again, so
+ * repeated calls do not leak event objects.
+ *
+ * Returns the averaged time, or NANOTIME_ERR when an OpenCL call or the
+ * clBLAS call fails.
+ */
+template <typename ElemType> nano_time_t
+TrmvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    size_t lenX = 1 + (params_.N-1) * abs(params_.incx);
+
+    // Restore X on the device from the pristine host copy.
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  (lenX + params_.offBX )* sizeof(ElemType), backX_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector X buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The event belongs to us; drop it whether or not the wait succeeded.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    // Translate the element type into the DataType tag the wrapper expects.
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
+                                        ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+    time = getCurrentTime();
+#define TIMING
+#ifdef TIMING
+    // Drain the queue first and only then start the clock, so that the
+    // measurement covers nothing but the TRMV calls.
+    clFinish( queue);
+    time = getCurrentTime();
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::trmv(type, params_.order, params_.uplo,
+        params_.transA, params_.diag, params_.N, mobjA_, params_.offa, params_.lda,
+        mobjX_, params_.offBX, params_.incx, scratchBuff,
+        1, &queue, 0, NULL, &event);
+
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS TRMV function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+        // Each enqueue returns a fresh event; release our reference before
+        // the next iteration overwrites the handle.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    time = getCurrentTime();
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+
+#endif
+
+    return time;
+}
+
+} // namespace clMath
+
+// strmv performance test case (ElemType = float).
+// Fetches the parameters of the current test instance with getParams()
+// and hands them to the TRMV performance runner.
+TEST_P(TRMV, strmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrmvPerformanceTest<float>::runInstance(FN_STRMV, &params);
+}
+
+// dtrmv performance test case (ElemType = double)
+TEST_P(TRMV, dtrmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrmvPerformanceTest<double>::runInstance(FN_DTRMV, &params);
+}
+// ctrmv performance test case (ElemType = FloatComplex)
+TEST_P(TRMV, ctrmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrmvPerformanceTest<FloatComplex>::runInstance(FN_CTRMV, &params);
+}
+// ztrmv performance test case (ElemType = DoubleComplex)
+TEST_P(TRMV, ztrmv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrmvPerformanceTest<DoubleComplex>::runInstance(FN_ZTRMV, &params);
+}
+
diff --git a/src/tests/performance/perf-trsm.cpp b/src/tests/performance/perf-trsm.cpp
new file mode 100644
index 0000000..f570a08
--- /dev/null
+++ b/src/tests/performance/perf-trsm.cpp
@@ -0,0 +1,67 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <blas-internal.h>
+#include <BlasBase.h>
+#include <trsm.h>
+
+#include "TrxmPerformanceTest.cpp"
+
+using namespace std;
+using namespace clMath;
+
+// strsm performance test case (ElemType = float).
+// Fetches the parameters of the current test instance with getParams()
+// and runs them through TrxmPerformanceTest, which is pulled in from
+// TrxmPerformanceTest.cpp above.
+TEST_P(TRSM, strsm)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrxmPerformanceTest<float>::runInstance(FN_STRSM, &params);
+}
+
+// dtrsm performance test case (ElemType = double)
+TEST_P(TRSM, dtrsm)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrxmPerformanceTest<double>::runInstance(FN_DTRSM, &params);
+}
+
+// ctrsm performance test case (ElemType = FloatComplex)
+TEST_P(TRSM, ctrsm)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrxmPerformanceTest<FloatComplex>::runInstance(FN_CTRSM, &params);
+}
+
+// ztrsm performance test case (ElemType = DoubleComplex)
+TEST_P(TRSM, ztrsm)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrxmPerformanceTest<DoubleComplex>::runInstance(FN_ZTRSM, &params);
+}
diff --git a/src/tests/performance/perf-trsv.cpp b/src/tests/performance/perf-trsv.cpp
new file mode 100644
index 0000000..48b9d5c
--- /dev/null
+++ b/src/tests/performance/perf-trsv.cpp
@@ -0,0 +1,353 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/*
+ * Trsv performance test cases
+ */
+
+#include <stdlib.h>             // srand()
+#include <string.h>             // memcpy()
+#include <new>                  // std::nothrow
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+
+#include <common.h>
+#include <clBLAS-wrapper.h>
+#include <BlasBase.h>
+#include <trsv.h>
+#include <blas-random.h>
+
+#ifdef PERF_TEST_WITH_ACML
+#include <blas-internal.h>
+#include <blas-wrapper.h>
+#endif
+
+#include "PerformanceTest.h"
+
+/*
+ * NOTE: operation factor means overall number
+ *       of multiply and add per each operation involving
+ *       2 matrix elements
+ */
+
+using namespace std;
+using namespace clMath;
+
+/*
+ * Evaluate the return code of a performance run:
+ *  - ret < 0  is a fatal failure (ASSERT_GE aborts the current function),
+ *  - ret != 0 is recorded as a non-fatal failure (EXPECT_EQ).
+ * The macro is not used anywhere in this file; runInstance() spells the
+ * same two checks out inline.
+ */
+#define CHECK_RESULT(ret)                                                   \
+do {                                                                        \
+    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
+                         "perform an OpenCL request!" << endl;              \
+    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
+                         endl;                                              \
+} while (0)
+
+namespace clMath {
+
+/*
+ * Performance test case for the TRSV routines, parameterised by the
+ * element type. Instances are created only through runInstance().
+ */
+template <typename ElemType> class TrsvPerformanceTest : public PerformanceTest
+{
+public:
+    virtual ~TrsvPerformanceTest();
+
+    virtual int prepare(void);
+    virtual nano_time_t etalonPerfSingle(void);
+    virtual nano_time_t clblasPerfSingle(void);
+
+    /*
+     * Build a test case for `fn` with `params` and run it.
+     * The run is skipped (with a message on stderr) when the device lacks
+     * double precision for DTRSV/ZTRSV, or when the resource check fails.
+     * A negative result of run() is a fatal gtest failure, any other
+     * non-zero result a non-fatal one.
+     */
+    static void runInstance(BlasFunction fn, TestParams *params)
+    {
+        TrsvPerformanceTest<ElemType> perfCase(fn, params);
+        BlasBase *base = clMath::BlasBase::getInstance();
+        const bool needsDouble = (fn == FN_DTRSV) || (fn == FN_ZTRSV);
+
+        if (needsDouble && !base->isDevSupportDoublePrecision()) {
+            std::cerr << ">> WARNING: The target device doesn't support native "
+                         "double precision floating point arithmetic" <<
+                         std::endl << ">> Test skipped" << std::endl;
+            return;
+        }
+
+        if (!perfCase.areResourcesSufficient(params)) {
+            std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
+                        std::endl;
+            return;
+        }
+
+        int opFactor = 1;
+        int ret = perfCase.run(opFactor);
+
+        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
+                             "perform an OpenCL request!" << endl;
+        EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
+    }
+
+private:
+    TrsvPerformanceTest(BlasFunction fn, TestParams *params);
+
+    bool areResourcesSufficient(TestParams *params);
+
+    TestParams params_;         // copy of the parameters of this case
+    ElemType *A_;               // host matrix, including the offa prefix
+    ElemType *X_;               // host vector, including the offBX prefix
+    ElemType *backX_;           // pristine copy of X_
+    cl_mem mobjA_;              // device buffer holding A_
+    cl_mem mobjX_;              // device buffer holding X_
+    ::clMath::BlasBase *base_;
+};
+
+/*
+ * Allocate the host arrays for one TRSV test case.
+ *
+ * fn     - routine under test, forwarded to PerformanceTest.
+ * params - test parameters; a copy is stored in params_.
+ *
+ * The arrays are allocated with the non-throwing form of new[]: a failed
+ * allocation leaves the pointer NULL, which is exactly what
+ * areResourcesSufficient() and the destructor test for. A throwing new[]
+ * would bypass those checks and leak the arrays allocated before the
+ * failure.
+ */
+template <typename ElemType>
+TrsvPerformanceTest<ElemType>::TrsvPerformanceTest(
+    BlasFunction fn,
+    TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( params->N * (params->N+1) * sizeof(ElemType) ) ) ),
+    params_(*params), A_(NULL), X_(NULL), backX_(NULL), mobjA_(NULL), mobjX_(NULL)
+{
+    const size_t elemsA = (params_.N * params_.lda) + params_.offa;
+    const size_t elemsX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX;
+
+    A_ = new (std::nothrow) ElemType[elemsA];
+    X_ = new (std::nothrow) ElemType[elemsX];
+    backX_ = new (std::nothrow) ElemType[elemsX];
+
+    base_ = ::clMath::BlasBase::getInstance();
+}
+
+/*
+ * Free the host arrays and release the device buffers that were created.
+ */
+template <typename ElemType>
+TrsvPerformanceTest<ElemType>::~TrsvPerformanceTest()
+{
+    // delete[] on a null pointer is a no-op, so no guards are needed.
+    delete[] A_;
+    delete[] X_;
+    delete[] backX_;
+
+    // Only handles that were actually created are handed to OpenCL.
+    if (mobjA_ != NULL) {
+        clReleaseMemObject(mobjA_);
+    }
+    if (mobjX_ != NULL) {
+        clReleaseMemObject(mobjX_);
+    }
+}
+
+/*
+ * Check if available OpenCL resources are sufficient to
+ * run the test case.
+ *
+ * The buffer sizes match what prepare() uploads: matrix A including its
+ * leading offset (offa) and vector X including its leading offset (offBX).
+ * The total keeps the original estimate's factor of two for the vector.
+ *
+ * params - test parameters (N, lda, incx, offa, offBX are read).
+ *
+ * Returns false when a host array is missing, when a single buffer does
+ * not fit the maximum allocation size, or when the total does not fit the
+ * available global memory; true otherwise.
+ */
+template <typename ElemType> bool
+TrsvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
+{
+    // Nothing can be run without the host-side arrays.
+    if ((A_ == NULL) || (X_ == NULL) || (backX_ == NULL)) {
+        return false;
+    }
+
+    clMath::BlasBase *base = clMath::BlasBase::getInstance();
+    const size_t gmemSize = (size_t)base->availGlobalMemSize( 0 );
+    const size_t allocSize = (size_t)base->maxMemAllocSize();
+
+    const size_t n = params->N;
+
+    // Element counts of the buffers exactly as prepare() sizes them.
+    const size_t elemsA = n*params->lda + params->offa;
+    const size_t elemsX = (1 + (n-1)*abs(params->incx)) + params->offBX;
+
+    // Every individual allocation has to stay below the per-buffer limit.
+    const bool fitsEach = ( sizeof(ElemType)*elemsA < allocSize ) &&
+                          ( sizeof(ElemType)*elemsX < allocSize );
+
+    // All allocations together have to stay below the global memory size.
+    const bool fitsAll = ( (elemsA + elemsX*2)*sizeof(ElemType) ) < gmemSize;
+
+    return fitsEach && fitsAll;
+}
+
+/*
+ * Generate the host data for one test case and upload it to the device.
+ *
+ * randomTrsvMatrices() fills A (behind offa) and X (behind offBX); the
+ * whole X array, offset included, is then copied to backX_ and both
+ * arrays are uploaded.
+ *
+ * Returns 0 when both device buffers were created, -1 otherwise.
+ */
+template <typename ElemType> int
+TrsvPerformanceTest<ElemType>::prepare(void)
+{
+    const size_t elemsA = (params_.N * params_.lda) + params_.offa;
+    // Length of X including its leading offset.
+    const size_t elemsX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX;
+
+    randomTrsvMatrices( params_.order, params_.uplo, params_.diag, params_.N,
+                        (A_ + params_.offa), params_.lda,
+                        (X_ + params_.offBX), params_.incx);
+    memcpy(backX_, X_, elemsX * sizeof(ElemType));
+
+    mobjA_ = base_->createEnqueueBuffer(A_, elemsA * sizeof(*A_), 0, CL_MEM_READ_ONLY);
+    mobjX_ = base_->createEnqueueBuffer(X_, elemsX * sizeof(*X_), 0, CL_MEM_READ_WRITE);
+
+    if ((mobjA_ == NULL) || (mobjX_ == NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Time a single run of the reference TRSV implementation.
+ *
+ * Unless PERF_TEST_WITH_ROW_MAJOR is defined, row-major parameters are
+ * rejected with NANOTIME_ERR. X_ is restored from backX_ before the run.
+ * The reference call itself is only compiled in with PERF_TEST_WITH_ACML;
+ * without it the function returns 0.
+ *
+ * A row-major problem is passed to the reference routine as a column-major
+ * one with uplo and trans flipped; for clblasConjTrans the matrix is
+ * conjugated in place first (on every call).
+ */
+template <typename ElemType> nano_time_t
+TrsvPerformanceTest<ElemType>::etalonPerfSingle(void)
+{
+#ifndef PERF_TEST_WITH_ROW_MAJOR
+    if (params_.order == clblasRowMajor) {
+        cerr << "Row major order is not allowed" << endl;
+        return NANOTIME_ERR;
+    }
+#endif
+
+    // Start from the pristine right-hand side.
+    const size_t elemsX = (1 + ((params_.N-1) * abs(params_.incx))) + params_.offBX;
+    memcpy(X_, backX_, elemsX * sizeof(ElemType));
+
+    nano_time_t elapsed = 0;
+    clblasOrder refOrder = params_.order;
+    clblasUplo refUplo = params_.uplo;
+    clblasTranspose refTrans = params_.transA;
+    size_t lda = params_.lda;
+
+#ifdef PERF_TEST_WITH_ACML
+
+    if (refOrder != clblasColumnMajor) {
+        refOrder = clblasColumnMajor;
+        refUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
+        refTrans = (params_.transA == clblasNoTrans) ? clblasTrans : clblasNoTrans;
+
+        if (params_.transA == clblasConjTrans) {
+            doConjugate( A_ + params_.offa, params_.N, params_.N, lda );
+        }
+    }
+
+    const nano_time_t started = getCurrentTime();
+    clMath::blas::trsv(refOrder, refUplo, refTrans, params_.diag,
+                       params_.N, A_, params_.offa, lda, X_, params_.offBX, params_.incx);
+    elapsed = getCurrentTime() - started;
+
+#endif  // PERF_TEST_WITH_ACML
+
+    return elapsed;
+}
+
+
+/*
+ * Time the clBLAS TRSV routine on the first command queue.
+ *
+ * The device vector is first restored from backX_. TIMING is always
+ * defined below, so the routine is then enqueued 20 times back to back
+ * and the wall time up to the final clFinish() is divided by the number
+ * of iterations. The #else branch (single enqueue) is kept for reference.
+ *
+ * Every cl_event handed back to this function is released again, so
+ * repeated calls do not leak event objects.
+ *
+ * Returns the averaged time, or NANOTIME_ERR when an OpenCL call or the
+ * clBLAS call fails.
+ */
+template <typename ElemType> nano_time_t
+TrsvPerformanceTest<ElemType>::clblasPerfSingle(void)
+{
+    nano_time_t time;
+    cl_event event = NULL;
+    cl_int status;
+    cl_command_queue queue = base_->commandQueues()[0];
+    // Length of X including its leading offset.
+    size_t lenX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX;
+
+    // Restore X on the device from the pristine host copy.
+    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
+                                  lenX * sizeof(ElemType), backX_, 0, NULL, &event);
+    if (status != CL_SUCCESS) {
+        cerr << "Vector X buffer object enqueuing error, status = " <<
+                 status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    status = clWaitForEvents(1, &event);
+    // The event belongs to us; drop it whether or not the wait succeeded.
+    clReleaseEvent(event);
+    event = NULL;
+    if (status != CL_SUCCESS) {
+        cout << "Wait on event failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+    // Translate the element type into the DataType tag the wrapper expects.
+    DataType type;
+    type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
+
+    time = getCurrentTime();
+#define TIMING
+#ifdef TIMING
+    // Drain the queue first and only then start the clock, so that the
+    // measurement covers nothing but the TRSV calls.
+    clFinish( queue);
+    time = getCurrentTime();
+
+    int iter = 20;
+    for ( int i = 1; i <= iter; i++)
+    {
+#endif
+    status = (cl_int)clMath::clblas::trsv(type, params_.order, params_.uplo,
+        params_.transA, params_.diag, params_.N, mobjA_, params_.offa, params_.lda,
+        mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event);
+
+    if (status != CL_SUCCESS) {
+        cerr << "The CLBLAS TRSV function failed, status = " <<
+                status << endl;
+
+        return NANOTIME_ERR;
+    }
+
+#ifdef TIMING
+        // Each enqueue returns a fresh event; release our reference before
+        // the next iteration overwrites the handle.
+        if (event != NULL) {
+            clReleaseEvent(event);
+            event = NULL;
+        }
+    } // iter loop
+    clFinish( queue);
+    time = getCurrentTime() - time;
+    time /= iter;
+#else
+
+    status = flushAll(1, &queue);
+    if (status != CL_SUCCESS) {
+        cerr << "clFlush() failed, status = " << status << endl;
+        return NANOTIME_ERR;
+    }
+
+    status = waitForSuccessfulFinish(1, &queue, &event);
+    if (status == CL_SUCCESS) {
+        time = getCurrentTime() - time;
+    }
+    else {
+        cerr << "Waiting for completion of commands to the queue failed, "
+                "status = " << status << endl;
+        time = NANOTIME_ERR;
+    }
+#endif
+    return time;
+}
+
+} // namespace clMath
+
+
+// strsv performance test case (ElemType = float).
+// Fetches the parameters of the current test instance with getParams()
+// and hands them to the TRSV performance runner.
+TEST_P(TRSV, strsv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrsvPerformanceTest<float>::runInstance(FN_STRSV, &params);
+}
+
+
+// dtrsv performance test case (ElemType = double)
+TEST_P(TRSV, dtrsv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrsvPerformanceTest<double>::runInstance(FN_DTRSV, &params);
+}
+
+// ctrsv performance test case (ElemType = FloatComplex)
+TEST_P(TRSV, ctrsv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrsvPerformanceTest<FloatComplex>::runInstance(FN_CTRSV, &params);
+}
+
+// ztrsv performance test case (ElemType = DoubleComplex)
+TEST_P(TRSV, ztrsv)
+{
+    TestParams params;
+
+    getParams(&params);
+    TrsvPerformanceTest<DoubleComplex>::runInstance(FN_ZTRSV, &params);
+}
+
diff --git a/src/tests/performance/test-performance.cpp b/src/tests/performance/test-performance.cpp
new file mode 100644
index 0000000..df74132
--- /dev/null
+++ b/src/tests/performance/test-performance.cpp
@@ -0,0 +1,1405 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// Per-routine switches. Each DO_<routine> macro guards the
+// INSTANTIATE_TEST_CASE_P blocks of that routine further down in this file
+// (e.g. "#ifdef DO_GEMV"), so an undefined switch leaves the routine's
+// test cases uninstantiated.
+#define DO_SYR
+#define DO_SPR
+#define DO_SYMM
+#define DO_TRMV
+#define DO_TPMV
+#define DO_TRSV
+#define DO_GEMM
+#define DO_TRMM
+#define DO_TRSM
+#define DO_GEMV
+#define DO_SYR2K
+#define DO_SYRK
+#define DO_GER
+#define DO_GERC
+#define DO_HER
+#define DO_HPR
+#define DO_SYR2
+#define DO_SPR2
+// NOTE(review): DO_SPR2 is defined a second time here; the identical
+// redefinition is harmless but one of the two lines is redundant.
+#define DO_SPR2
+#define DO_SBMV
+#define DO_HER2
+#define DO_HPR2
+#define DO_HEMV
+#define DO_HEMM
+#define DO_HERK
+#define DO_SYMV
+#define DO_TPSV
+#define DO_HPMV
+#define DO_SPMV
+#define DO_GBMV
+#define DO_HBMV
+#define DO_TBMV
+#define DO_TBSV
+#define DO_HER2K
+#define DO_SWAP
+#define DO_COPY
+#define DO_SCAL
+#define DO_AXPY
+#define DO_DOT
+#define DO_DOTC
+#define DO_ROTG
+#define DO_ROTM
+#define DO_ROT
+#define DO_ROTMG
+#define DO_NRM2
+#define DO_ASUM
+#define DO_iAMAX
+
+//#define DO_GEMM_2 - This needs to remain commented.
+
+#include <gtest/gtest.h>
+#include <clBLAS.h>
+#include <math.h>
+#include <float.h>
+
+#include <BlasBase.h>
+#include <ExtraTestSizes.h>
+#include <gemv.h>
+#include <symv.h>
+#include <gemm.h>
+#include <gemm-2.h>
+#include <trmm.h>
+#include <trsm.h>
+#include <syr2k.h>
+#include <syrk.h>
+#include <trmv.h>
+#include <tpmv.h>
+#include <trsv.h>
+#include <symm.h>
+#include <ger.h>
+#include <gerc.h>
+#include <syr.h>
+#include <sbmv.h>
+#include <spr.h>
+#include <syr2.h>
+#include <spr2.h>
+#include <her.h>
+#include <hpr.h>
+#include <her2.h>
+#include <hpr2.h>
+#include <hemm.h>
+#include <hemv.h>
+#include <herk.h>
+#include <tpsv.h>
+#include <hpmv.h>
+#include <spmv.h>
+#include <gbmv.h>
+#include <hbmv.h>
+#include <tbmv.h>
+#include <tbsv.h>
+#include <her2k.h>
+#include <swap.h>
+#include <copy.h>
+#include <scal.h>
+#include <axpy.h>
+#include <dot.h>
+#include <dotc.h>
+#include <rotg.h>
+#include <rotm.h>
+#include <rot.h>
+#include <rotmg.h>
+#include <nrm2.h>
+#include <asum.h>
+#include <iamax.h>
+
+#include "PerformanceRecorder.h"
+
+#define EXPECTED_SINGLE_FLOAT_PERF_RATIO 10.0
+#define EXPECTED_DOUBLE_FLOAT_PERF_RATIO 4.0
+
+using ::testing::TestWithParam;
+using ::testing::Values;
+using ::testing::ValuesIn;
+using ::testing::Combine;
+using namespace std;
+using namespace clMath;
+
+// Global recorder pointer with external linkage; it is only defined here.
+// NOTE(review): presumably created by the test driver before the tests
+// run -- the initialisation is not visible in this part of the file.
+PerformanceRecorder *perfRecorder;
+
+/* Tell whether the magnitude of d is strictly below 1e-6. */
+static bool
+isDoubleZero(double d)
+{
+    const double magnitude = fabs(d);
+
+    if (magnitude < 0.000001) {
+        return true;
+    }
+    return false;
+}
+
+static const char
+*functionToString(BlasFunction function)
+{
+    const char *s = NULL;
+
+    switch (function) {
+    case FN_SGEMV:
+        s = "SGEMV";
+        break;
+    case FN_DGEMV:
+        s = "DGEMV";
+        break;
+    case FN_CGEMV:
+        s = "CGEMV";
+        break;
+    case FN_ZGEMV:
+        s = "ZGEMV";
+        break;
+    case FN_SSYMV:
+        s = "SSYMV";
+        break;
+    case FN_DSYMV:
+        s = "DSYMV";
+        break;
+    case FN_SGEMM:
+        s = "SGEMM";
+        break;
+    case FN_DGEMM:
+        s = "DGEMM";
+        break;
+    case FN_CGEMM:
+        s = "CGEMM";
+        break;
+    case FN_ZGEMM:
+        s = "ZGEMM";
+        break;
+    case FN_SGEMM_2:
+        s = "SGEMM_2";
+        break;
+    case FN_DGEMM_2:
+        s = "DGEMM_2";
+        break;
+    case FN_CGEMM_2:
+        s = "CGEMM_2";
+        break;
+    case FN_ZGEMM_2:
+        s = "ZGEMM_2";
+        break;
+    case FN_STRMM:
+        s = "STRMM";
+        break;
+    case FN_DTRMM:
+        s = "DTRMM";
+        break;
+    case FN_CTRMM:
+        s = "CTRMM";
+        break;
+    case FN_ZTRMM:
+        s = "ZTRMM";
+        break;
+    case FN_STRSM:
+        s = "STRSM";
+        break;
+    case FN_DTRSM:
+        s = "DTRSM";
+        break;
+    case FN_CTRSM:
+        s = "CTRSM";
+        break;
+    case FN_ZTRSM:
+        s = "ZTRSM";
+        break;
+    case FN_SSYR2K:
+        s = "SSYR2K";
+        break;
+    case FN_DSYR2K:
+        s = "DSYR2K";
+        break;
+    case FN_CSYR2K:
+        s = "CSYR2K";
+        break;
+    case FN_ZSYR2K:
+        s = "ZSYR2K";
+        break;
+    case FN_SSYRK:
+        s = "SSYRK";
+        break;
+    case FN_DSYRK:
+        s = "DSYRK";
+        break;
+    case FN_CSYRK:
+        s = "CSYRK";
+        break;
+    case FN_ZSYRK:
+        s = "ZSYRK";
+        break;
+	case FN_STRMV:
+		s = "STRMV";
+		break;
+	case FN_DTRMV:
+        s = "DTRMV";
+		break;
+	case FN_CTRMV:
+        s = "CTRMV";
+		break;
+	case FN_ZTRMV:
+        s = "ZTRMV";
+	    break;
+    case FN_STPMV:
+        s = "STPMV";
+        break;
+    case FN_DTPMV:
+        s = "DTPMV";
+        break;
+    case FN_CTPMV:
+        s = "CTPMV";
+        break;
+    case FN_ZTPMV:
+        s = "ZTPMV";
+        break;
+
+	case FN_STRSV:
+        s = "STRSV";
+        break;
+    case FN_DTRSV:
+        s = "DTRSV";
+        break;
+    case FN_CTRSV:
+        s = "CTRSV";
+        break;
+    case FN_ZTRSV:
+        s = "ZTRSV";
+		break;
+
+    case FN_STBSV:
+        s = "STBSV";
+        break;
+    case FN_DTBSV:
+        s = "DTBSV";
+        break;
+    case FN_CTBSV:
+        s = "CTBSV";
+        break;
+    case FN_ZTBSV:
+        s = "ZTBSV";
+        break;
+
+
+    case FN_STPSV:
+        s = "STPSV";
+        break;
+    case FN_DTPSV:
+        s = "DTPSV";
+        break;
+    case FN_CTPSV:
+        s = "CTPSV";
+        break;
+    case FN_ZTPSV:
+        s = "ZTPSV";
+        break;
+
+
+	case FN_SSYMM:
+	    s = "SSYMM";
+		break;
+	case FN_DSYMM:
+        s = "DSYMM";
+		break;
+	case FN_CSYMM:
+        s = "CSYMM";
+		break;
+	case FN_ZSYMM:
+        s = "ZSYMM";
+		break;
+
+    case FN_SGER:
+        s = "SGER";
+        break;
+    case FN_DGER:
+        s = "DGER";
+        break;
+    case FN_CGERU:
+        s = "CGERU";
+        break;
+    case FN_ZGERU:
+        s = "ZGERU";
+        break;
+	case FN_CGERC:
+        s = "CGERC";
+        break;
+    case FN_ZGERC:
+        s = "ZGERC";
+        break;
+    case FN_CHER:
+        s = "CHER";
+        break;
+    case FN_ZHER:
+        s = "ZHER";
+        break;
+     case FN_CHPR:
+        s = "CHPR";
+        break;
+    case FN_ZHPR:
+        s = "ZHPR";
+        break;
+
+	case FN_CHER2:
+        s = "CHER2";
+        break;
+    case FN_ZHER2:
+        s = "ZHER2";
+        break;
+	case FN_SSYR:
+		s = "SSYR";
+		break;
+	case FN_DSYR:
+		s = "DSYR";
+		break;
+    case FN_SSPR2:
+        s = "SSPR2";
+        break;
+    case FN_DSPR2:
+        s = "DSPR2";
+        break;
+    case FN_SSPR:
+        s = "SSPR";
+        break;
+    case FN_DSPR:
+        s = "DSPR";
+        break;
+	case FN_SSYR2:
+		s = "SSYR2";
+		break;
+	case FN_DSYR2:
+		s = "DSYR2";
+		break;
+	case FN_CHEMM:
+        s = "CHEMM";
+        break;
+    case FN_ZHEMM:
+        s = "ZHEMM";
+        break;
+	case FN_CHEMV:
+        s = "CHEMV";
+        break;
+    case FN_ZHEMV:
+        s = "ZHEMV";
+        break;
+    case FN_CHERK:
+        s = "CHERK";
+        break;
+    case FN_ZHERK:
+        s = "ZHERK";
+        break;
+    case FN_SSBMV:
+        s = "SSBMV";
+        break;
+    case FN_DSBMV:
+        s = "DSBMV";
+        break;
+    case FN_CHBMV:
+        s = "CHBMV";
+        break;
+    case FN_ZHBMV:
+        s = "ZHBMV";
+        break;
+    case FN_CHER2K:
+        s = "CHER2K";
+        break;
+    case FN_ZHER2K:
+        s = "ZHER2K";
+        break;
+
+    case FN_SSWAP:
+        s = "SSWAP";
+        break;
+    case FN_DSWAP:
+        s = "DSWAP";
+        break;
+    case FN_CSWAP:
+        s = "CSWAP";
+        break;
+    case FN_ZSWAP:
+        s = "ZSWAP";
+        break;
+
+    case FN_SSCAL:
+	    s = "SSCAL";
+		break;
+	case FN_DSCAL:
+        s = "DSCAL";
+		break;
+	case FN_CSCAL:
+        s = "CSCAL";
+		break;
+	case FN_ZSCAL:
+        s = "ZSCAL";
+		break;
+	case FN_CSSCAL:
+        s = "CSSCAL";
+		break;
+	case FN_ZDSCAL:
+        s = "ZDSCAL";
+		break;
+
+	case FN_SCOPY:
+        s = "SCOPY";
+        break;
+    case FN_DCOPY:
+        s = "DCOPY";
+        break;
+    case FN_CCOPY:
+        s = "CCOPY";
+        break;
+    case FN_ZCOPY:
+        s = "ZCOPY";
+        break;
+	 case FN_SDOT:
+        s = "SDOT";
+        break;
+    case FN_DDOT:
+        s = "DDOT";
+        break;
+
+    case FN_CDOTU:
+        s = "CDOTU";
+        break;
+    case FN_ZDOTU:
+        s = "ZDOTU";
+        break;
+
+    case FN_CDOTC:
+        s = "CDOTC";
+        break;
+    case FN_ZDOTC:
+        s = "ZDOTC";
+        break;
+
+    case FN_SAXPY:
+        s = "SAXPY";
+        break;
+    case FN_DAXPY:
+        s = "DAXPY";
+        break;
+    case FN_CAXPY:
+        s = "CAXPY";
+        break;
+    case FN_ZAXPY:
+        s = "ZAXPY";
+        break;
+
+
+    case FN_SROTG:
+        s = "SROTG";
+        break;
+    case FN_DROTG:
+        s = "DROTG";
+        break;
+    case FN_CROTG:
+        s = "CROTG";
+        break;
+    case FN_ZROTG:
+        s = "ZROTG";
+        break;
+
+    case FN_SROTM:
+        s = "SROTM";
+        break;
+    case FN_DROTM:
+        s = "DROTM";
+        break;
+
+	case FN_SROT:
+        s = "SROT";
+        break;
+    case FN_DROT:
+        s = "DROT";
+        break;
+	case FN_CSROT:
+        s = "CSROT";
+        break;
+    case FN_ZDROT:
+        s = "ZDROT";
+        break;
+
+    case FN_SROTMG:
+        s = "SROTMG";
+        break;
+    case FN_DROTMG:
+        s = "DROTMG";
+        break;
+
+    case FN_SNRM2:
+        s = "SNRM2";
+        break;
+    case FN_DNRM2:
+        s = "DNRM2";
+        break;
+	case FN_SCNRM2:
+        s = "SCNRM2";
+        break;
+    case FN_DZNRM2:
+        s = "DZNRM2";
+        break;
+
+    case FN_SASUM:
+        s = "SASUM";
+        break;
+    case FN_DASUM:
+        s = "DASUM";
+        break;
+    case FN_SCASUM:
+        s = "SCASUM";
+        break;
+    case FN_DZASUM:
+        s = "DZASUM";
+        break;
+
+    case FN_iSAMAX:
+        s = "iSAMAX";
+        break;
+    case FN_iDAMAX:
+        s = "iDAMAX";
+        break;
+    case FN_iCAMAX:
+        s = "iCAMAX";
+        break;
+    case FN_iZAMAX:
+        s = "iZAMAX";
+        break;
+
+    default:
+        break;
+    }
+
+    return s;
+}
+
+// Value sets fed to the INSTANTIATE_TEST_CASE_P blocks below through
+// ValuesIn(). Row-major order is part of the set only when
+// PERF_TEST_WITH_ROW_MAJOR is defined.
+static const clblasOrder orderSet[] =
+#ifdef PERF_TEST_WITH_ROW_MAJOR
+    { clblasColumnMajor, clblasRowMajor };
+#else
+    { clblasColumnMajor };
+#endif
+static const clblasTranspose transSet[] =
+    { clblasNoTrans, clblasTrans, clblasConjTrans };
+static const clblasSide sideSet[] =
+    { clblasLeft, clblasRight };
+static const clblasUplo uploSet[] =
+    { clblasUpper, clblasLower };
+static const clblasDiag diagSet[] =
+    { clblasUnit, clblasNonUnit };
+
+// Problem sizes used by the "Generic" instantiations.
+const int sizeRange[] = { 2048, 2800, 4096, 5600 };
+// BLAS-1 routines operate on vectors only, so huge vectors have to be
+// provided to reach the peak performance of the card.
+const int blas1sizeRange[] = {4194304, 7840000, 16777216, 31360000 };
+//const int sizeRange[] = { 2800, 4096, 5600};
+const int KRange[] = { 2047, 2799, 4095, 5599 };
+const int ldaRange[] = { 0, 5496, 5497 };
+const int offsetRange[] = { 0, 100 };
+const size_t offs[] =	{0, 63, 128, 258 };
+const int incRange[] = { 1, 10 };
+
+
+// Scalar value sets; alphaBetaRange supplies both the alpha and the beta
+// values of the HERK instantiation.
+const double realAlphaRange[] = {(double)50.0, (double)100.0, (double)999999};
+const cl_float2 complexAlphaRange[] = {floatComplex(1,2), floatComplex(4,5)};
+const ComplexLong alphaBetaRange[] = {{50,50}, {20,20}};
+
+#ifdef DO_GEMV
+// generic gemv test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, GEMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet),
+    ValuesIn(sizeRange), ValuesIn(sizeRange),
+    Values(ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, GEMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet),
+    Values(32), Values(32),
+    Values(ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+#endif
+
+#ifdef DO_SYMV
+// generic symv test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, SYMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(sizeRange),
+    Values(ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SYMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(32),
+    Values(ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+#endif
+
+#ifdef DO_GEMM_2
+// generic gemm test looking over a set of sizes
+
+INSTANTIATE_TEST_CASE_P(Generic, gemm2, Combine(
+    Values(clblasColumnMajor), Values(clblasNoTrans), Values(clblasNoTrans),
+    ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(sizeRange),
+    Values(ExtraTestSizes()), Values(1)));
+#endif
+
+#ifdef DO_GEMM
+// generic gemm test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, GEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(transSet),
+    ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(sizeRange),
+    Values(ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, GEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(transSet),
+    Values(32), Values(32), Values(32),
+    Values(ExtraTestSizes()), Values(1)));
+#endif
+
+#ifdef DO_TRMM
+// generic trmm test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, TRMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange),
+    ValuesIn(sizeRange), Values(ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TRMM,  Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(32), Values(32),
+    Values(ExtraTestSizes()), Values(1)));
+#endif
+
+#ifdef DO_TRSM
+// generic trsm test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, TRSM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange),
+    ValuesIn(sizeRange), Values(ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TRSM,  Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(transSet), ValuesIn(diagSet),
+    Values(32), Values(32),
+    Values(ExtraTestSizes()), Values(1)));
+#endif
+
+#ifdef DO_SYR2K
+// generic syr2k test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, SYR2K, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(sizeRange), ValuesIn(sizeRange),
+    Values(ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SYR2K, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(32), Values(32),
+    Values(ExtraTestSizes()), Values(1)));
+#endif
+
+#ifdef DO_SYRK
+// generic syrk test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, SYRK, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),
+    ValuesIn(sizeRange), ValuesIn(sizeRange),
+    Values(ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SYRK, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet),
+    Values(32), Values(32),
+    Values(ExtraTestSizes()), Values(1)));
+#endif
+
+#ifdef DO_HERK
+// generic herk test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, HERK, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    ValuesIn(sizeRange), ValuesIn(sizeRange),ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, HERK, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    Values(32), Values(32), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(ExtraTestSizes()), Values(1)));
+#endif
+
+
+#ifdef DO_TRMV
+// generic trmv test looking over a set of sizes
+// N, LDA, INCX, OFFA, OFFX, NUMQUEUES
+INSTANTIATE_TEST_CASE_P(Generic, TRMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+	ValuesIn(sizeRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TRMV,  Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+   Values(5000), Values(0), Values(1), Values(0,10), Values(0,9),Values(1)));
+#endif
+
+#ifdef DO_TPMV
+// generic tpmv test looking over a set of sizes
+// N, LDA, INCX, OFFA, OFFX, NUMQUEUES
+INSTANTIATE_TEST_CASE_P(Generic, TPMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(sizeRange), Values(0),Values(1), Values(0,10), Values(0,9), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TPMV,  Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+   Values(5000), Values(0),Values(1), Values(0,10), Values(0,9),Values(1)));
+#endif
+
+#ifdef DO_TRSV
+INSTANTIATE_TEST_CASE_P(Generic, TRSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+        ValuesIn(sizeRange), Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TRSV,  Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+   Values(1024), Values(0), Values(1), Values(0,10), Values(0,9), Values(1)));
+#endif
+
+#ifdef DO_TPSV
+INSTANTIATE_TEST_CASE_P(Generic, TPSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+        ValuesIn(sizeRange), Values(0), Values(1),  Values(0,10), Values(0,9), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TPSV,  Combine(
+   ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+   Values(1024), Values(0), Values(1), Values(0,10), Values(0,9), Values(1)));
+#endif
+
+
+#ifdef DO_SYMM
+INSTANTIATE_TEST_CASE_P(Generic, SYMM, Combine(
+	ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+	ValuesIn(sizeRange), ValuesIn(sizeRange),
+	ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)),
+	Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(custom, SYMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    Values(1024), Values(1024),
+    ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)),
+    Values(1) ) );
+#endif
+
+
+#ifdef DO_HEMM
+INSTANTIATE_TEST_CASE_P(Generic, HEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(complexAlphaRange),
+	ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes()),
+	//ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes((size_t)0, (size_t)0, (size_t)0, (size_t)12, (size_t)0, (size_t)1)),
+	Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(custom, HEMM, Combine(
+    ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet),
+    Values(1024), Values(1024), Values(complexAlphaRange[0]), Values(complexAlphaRange[1]), Values(clMath::ExtraTestSizes((size_t)0, (size_t)0, (size_t)0, (size_t)8, (size_t)0, (size_t)1 )),
+	Values(1) ) );
+#endif
+
+#ifdef DO_GER
+INSTANTIATE_TEST_CASE_P(Generic, GER, Combine(
+        ValuesIn(orderSet),ValuesIn(sizeRange), ValuesIn(sizeRange),
+        Values(0), Values(1), Values(1), Values(0, 10),
+        Values(0, 8),Values(0, 9),Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(custom, GER, Combine(
+         ValuesIn(orderSet),ValuesIn(sizeRange), ValuesIn(sizeRange),
+        Values(0), Values(1), Values(1), Values(0, 10),
+        Values(0, 8),Values(0, 9), Values(1) ) );
+#endif
+
+#ifdef DO_GERC
+INSTANTIATE_TEST_CASE_P(Generic, GERC, Combine(
+        ValuesIn(orderSet),ValuesIn(sizeRange), ValuesIn(sizeRange),
+        Values(0), Values(1), Values(1), Values(0, 10),
+        Values(0, 8),Values(0, 9),Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(custom, GERC, Combine(
+         ValuesIn(orderSet),ValuesIn(sizeRange), ValuesIn(sizeRange),
+        Values(0), Values(1), Values(1), Values(0, 10),
+        Values(0, 8),Values(0, 9), Values(1) ) );
+#endif
+
+#ifdef DO_HER
+INSTANTIATE_TEST_CASE_P(Generic, HER, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange),
+    Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(Custom, HER, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange),
+    ValuesIn(ldaRange), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange),
+    Values(1) ) );
+#endif
+
+#ifdef DO_HPR
+INSTANTIATE_TEST_CASE_P(Generic, HPR, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange),
+    Values(0), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange),
+    Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(Custom, HPR, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange),
+    Values(0), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange),
+    Values(1) ) );
+#endif
+
+
+#ifdef DO_HER2
+INSTANTIATE_TEST_CASE_P(Generic, HER2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange), ValuesIn(ldaRange),
+    Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(Custom, HER2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange),
+    Values(1) ) );
+#endif
+
+#ifdef DO_HPR2
+INSTANTIATE_TEST_CASE_P(Generic, HPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange), ValuesIn(ldaRange),
+    Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(Custom, HPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(complexAlphaRange),
+    ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange),
+    Values(1) ) );
+#endif
+
+#ifdef DO_SYR
+INSTANTIATE_TEST_CASE_P(Generic, SYR, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange),
+	ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange),
+	Values(0), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(Custom, SYR, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(realAlphaRange),
+    ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange),
+    ValuesIn(ldaRange), Values(1) ) );
+#endif
+
+#ifdef DO_SPR
+INSTANTIATE_TEST_CASE_P(Generic, SPR, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange),
+    Values(0), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(Custom, SPR, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(realAlphaRange),
+    ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange),
+    Values(0), Values(1) ) );
+#endif
+
+#ifdef DO_SYR2
+INSTANTIATE_TEST_CASE_P(Generic, SYR2, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange),
+	ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange),
+	Values(0), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(Custom, SYR2, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(realAlphaRange),
+    ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    ValuesIn(ldaRange), Values(1) ) );
+#endif
+
+#ifdef DO_SPR2
+INSTANTIATE_TEST_CASE_P(Generic, SPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange),
+    ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(0), Values(1) ) );
+
+INSTANTIATE_TEST_CASE_P(Custom, SPR2, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(realAlphaRange),
+    ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    Values(0), Values(1) ) );
+#endif
+
+#ifdef DO_HEMV
+INSTANTIATE_TEST_CASE_P(Generic, HEMV, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(alphaBetaRange),
+	ValuesIn(alphaBetaRange), Values((size_t)0), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Custom, HEMV, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(alphaBetaRange),
+	ValuesIn(alphaBetaRange), Values((size_t)0), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+#endif
+
+
+#ifdef DO_HPMV
+INSTANTIATE_TEST_CASE_P(Generic, HPMV, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(alphaBetaRange),
+	ValuesIn(alphaBetaRange), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Custom, HPMV, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(alphaBetaRange),
+	ValuesIn(alphaBetaRange), Values((size_t)0), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+#endif
+
+
+#ifdef DO_SPMV
+INSTANTIATE_TEST_CASE_P(Generic, SPMV, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(alphaBetaRange),
+	ValuesIn(alphaBetaRange), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+
+INSTANTIATE_TEST_CASE_P(Custom, SPMV, Combine(
+	ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(alphaBetaRange),
+	ValuesIn(alphaBetaRange), Values((size_t)0), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1)));
+#endif
+
+#ifdef DO_GBMV
+// generic gbmv test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, GBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet),
+    ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(KRange), ValuesIn(KRange),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, GBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(transSet),
+    Values(32), Values(32), Values(30), Values(25),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+#endif
+
+#ifdef DO_SBMV
+// generic sbmv test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, SBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(sizeRange), ValuesIn(KRange),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(32), Values(25),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+#endif
+
+#ifdef DO_HBMV
+// generic hbmv test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, HBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    ValuesIn(sizeRange), ValuesIn(KRange),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, HBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet),
+    Values(32), Values(25),
+    Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+#endif
+
+
+#ifdef DO_TBMV
+// generic tbmv test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, TBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(sizeRange),ValuesIn(KRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TBMV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    Values(32),Values(30),Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1)));
+#endif
+
+#ifdef DO_TBSV
+// generic tbsv test looking over a set of sizes
+INSTANTIATE_TEST_CASE_P(Generic, TBSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    ValuesIn(sizeRange),ValuesIn(KRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, TBSV, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),
+    Values(32),Values(30),Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1)));
+#endif
+
+#ifdef DO_HER2K
+
+INSTANTIATE_TEST_CASE_P(Generic, HER2K, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    ValuesIn(sizeRange), ValuesIn(sizeRange),ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(ExtraTestSizes()), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, HER2K, Combine(
+    ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans),
+    Values(32), Values(32), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange),
+    Values(ExtraTestSizes()), Values(1)));
+#endif
+
+#ifdef DO_SWAP
+
+INSTANTIATE_TEST_CASE_P(Generic, SWAPXY, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SWAPXY, Combine(
+    Values(819430), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), Values(1) ) );
+#endif
+
+#ifdef DO_DOT
+
+INSTANTIATE_TEST_CASE_P(Generic, DOT, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, DOT, Combine(
+    Values(819430), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+#endif
+
+#ifdef DO_DOTC
+
+INSTANTIATE_TEST_CASE_P(Generic, DOTC, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, DOTC, Combine(
+    Values(819430), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+#endif
+
+
+#ifdef DO_COPY
+
+INSTANTIATE_TEST_CASE_P(Generic, COPY, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, COPY, Combine(
+    Values(32), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+#endif
+
+
+#ifdef DO_SCAL
+
+INSTANTIATE_TEST_CASE_P(Generic, SCAL, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, SCAL, Combine(
+    Values(819430), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), Values(1, 2), Values(1) ) );
+#endif
+
+#ifdef DO_AXPY
+
+INSTANTIATE_TEST_CASE_P(Generic, AXPY, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(alphaBetaRange),  ValuesIn(offsetRange),
+    ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, AXPY, Combine(
+    Values(819430), ValuesIn(alphaBetaRange), ValuesIn(offsetRange),
+    Values(1, 2), ValuesIn(offsetRange), Values(1, 2), Values(1) ) );
+#endif
+
+#ifdef DO_ROTG
+INSTANTIATE_TEST_CASE_P(Generic, ROTG, Combine(
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, ROTG, Combine(
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1)));
+#endif
+
+#ifdef DO_ROTM
+INSTANTIATE_TEST_CASE_P(Generic, ROTM, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(alphaBetaRange), Values(1)));
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, ROTM, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(alphaBetaRange), Values(1)));
+#endif
+
+#ifdef DO_ROT
+INSTANTIATE_TEST_CASE_P(Generic, ROT, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, ROT, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1)));
+#endif
+
+#ifdef DO_ROTMG
+INSTANTIATE_TEST_CASE_P(Generic, ROTMG, Combine(
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    ValuesIn(alphaBetaRange), Values(1)));
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, ROTMG, Combine(
+    ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange),
+    ValuesIn(alphaBetaRange), Values(1)));
+#endif
+
+#ifdef DO_NRM2
+
+INSTANTIATE_TEST_CASE_P(Generic, NRM2, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, NRM2, Combine(
+    Values(819430), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+#endif
+
+#ifdef DO_ASUM
+
+INSTANTIATE_TEST_CASE_P(Generic, ASUM, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, ASUM, Combine(
+    Values(819430), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+#endif
+
+#ifdef DO_iAMAX
+
+INSTANTIATE_TEST_CASE_P(Generic, iAMAX, Combine(
+    ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+
+// Custom test - use command line arguments to tweak it
+INSTANTIATE_TEST_CASE_P(Custom, iAMAX, Combine(
+    Values(819430), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) );
+#endif
+
+#if 0
+// ensure that a TRMM function is faster than the respective GEMM one
+static void
+checkIsTrmmFaster(BlasFunction trmmFn, BlasFunction gemmFn)
+{
+    const char *s1, *s2;
+    gflops_t gf1, gf2;
+
+    gf1 = perfRecorder->clblasAvgPerf(trmmFn);
+    gf2 = perfRecorder->clblasAvgPerf(gemmFn);
+
+    if (isDoubleZero((double)gf1) || isDoubleZero((double)gf2)) {
+        // skip, respective tests have not been run
+        return;
+    }
+
+    s1 = functionToString(trmmFn);
+    s2 = functionToString(gemmFn);
+    cerr << "Check if the " << s1 << " function is faster than the " <<
+            s2 << " one" << endl;
+
+    if (gf1 * 2 > gf2) { // since TRMM has in twice as less operations as GEMM
+        cerr << "PASS" << endl << endl;
+    }
+    else {
+        cerr << "FAIL" << endl << endl;
+    }
+}
+#endif
+
+int
+main(int argc, char *argv[])
+{
+    int ret;
+    int fn;
+    gflops_t gflops1, gflops2;
+	gbps_t gbps1,gbps2;
+    double ratio;
+    const char *name;
+    ::clMath::BlasBase *base;
+    TestParams params;
+#if 0
+    BlasFunction estimFuncs[][2] = {
+        {FN_SGEMM, FN_CGEMM }, // FN_STRMM, FN_CTRMM},
+        {FN_DGEMM, FN_ZGEMM }  // FN_DTRMM, FN_ZTRMM}};
+		};
+    const char *message[2] = {
+        "Check if the resulting average ratio for single float types "
+        "(for GEMM and TRMM) matches the expected one ",
+        "Check if the resulting average ratio for double float "
+        "precision types (for GEMM and TRMM) matches the expected one "};
+    double estimRatios[2] = {
+        EXPECTED_SINGLE_FLOAT_PERF_RATIO,
+        EXPECTED_DOUBLE_FLOAT_PERF_RATIO};
+#endif
+
+    if ((argc > 1) && !strcmp(argv[1], "--test-help")) {
+        printUsage("test-performance");
+        return 0;
+    }
+
+    ::testing::InitGoogleTest(&argc, argv);
+    ::std::cerr << "Initialize OpenCL and CLBLAS..." << ::std::endl;
+    base = ::clMath::BlasBase::getInstance();
+    if (base == NULL) {
+        ::std::cerr << "Fatal error, OpenCL or clblas initialization failed! "
+                       "Leaving the test." << ::std::endl;
+        return -1;
+    }
+
+    base->setSeed(DEFAULT_SEED);
+
+    if (argc > 1) {
+        params.optFlags = NO_FLAGS;
+        params.devType = CL_DEVICE_TYPE_GPU;
+        params.devName = NULL;
+        if (parseBlasCmdLineArgs(argc, argv, &params) != 0) {
+            printUsage(argv[0]);
+            return 1;
+        }
+        if (params.optFlags & SET_SEED) {
+            base->setSeed(params.seed);
+        }
+        if (params.optFlags & SET_ALPHA) {
+            base->setAlpha(params.alpha);
+        }
+        if (params.optFlags & SET_BETA) {
+            base->setBeta(params.beta);
+        }
+        if (params.optFlags & SET_M) {
+            base->setM(params.M);
+        }
+        if (params.optFlags & SET_N) {
+            base->setN(params.N);
+        }
+        if (params.optFlags & SET_K) {
+            base->setK(params.K);
+        }
+		if (params.optFlags & SET_INCX) {
+            base->setIncX(params.incx);
+        }
+
+        if (params.optFlags & SET_DEVICE_TYPE) {
+            if (!base->setDeviceType(&params.devType, params.devName)) {
+                ::std::cerr << "Fatal error, OpenCL or clblas "
+                        "initialization failed! Leaving the test." <<
+                        ::std::endl;
+                return -1;
+            }
+        }
+        if (params.optFlags & SET_NUM_COMMAND_QUEUES) {
+            base->setNumCommandQueues(params.numCommandQueues);
+        }
+    }
+
+    parseEnv(&params);
+    if (params.optFlags & SET_USE_IMAGES) {
+        base->setUseImages(params.useImages);
+    }
+
+    perfRecorder = new PerformanceRecorder;
+
+	/* Use of image based buffers is deprecated
+    if (base->useImages()) {
+        if (base->addScratchImages()) {
+            std::cerr << "FATAL ERROR, CANNOT CREATE SCRATCH IMAGES!" << std::endl;
+        }
+    }
+	*/
+
+    ret = RUN_ALL_TESTS();
+
+    if (base->useImages()) {
+        base->removeScratchImages();
+    }
+
+    cerr << endl << endl;
+    cerr << "----------------------------------------------" << endl <<
+            "Overall performance information:" << endl <<
+            "----------------------------------------------" << endl;
+
+    // now, check average speed ratio
+    for (fn = 0; fn < BLAS_FUNCTION_END; fn++) {
+        name = functionToString(static_cast<BlasFunction>(fn));
+
+        /*
+         * For global memory based solutions print only average performance,
+         * and for those of image based perform just comparison
+         */
+
+        ratio = perfRecorder->avgTimeRatio(static_cast<BlasFunction>(fn));
+        if (isDoubleZero(ratio)) {
+            // skip, this group of tests has not been run
+            continue;
+        }
+		if (functionBlasLevel(static_cast<BlasFunction>(fn)) != 3) 	//display metrics in GBps if it is a BLAS-1 or BLAS-2 function
+		{
+	        gbps1 = perfRecorder->etalonAvgGbpsPerf(
+	            static_cast<BlasFunction>(fn));
+	        gbps2 = perfRecorder->clblasAvgGbpsPerf(
+	            static_cast<BlasFunction>(fn));
+
+	        cout << "Average reference " << name << endl <<
+	                " performance is " << gbps1 <<
+	                " GBps; for CLBLAS implementation: " << endl <<
+	                "average performance = " << gbps2 << " GBps, "
+	                "average time ratio = " << ratio << endl << endl;
+		}
+		else						//display metrics in GFlops if its a BLAS-3 function
+		{
+			gflops1 = perfRecorder->etalonAvgPerf(
+                static_cast<BlasFunction>(fn));
+            gflops2 = perfRecorder->clblasAvgPerf(
+                static_cast<BlasFunction>(fn));
+
+            cout << "Average reference " << name << endl <<
+                    " performance is " << gflops1 <<
+                    " giga-flops; for CLBLAS implementation: " << endl <<
+                    "average performance = " << gflops2 << " giga-flops, "
+                    "average time ratio = " << ratio << endl << endl;
+		}
+    }
+
+    // check if TRMM is faster than GEMM
+#if 0
+    checkIsTrmmFaster(FN_STRMM, FN_SGEMM);
+    checkIsTrmmFaster(FN_DTRMM, FN_DGEMM);
+    checkIsTrmmFaster(FN_CTRMM, FN_CGEMM);
+    checkIsTrmmFaster(FN_ZTRMM, FN_ZGEMM);
+
+    /*
+     * Now, do the final average ratio comparison if there is
+     * the image based version. Involve only GEMM and TRMM as using
+     * 2 images
+     */
+    if (base->useImages()) {
+        int j;
+
+        for (i = 0; i < 2; i++) {
+            ratio = 0;
+            nruns = 0;
+            for (j = 0; j < 2; j++) {
+                r = perfRecorder->avgTimeRatio(estimFuncs[i][j]);
+                if (!isDoubleZero(r)) {
+                    ratio += r;
+                    nruns++;
+                }
+            }
+            if (nruns) {
+                ratio /= nruns;
+                cerr << message[i] << endl;
+                if (ratio >= estimRatios[i]) {
+                    cerr << "PASS (" << ratio << ")" << endl << endl;
+                }
+                else {
+                    cerr << "FAIL (" << ratio << ")" << endl << endl;
+                }
+            }
+        }
+    }
+#endif
+    /*
+     * Explicitly tell the singleton to release all resources,
+     * before we return from main.
+     */
+    base->release( );
+
+    return ret;
+}
diff --git a/src/tests/timer.c b/src/tests/timer.c
new file mode 100644
index 0000000..e304f4f
--- /dev/null
+++ b/src/tests/timer.c
@@ -0,0 +1,125 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+#include "timer.h"
+
+#if defined(_MSC_VER)
+
+#include <windows.h>
+
+nano_time_t
+conv2nanosec(nano_time_t t)
+{
+    LARGE_INTEGER count;
+
+	if (QueryPerformanceFrequency(&count) == FALSE) {
+        return 0;
+    }
+    t = (t * 1000000)/count.QuadPart;
+
+    return (nano_time_t)(t * 1000);
+}
+
+nano_time_t
+conv2microsec(nano_time_t t)
+{
+    LARGE_INTEGER count;
+
+	if (QueryPerformanceFrequency(&count) == FALSE) {
+        return 0;
+    }
+
+    return (t * 1000000ULL)/count.QuadPart;
+}
+
+nano_time_t
+conv2millisec(nano_time_t t)
+{
+    LARGE_INTEGER count;
+
+    if (QueryPerformanceFrequency(&count) == FALSE) {
+        return 0;
+    }
+
+    return (t * 1000) / count.QuadPart;
+}
+
+nano_time_t
+getCurrentTime(void)
+{
+     LARGE_INTEGER count;
+
+     if (QueryPerformanceCounter(&count) == FALSE) {
+         return 0;
+     }
+     return (nano_time_t)count.QuadPart;
+}
+
+void
+sleepTime(nano_time_t time) {
+    DWORD tms = (DWORD)(time/1000000);
+
+    Sleep(tms);
+}
+#else /* defined(_MSC_VER) */
+
+#include <time.h>
+
+nano_time_t
+conv2nanosec(nano_time_t t)
+{
+    /* clock_... functions measure time in nanoseconds */
+    return t;
+}
+
+nano_time_t
+conv2microsec(nano_time_t t)
+{
+    return t/1000;
+}
+
+nano_time_t
+conv2millisec(nano_time_t t)
+{
+    return t/1000000;
+}
+
+nano_time_t
+getCurrentTime(void)
+{
+    /* Returns the CLOCK_REALTIME reading in nanoseconds, or 0 if the read fails. */
+    struct timespec t;
+
+    if (clock_gettime(CLOCK_REALTIME, &t) != 0) {
+        return 0;
+    }
+    /* The ULL constant forces 64-bit arithmetic even where long is 32 bits. */
+    return (nano_time_t)t.tv_sec * 1000000000ULL + (nano_time_t)t.tv_nsec;
+}
+
+void
+sleepTime(nano_time_t time) {
+    /* Sleep for `time` nanoseconds; nanosleep() rejects tv_nsec >= 1e9, so split. */
+    struct timespec t1;
+    t1.tv_sec = (time_t)(time / 1000000000ULL);
+    t1.tv_nsec = (long)(time % 1000000000ULL);
+    nanosleep(&t1, NULL);
+}
+
+// namespace )
+
+#endif  /* defined(_MSC_VER) */
diff --git a/src/version.h.in b/src/version.h.in
new file mode 100644
index 0000000..a05cbce
--- /dev/null
+++ b/src/version.h.in
@@ -0,0 +1,22 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+/* the configured version and settings for clblas
+ */
+#define clblasVersionMajor @clBLAS_VERSION_MAJOR@
+#define clblasVersionMinor @clBLAS_VERSION_MINOR@
+#define clblasVersionPatch @clBLAS_VERSION_PATCH@

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git



More information about the debian-science-commits mailing list